├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── ParseOracleArcStd.jar ├── README.md ├── cmake └── FindEigen3.cmake ├── config.h.cmake ├── maltparser-1.8.1.jar ├── parser ├── CMakeLists.txt ├── c2.h └── lstm-parse.cc ├── pretrained_embeddings ├── average.100++.filtered.gz ├── cca.100.filtered.gz └── multiCluster.100.filtered.gz ├── train-cross-lingual-parsers.tape └── typological_properties ├── typological_family_averaged_wals.csv ├── typological_properties-almostall.txt ├── typological_properties.txt ├── typological_properties.txt.all ├── typological_properties.txt.all.sept15 ├── typological_properties.txt.first22 ├── typological_properties.txt.naseem12 ├── typological_properties.txt.pat_genus_avg ├── typological_properties.txt.zhang15 ├── typological_properties.xlsx ├── typological_properties_id.txt ├── typological_properties_id_tgt_cs.txt ├── typological_properties_id_tgt_de.txt ├── typological_properties_id_tgt_en.txt ├── typological_properties_id_tgt_es.txt ├── typological_properties_id_tgt_fi.txt ├── typological_properties_id_tgt_fr.txt ├── typological_properties_id_tgt_ga.txt ├── typological_properties_id_tgt_hu.txt ├── typological_properties_id_tgt_it.txt ├── typological_properties_id_tgt_sv.txt └── typological_properties_none.txt /.gitignore: -------------------------------------------------------------------------------- 1 | build-gpu 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dynet"] 2 | path = dynet 3 | url = git@github.com:clab/dynet.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(dynet) 2 | cmake_minimum_required(VERSION 2.8 FATAL_ERROR) 3 | 4 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) 5 | set(CMAKE_CXX_FLAGS "-Wall -std=c++11 -O3 -g") 6 | 7 | enable_testing() 8 | 9 | #include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 10 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/dynet) 11 | 12 | # BACKEND=cuda? 13 | if(BACKEND MATCHES "^cuda$") 14 | set(WITH_CUDA_BACKEND 1) 15 | find_package(CUDA REQUIRED) 16 | set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT}) 17 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 18 | add_definitions(-DHAVE_CUDA) 19 | MESSAGE("CUDA_LIBRARIES: ${CUDA_LIBRARIES}") 20 | list(REMOVE_ITEM CUDA_LIBRARIES -lpthread) 21 | set(LIBS ${LIBS} ${CUDA_LIBRARIES}) 22 | endif() 23 | 24 | # look for Boost 25 | set(Boost_REALPATH ON) 26 | find_package(Boost COMPONENTS program_options serialization REQUIRED) 27 | include_directories(${Boost_INCLUDE_DIR}) 28 | set(LIBS ${LIBS} ${Boost_LIBRARIES}) 29 | 30 | # look for Eigen 31 | find_package(Eigen3 REQUIRED) 32 | include_directories(${EIGEN3_INCLUDE_DIR}) 33 | set(WITH_EIGEN_BACKEND 1) 34 | 35 | #configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h) 36 | 37 | add_subdirectory(dynet/dynet) 38 | # add_subdirectory(dynet/examples) 39 | add_subdirectory(parser) 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
-------------------------------------------------------------------------------- /ParseOracleArcStd.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/ParseOracleArcStd.jar -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Dependencies
2 | * boost-1.60.0
3 | * eigen `hg clone https://bitbucket.org/eigen/eigen`
4 |
5 | # How to use?
6 | ```
7 | # set up the repository #
8 | cd
9 | mkdir git ; cd git/
10 | git clone git@github.com:clab/language-universal-parser.git
11 | cd language-universal-parser
12 | git submodule init
13 | git submodule update
14 | cd dynet
15 | git pull origin master
16 | cd ../
17 |
18 | # build the parser (with the latest version of dynet) #
19 | cd ~/git/language-universal-parser/dynet
20 | git pull origin master
21 | cd .. ; mkdir build-gpu ; cd build-gpu
22 | cmake -DEIGEN3_INCLUDE_DIR=$EIGEN_ROOT .. # -DBACKEND=cuda is not supported just yet
23 | make -j 10
24 |
25 | # train the parser on small data #
26 | ~/git/language-universal-parser/build-gpu/parser/lstm-parse --train -P --training_data $TRAIN_ARCSTD --dev_data $DEV_ARCSTD --pretrained_dim 50 --pretrained $PRETRAINED_EMBEDDINGS --brown_clusters $PRETRAINED_CLUSTERS --epochs 1
27 | ```
28 | # How to generate arc-standard transitions?
29 | The parser expects projective treebanks, encoded as arc-standard transition sequences, as input (see the command lines below). To convert a nonprojective treebank in CoNLL 2006 format into the arc-standard oracle file of the corresponding pseudo-projective treebank:
30 | ```
31 | java -jar maltparser-1.8.1.jar -c pproj -m proj -i $split_lc -o $split_projective -pp baseline
32 | java -jar ParseOracleArcStd.jar -t -1 -l 1 -c treebank.conll -i treebank.conll > treebank.arcstd
33 | ```
34 |
35 | We recommend that you lowercase word tokens/types in all input files (e.g., pretrained embeddings, Brown clusters, train/dev/test treebanks) before calling the parser.
36 |
37 | # Language typology embeddings
38 | To enable language typology embeddings, use the command-line argument `--typological_properties typology_file`. Sample typology files are provided in the subdirectory `typological_properties/`. If you enable typology embeddings, prefix each word in the input files with a two-letter language ID (e.g., `en:book` instead of `book`); the prefix must match the first field in the typology file.
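For example, a one-liner like the following adds the `en:` prefix to every token of a whitespace-tokenized English file (the file names here are hypothetical; `&` in sed stands for the matched token):
```
sed 's/[^ ][^ ]*/en:&/g' train.en.tok > train.en.tok.prefixed
```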
39 |
40 | # What to cite?
41 | [Many Languages, One Parser](http://arxiv.org/abs/1602.01595) TACL 2016 (to appear)
42 | Waleed Ammar, George Mulcaire, Miguel Ballesteros, Chris Dyer, Noah A. Smith
43 |
44 | [results](https://github.com/clab/language-universal-parser/tree/084eed3b1510fc893c4c92474cdcea1d7c58aa7c)
45 |
-------------------------------------------------------------------------------- /cmake/FindEigen3.cmake: --------------------------------------------------------------------------------
1 | # - Try to find Eigen3 lib
2 | #
3 | # This module supports requiring a minimum version, e.g. you can do
4 | # find_package(Eigen3 3.1.2)
5 | # to require version 3.1.2 or newer of Eigen3.
6 | #
7 | # Once done this will define
8 | #
9 | # EIGEN3_FOUND - system has eigen lib with correct version
10 | # EIGEN3_INCLUDE_DIR - the eigen include directory
11 | # EIGEN3_VERSION - eigen version
12 |
13 | # Copyright (c) 2006, 2007 Montel Laurent,
14 | # Copyright (c) 2008, 2009 Gael Guennebaud,
15 | # Copyright (c) 2009 Benoit Jacob
16 | # Redistribution and use is allowed according to the terms of the 2-clause BSD license.
17 |
18 | if(NOT Eigen3_FIND_VERSION)
19 | if(NOT Eigen3_FIND_VERSION_MAJOR)
20 | set(Eigen3_FIND_VERSION_MAJOR 2)
21 | endif(NOT Eigen3_FIND_VERSION_MAJOR)
22 | if(NOT Eigen3_FIND_VERSION_MINOR)
23 | set(Eigen3_FIND_VERSION_MINOR 91)
24 | endif(NOT Eigen3_FIND_VERSION_MINOR)
25 | if(NOT Eigen3_FIND_VERSION_PATCH)
26 | set(Eigen3_FIND_VERSION_PATCH 0)
27 | endif(NOT Eigen3_FIND_VERSION_PATCH)
28 |
29 | set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
30 | endif(NOT Eigen3_FIND_VERSION)
31 |
32 | macro(_eigen3_check_version)
33 | file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
34 |
35 | string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
36 | set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
37 | string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
38 | set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
39 | string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
40 | set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
41 |
42 | set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
43 | if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
44 | set(EIGEN3_VERSION_OK FALSE)
45 | else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
46 | set(EIGEN3_VERSION_OK TRUE)
47 | endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
48 |
49 | if(NOT EIGEN3_VERSION_OK)
50 |
51 | message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
52 | "but at least version ${Eigen3_FIND_VERSION} is required")
53 | endif(NOT EIGEN3_VERSION_OK)
54 | endmacro(_eigen3_check_version)
55 |
56 | if (EIGEN3_INCLUDE_DIR)
57 |
58 | # in cache already
59 | _eigen3_check_version()
60 | set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
61 |
62 | else (EIGEN3_INCLUDE_DIR)
63 |
64 | find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
65 | PATHS
66 | ${CMAKE_INSTALL_PREFIX}/include
67 | ${KDE4_INCLUDE_DIR}
68 | PATH_SUFFIXES eigen3 eigen
69 | )
70 |
71 | if(EIGEN3_INCLUDE_DIR)
72 | _eigen3_check_version()
73 | endif(EIGEN3_INCLUDE_DIR)
74 |
75 | include(FindPackageHandleStandardArgs)
76 | find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
77 |
78 | mark_as_advanced(EIGEN3_INCLUDE_DIR)
79 |
80 | endif(EIGEN3_INCLUDE_DIR)
81 |
82 |
-------------------------------------------------------------------------------- /config.h.cmake: --------------------------------------------------------------------------------
1 | #ifndef CNN_CONFIG_H_
2 | #define CNN_CONFIG_H_
3 |
4 | #cmakedefine WITH_MINERVA_BACKEND @WITH_MINERVA_BACKEND@
5 | #cmakedefine WITH_THPP_BACKEND @WITH_THPP_BACKEND@
6 | #cmakedefine WITH_EIGEN_BACKEND @WITH_EIGEN_BACKEND@
7 |
8 | #endif
9 |
-------------------------------------------------------------------------------- /maltparser-1.8.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/maltparser-1.8.1.jar -------------------------------------------------------------------------------- /parser/CMakeLists.txt: --------------------------------------------------------------------------------
1 | PROJECT(dynet:parser)
2 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
3 |
4 | ADD_EXECUTABLE(lstm-parse lstm-parse.cc)
5 | target_link_libraries(lstm-parse dynet ${Boost_LIBRARIES})
6 |
7 | if (WITH_CUDA_BACKEND)
8 | add_dependencies(lstm-parse dynetcuda)
9 | target_link_libraries(lstm-parse dynetcuda)
10 | CUDA_ADD_CUBLAS_TO_TARGET(lstm-parse)
11 | endif (WITH_CUDA_BACKEND)
12 |
-------------------------------------------------------------------------------- /parser/c2.h: --------------------------------------------------------------------------------
1 | #ifndef CPYPDICT_H_
2 | #define CPYPDICT_H_
3 |
4 | #include <algorithm>
5 | #include <cassert>
6 | #include <fstream>
7 | #include <functional>
8 | #include <iostream>
9 | #include <iterator>
10 | #include <map>
11 | #include <set>
12 | #include <sstream>
13 | #include <string>
14 | #include <unordered_map>
15 | #include <vector>
16 |
17 | using namespace std;
18 |
19 | namespace cpyp {
20 |
21 | struct TokenInfo {
22 | unsigned word_id = 0;
23 | unsigned lang_id = 0;
24 | unsigned pos_id = 0; // fine-grained
25 | unsigned coarse_pos_id = 0;
26 | unsigned predicted_pos_id = 0; // fine-grained
27 | unsigned predicted_coarse_pos_id = 0;
28 | bool training_oov = true;
29 | };
30 |
31 | class Corpus {
32 | public:
33 |
34 | bool USE_SPELLING = false;
35 | bool COARSE_ONLY = false;
36 | bool PREDICT_ATTACHMENTS_ONLY = false;
37 |
38 | map<int, vector<TokenInfo>> sentences;
39 | map<int, vector<TokenInfo>> sentencesDev;
40 |
41 | map<int, vector<unsigned>> correct_act_sent;
42 | map<int, vector<unsigned>> correct_act_sentDev;
43 |
44 | unordered_map<unsigned, vector<float>> pretrained;
45 | unordered_map<unsigned, vector<float>> typological_properties_map;
46 | unordered_map<unsigned, unsigned> brown_clusters;
47 | unordered_map<unsigned, unsigned> brown2_clusters;
48 |
49 | set<unsigned> training_vocab; // words available in the training corpus
50 | set<unsigned> training_pos_vocab; // pos available in the training corpus
51 | set<unsigned> coarse_pos_vocab; // coarse pos available in any corpus
52 | set<unsigned> fine_pos_vocab; // fine pos available in any corpus
53 | set<unsigned> training_char_vocab; // chars available in the training corpus
54 |
55 | unsigned sentences_count = 0;
56 | unsigned sentencesDev_count = 0;
57 | unsigned actions_count = 0;
58 |
59 | unsigned maxWord = 0;
60 | unsigned maxPos = 1;
61 | unsigned maxLang = 0;
62 |
63 | map<string, unsigned> wordsToInt;
64 | map<unsigned, string> intToWords;
65 | map<unsigned, vector<unsigned>> wordIntsToCharInts;
66 | vector<string> actions;
67 |
68 | map<string, unsigned> posToInt;
69 | map<unsigned, string> intToPos;
70 |
71 | int maxChars;
72 | map<string, unsigned> charsToInt;
73 | map<unsigned, string> intToChars;
74 |
75 | map<string, unsigned> langToInt;
76 | map<unsigned, string> intToLang;
77 | bool use_language_prefix;
78 |
79 | // String literals
80 | static constexpr const char* UNK = "UNK";
81 | static constexpr const char* BAD0 = "<BAD0>";
82 |
83 | public:
84 | Corpus() {
85 | maxWord = 0;
86 | maxPos = 0;
87 | maxChars=0;
88 |
89 | // always add the UNK language.
90 | use_language_prefix = false;
91 | langToInt["UNK"] = 0; // unknown language
92 | intToLang[0] = "UNK";
93 | maxLang = 1;
94 | }
95 |
96 | inline void set_use_language_prefix(bool use) {
97 | use_language_prefix = use;
98 | }
99 |
100 | inline bool get_use_language_prefix() {
101 | return use_language_prefix;
102 | }
103 |
104 | inline unsigned UTF8Len(unsigned char x) {
105 | if (x < 0x80) return 1;
106 | else if ((x >> 5) == 0x06) return 2;
107 | else if ((x >> 4) == 0x0e) return 3;
108 | else if ((x >> 3) == 0x1e) return 4;
109 | else if ((x >> 2) == 0x3e) return 5;
110 | else if ((x >> 1) == 0x7e) return 6;
111 | else return 0;
112 | }
113 |
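// Illustration (added; the token is hypothetical): with language prefixes enabled, an
// input token such as "en:book-NOUN:NN" is parsed below into word "en:book", language
// "en", fine POS "NOUN:NN", and coarse POS "NOUN". When the POS field contains no ':'
// (e.g., "en:book-NOUN"), the coarse POS falls back to "NOUN_c".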
114 | void ReadTokenInfo(string lang_word_pos, TokenInfo &current_token) {
115 | // remove the trailing comma if need be.
116 | if (lang_word_pos[lang_word_pos.size() - 1] == ',') {
117 | lang_word_pos = lang_word_pos.substr(0, lang_word_pos.size() - 1);
118 | }
119 |
120 | // identify the POS.
121 | size_t posIndex = lang_word_pos.rfind('-');
122 | if (posIndex == string::npos) {
123 | cerr << "lang_word_pos = " << lang_word_pos << endl;
124 | assert(false && "FATAL: Bad treebank format. I can't find the dash between a word and a POS tag.");
125 | }
126 | string pos = lang_word_pos.substr(posIndex + 1);
127 | unsigned pos_id = COARSE_ONLY? 0 : get_or_add_pos(pos);
128 | size_t size_of_coarse_pos_substring = pos.find(':');
129 | string coarse_pos =
130 | size_of_coarse_pos_substring == string::npos?
131 | pos + "_c" :
132 | pos.substr(0, size_of_coarse_pos_substring);
133 | unsigned coarse_pos_id = get_or_add_pos(coarse_pos);
134 | coarse_pos_vocab.insert(coarse_pos_id);
135 | if (!COARSE_ONLY) { fine_pos_vocab.insert(pos_id); }
136 | string lang_word = lang_word_pos.substr(0, posIndex);
137 | //cerr << "lang_word_pos = " << lang_word_pos << endl;
138 | //cerr << "coarse_pos = " << coarse_pos << endl;
139 | //cerr << "coarse_pos_id = " << coarse_pos_id << endl;
140 | //cerr << "pos = " << pos << endl;
141 | //cerr << "pos_id = " << pos_id << endl << endl;
142 |
143 | // identify the language.
144 | unsigned lang_id = 0;
145 | string lang;
146 | if (use_language_prefix) {
147 | // Each word must be formatted as "en:with" or "fr:avec"
148 | // The only exception here is "ROOT".
149 | if (lang_word != "ROOT" && (lang_word.size() < 3 || lang_word[2] != ':')) {
150 | cerr << "lang_word = " << lang_word << endl;
151 | assert(false && "Language typology is provided but the 2-letter language prefix is missing from the current token (lang_word).");
152 | }
153 | lang = (lang_word == "ROOT")? "__" : lang_word.substr(0,2);
154 | lang_id = get_or_add_lang(lang);
155 | }
156 |
157 | // Identify the "word" for which we tune embeddings by default.
158 | unsigned word_id = get_or_add_word(lang_word);
159 |
160 | // Identify the "surface form", which we use to estimate char-based word embeddings.
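// Worked example (added for clarity): the loop below walks the surface form one
// UTF-8 character at a time. For "früh", the bytes of "ü" are 0xC3 0xBC; since
// (0xC3 >> 5) == 0x06, UTF8Len returns 2 and both bytes are grouped into one
// character, yielding the character sequence {"f", "r", "ü", "h"}.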
161 | unsigned kROOT = get_or_add_word("ROOT");
162 | if (wordIntsToCharInts.count(word_id) == 0) {
163 | if (word_id == kROOT) {
164 | unsigned special_char = get_or_add_char("ROOT");
165 | wordIntsToCharInts[kROOT].push_back(special_char);
166 | } else {
167 | string surface_form = (use_language_prefix)? lang_word.substr(3) : lang_word;
168 |
169 | // Add utf8_characters to charsToInt and intToChars if need be.
170 | unsigned j = 0;
171 | while(j < surface_form.length()) {
172 | string utf8_char = "";
173 | for (unsigned h = j; h < j + UTF8Len(surface_form[j]); h++) {
174 | utf8_char += surface_form[h];
175 | }
176 | j += UTF8Len(surface_form[j]);
177 |
178 | unsigned char_id = get_or_add_char(utf8_char);
179 | wordIntsToCharInts[word_id].push_back(char_id);
180 | }
181 | }
182 | }
183 |
184 | // Add this token's details to the sentence.
185 | current_token.word_id = word_id;
186 | current_token.pos_id = pos_id;
187 | current_token.coarse_pos_id = coarse_pos_id;
188 | current_token.lang_id = lang_id;
189 | }
190 |
191 | inline void load_correct_actions(string file){
192 |
193 | ifstream actionsFile(file);
194 | //correct_act_sent = new vector<vector<unsigned>>();
195 | string lineS;
196 |
197 | int count=-1;
198 | int sentence=-1;
199 | bool initial=false;
200 | bool first=true;
201 | get_or_add_word(BAD0);
202 | get_or_add_word(UNK);
203 | get_or_add_char(BAD0);
204 | assert(maxPos == 0);
205 | assert(maxLang > 0);
206 | maxPos = 1;
207 |
208 | vector<TokenInfo> current_sent;
209 | while (getline(actionsFile, lineS)){
210 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_");
211 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_");
212 | if (lineS.empty()) {
213 | count = 0;
214 | if (!first) {
215 | sentences[sentence] = current_sent;
216 | }
217 |
218 | sentences_count = ++sentence;
219 |
220 | initial = true;
221 | current_sent.clear();
222 | } else if (count == 0) {
223 | first = false;
224 | //stack and buffer, for now, leave it like this.
225 | count = 1;
226 | if (initial) {
227 | // the initial line in each sentence may look like:
228 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT]
229 | // first, get rid of the square brackets.
230 | lineS = lineS.substr(3, lineS.size() - 4);
231 | // read the initial line, token by token "the-det," "cat-noun," ...
232 | istringstream iss(lineS);
233 | do {
234 | string lang_word_pos;
235 | iss >> lang_word_pos;
236 | if (lang_word_pos.size() == 0) { continue; }
237 |
238 | TokenInfo tokenInfo;
239 | ReadTokenInfo(lang_word_pos, tokenInfo);
240 | tokenInfo.training_oov = false; // because this is the training data.
241 | training_vocab.insert(tokenInfo.word_id);
242 | current_sent.push_back(tokenInfo);
243 | } while(iss);
244 | }
245 | initial = false;
246 | }
247 | else if (count == 1){
248 | // find the action string
249 | size_t open_bracket_position = lineS.find('(');
250 | string actionString;
251 | if (PREDICT_ATTACHMENTS_ONLY && open_bracket_position != string::npos) {
252 | actionString = lineS.substr(0, open_bracket_position);
253 | // string label = lineS.substr(open_bracket_position, lineS.length() - open_bracket_position - 1); /*unused*/
254 | } else {
255 | actionString = lineS;
256 | }
257 |
258 | // add the index of this action to the vector of correct actions for this sentence
259 | auto actionIter = find(actions.begin(), actions.end(), actionString);
260 | if (actionIter == actions.end()) {
261 | actions.push_back(actionString);
262 | cerr << "adding " << actionString << " to the list of possible actions" << endl;
263 | actionIter = find(actions.begin(), actions.end(), actionString);
264 | assert(actionIter != actions.end());
265 | }
266 | unsigned actionIndex = distance(actions.begin(), actionIter);
267 | correct_act_sent[sentence].push_back(actionIndex);
268 |
269 | count = 0;
270 | }
271 | }
272 |
273 | // Add the last sentence.
274 | if (current_sent.size() > 0) {
275 | sentences[sentence] = current_sent;
276 | sentences_count = ++sentence;
277 | }
278 |
279 | actionsFile.close();
280 |
281 | // add all pos ids and char ids available now to the corresponding training vocab
282 | for (auto pos: intToPos) {
283 | training_pos_vocab.insert(pos.first);
284 | }
285 | for (auto c: intToChars) {
286 | training_char_vocab.insert(c.first);
287 | }
288 |
289 | cerr << "done" << "\n";
290 | for (auto a: actions) {
291 | cerr << a << "\n";
292 | }
293 | actions_count = actions.size();
294 | cerr << "actions_count:" << actions_count << "\n";
295 | cerr << "maxWord:" << maxWord << "\n";
296 | for (unsigned i = 0; i < maxPos; i++) {
297 | cerr << i << ":" << intToPos[i] << "\n";
298 | }
299 | actions_count = actions.size();
300 |
301 | }
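// Minimal usage sketch (added; hypothetical driver code — lstm-parse.cc is the real caller):
//   cpyp::Corpus corpus;
//   corpus.set_use_language_prefix(true);         // expect tokens like "en:book-NOUN"
//   corpus.load_correct_actions("train.arcstd");  // must run first: it builds the vocabularies
//   corpus.load_correct_actionsDev("dev.arcstd"); // marks words unseen in training as OOV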
302 |
303 | inline string lookup_lang(unsigned id) {
304 | if (id < maxLang) {
305 | return intToLang[id];
306 | } else {
307 | return intToLang[0];
308 | }
309 | }
310 |
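// Note (added for clarity): the get_or_add_* helpers below intern strings as integer
// ids. They rely on map::operator[] value-initializing a missing entry to 0, so
// id == 0 is read as "not seen yet"; counters initialized to 1 (e.g., maxLang, and
// maxPos once load_correct_actions starts) keep id 0 free as a sentinel.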
311 | inline unsigned get_or_add_lang(const string& lang) {
312 | unsigned& id = langToInt[lang];
313 | if (id == 0) {
314 | id = maxLang++;
315 | intToLang[id] = lang;
316 | }
317 | return id;
318 | }
319 |
320 | inline unsigned get_or_add_word(const string& word) {
321 | unsigned& id = wordsToInt[word];
322 | if (id == 0) {
323 | id = maxWord++;
324 | intToWords[id] = word;
325 | }
326 | return id;
327 | }
328 |
329 | inline unsigned get_or_add_pos(const string& pos) {
330 | unsigned& id = posToInt[pos];
331 | if (id == 0) {
332 | id = maxPos++;
333 | intToPos[id] = pos;
334 | }
335 | return id;
336 | }
337 |
338 | inline unsigned get_or_add_char(const string& utf8_char) {
339 | unsigned& id = charsToInt[utf8_char];
340 | if (id == 0) {
341 | id = maxChars++;
342 | intToChars[id] = utf8_char;
343 | }
344 | return id;
345 | }
346 |
347 | inline void load_correct_actionsDev(string file) {
348 | if (training_vocab.size() == 0) {
349 | assert(false && "FATAL: load_correct_actions() MUST be called before load_correct_actionsDev() because otherwise we can't tell if a word in the dev treebank is OOV");
350 | }
351 |
352 | ifstream actionsFile(file);
353 | string lineS;
354 |
355 | assert(maxPos > 1);
356 | assert(maxWord > 3);
357 | int count = -1;
358 | int sentence_id = -1;
359 | bool initial = false;
360 | bool first = true;
361 | vector<TokenInfo> current_sent;
362 | while (getline(actionsFile, lineS)) {
363 | ReplaceStringInPlace(lineS, "-RRB-", "_RRB_");
364 | ReplaceStringInPlace(lineS, "-LRB-", "_LRB_");
365 | if (lineS.empty()) {
366 | // an empty line marks the end of a sentence.
367 | count = 0;
368 | if (!first) {
369 | sentencesDev[sentence_id] = current_sent;
370 | }
371 |
372 | sentencesDev_count = ++sentence_id;
373 |
374 | initial = true;
375 | current_sent.clear();
376 | } else if (count == 0) {
377 | first = false;
378 | //stack and buffer, for now, leave it like this.
379 | count = 1;
380 | if (initial) {
381 | // the initial line in each sentence may look like:
382 | // [][the-det, cat-noun, is-verb, on-adp, the-det, mat-noun, ,-punct, ROOT-ROOT]
383 | // first, get rid of the square brackets.
384 | lineS = lineS.substr(3, lineS.size() - 4);
385 | // read the initial line, token by token "the-det," "cat-noun," ...
386 | istringstream iss(lineS);
387 | do {
388 |
389 | // Read token.
390 | string lang_word_pos;
391 | iss >> lang_word_pos;
392 | if (lang_word_pos.size() == 0) { continue; }
393 |
394 | TokenInfo tokenInfo;
395 | ReadTokenInfo(lang_word_pos, tokenInfo);
396 | // it's an OOV if it didn't appear in the training treebank.
397 | tokenInfo.training_oov = (training_vocab.count(tokenInfo.word_id) == 0);
398 | current_sent.push_back(tokenInfo);
399 |
400 | } while(iss);
401 | }
402 | initial = false;
403 | } else if (count == 1) {
404 | size_t open_bracket_position = lineS.find('(');
405 | string actionString;
406 | if (PREDICT_ATTACHMENTS_ONLY && open_bracket_position != string::npos) {
407 | actionString = lineS.substr(0, open_bracket_position);
408 | // string label = lineS.substr(open_bracket_position, lineS.length() - open_bracket_position - 1); /*unused*/
409 | } else {
410 | actionString = lineS;
411 | }
412 | auto actionIter = find(actions.begin(), actions.end(), actionString);
413 | if (actionIter != actions.end()) {
414 | unsigned actionIndex = distance(actions.begin(), actionIter);
415 | correct_act_sentDev[sentence_id].push_back(actionIndex);
416 | } else {
417 | cerr << "new actionString in dev set: " << actionString << endl;
418 | assert(false);
419 | // TODO: right now, new actions which haven't been observed in training
420 | // are not added to correct_act_sentDev. This may be a problem if the
421 | // training data is small.
422 | }
423 | count=0;
424 | }
425 | }
426 |
427 | // Add the last sentence.
428 | if (current_sent.size() > 0) {
429 | sentencesDev[sentence_id] = current_sent;
430 | sentencesDev_count = ++sentence_id;
431 | }
432 |
433 | actionsFile.close();
434 | }
435 |
436 | void ReplaceStringInPlace(string& subject, const string& search,
437 | const string& replace) {
438 | size_t pos = 0;
439 | while ((pos = subject.find(search, pos)) != string::npos) {
440 | subject.replace(pos, search.length(), replace);
441 | pos += replace.length();
442 | }
443 | }
444 |
445 | string ReplaceStringAndReturnNew(string str, const string& from,
446 | const string& to) {
447 | size_t pos = 0;
448 | while ( (pos = str.find(from, pos)) != string::npos) {
449 | str.replace(pos, from.length(), to);
450 | pos += to.length();
451 | }
452 | return str;
453 | }
454 |
455 | };
456 |
457 | } // namespace
458 |
459 | #endif
460 |
-------------------------------------------------------------------------------- /pretrained_embeddings/average.100++.filtered.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/pretrained_embeddings/average.100++.filtered.gz -------------------------------------------------------------------------------- /pretrained_embeddings/cca.100.filtered.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/pretrained_embeddings/cca.100.filtered.gz -------------------------------------------------------------------------------- /pretrained_embeddings/multiCluster.100.filtered.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/pretrained_embeddings/multiCluster.100.filtered.gz -------------------------------------------------------------------------------- /train-cross-lingual-parsers.tape: --------------------------------------------------------------------------------
1 | #!/usr/bin/env ducttape
2 |
3 | global {
4 | ducttape_experimental_submitters=enable
5 | ducttape_experimental_imports=enable
6 |
7 | # experiments directory
8 | ducttape_output="/usr4/home/wammar/exp-universal-embeddings"
9 |
10 | # language to evaluate on
11 | target_language=(Target: cs="cs" de="de" en="en" es="es" fi="fi" fr="fr" ga="ga" hu="hu" it="it" sv="sv")
12 |
13 | ###########
14 | # CONFIGS #
15 | ###########
16 | use_pretrained="yes" #(UsePretrained: yes="yes" no="")
17 | pretrained_dim=(PretrainedDim: hundred="100" fifty="50")
18 | use_typology=(UseTypology: no="" yes="yes")
19 | pos_dim="12" #(PosDim: default="12" hundred="100" twohundreds="200")
20 | hidden_dim="100" #(HiddenDim: hundred="100" default="64")
21 | lstm_input_dim="100" #(LstmInputDim: hundred="100" default="60")
22 | rel_dim="20" #(RelDim: twenty="20" default="10")
23 | action_dim="20" #(ActionDim: twenty="20" default="16")
24 | unk_prob="0.2" #(UnkProb: default="0.2")
25 | layers="2" #(Layers: default="2")
26 | input_dim="0" #(InputDim: zero="0" hundred="100")
27 | use_spelling=(UseSpelling: no="" yes="yes")
28 |
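# Note (added for clarity): assignments of the form name=(BranchPoint: a="x" b="y")
# declare ducttape branch points, whose branches can be selected when invoking the
# workflow; a plain quoted assignment with the branch point left in a trailing
# comment pins that parameter to a single value.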
29 | ##############
30 | # DATA FILES #
31 | ##############
32 |
33 | # tokenized and lowercased monolingual corpora
34 | corpus_cs="/usr3/home/wammar/corpora/monolingual/plain-cs/ud-v1x40+news.2008-2012+nc.v8.tok.lc.1m"
35 | corpus_de="/usr3/home/wammar/corpora/monolingual/plain-de/ud-v1x40+news.2008-2012+nc.v8+ud.v1.lc.tok.1m"
36 | corpus_en="/usr3/home/wammar/corpora/monolingual/plain-en/ud-v1x40+news.2008-2012+nc.v8+ud.v1.lc.tok.1m"
37 | corpus_es="/usr3/home/wammar/corpora/monolingual/plain-es/ud-v1x40+news.2008-2012+nc.v8+ud.v1.lc.tok.1m"
38 | corpus_fi="/usr3/home/wammar/corpora/monolingual/plain-fi/ud-v1x40+wmt15-mono+ud.v1.tok.lc.1m"
39 | corpus_fr="/usr3/home/wammar/corpora/monolingual/plain-fr/ud-v1x40+news.2008-2012+nc.v8+ud.v1.lc.tok.1m"
40 | corpus_ga="/usr3/home/wammar/corpora/monolingual/plain-ga/ud-v1x40+DGT+EUbookshop+EUconst+GNOME+KDE4+Tatoeba+Ubuntu.tok.lc.1m"
41 | corpus_hu="/usr3/home/wammar/corpora/monolingual/hungarian/ud-v1x40+newscrawl_2011_100K+europarl.tok.lc.1m"
42 | corpus_it="/usr3/home/wammar/corpora/monolingual/italian/ud-v1x40+news_2010_100K+europarl.tok.lc.1m"
43 | corpus_sv="/usr3/home/wammar/corpora/monolingual/plain-sv/ud-v1x40+europarl.sv.lc.1m"
44 |
45 | # wiktionary dump
46 | wiktionary_dump="/usr3/home/wammar/corpora/parallel/wiktionary-bilingual-dicts/wiktionary_de+en_2012-04-01_translations.csv.gz"
47 |
48 | # the alignments used by Jiang in his ACL 2015 paper, available for download at
49 | # https://github.com/jiangfeng1124/acl15-clnndep/tree/master/resources/align
50 | jiangacl15_alignments_de_en="/usr3/home/wammar/corpora/parallel/acl15-clnndep-alignments/de-en.align"
51 | jiangacl15_alignments_es_en="/usr3/home/wammar/corpora/parallel/acl15-clnndep-alignments/es-en.align"
52 | jiangacl15_alignments_fr_en="/usr3/home/wammar/corpora/parallel/acl15-clnndep-alignments/fr-en.align"
53 |
54 | # parallel data to be word-aligned then used to extract bilingual dictionaries.
55 | bitext_cs_en="/usr3/home/wammar/corpora/parallel/czech/100k+aer+kazuya+nc_v8.tok.lc.cs-en"
56 | bitext_de_en="/usr3/home/wammar/corpora/parallel/german/wmt14-syntaxcorpus.tok.lc.de-en"
57 | bitext_es_en="/usr3/home/wammar/corpora/parallel/spanish/nc10+europarl5+kazuya.tok.lc.es-en"
58 | bitext_fi_en="/usr3/home/wammar/corpora/parallel/finnish/wmt-train.tok.lc.fi-en"
59 | bitext_fr_en="/usr3/home/wammar/corpora/parallel/french/kazuya+10k+nc10.tok.lc.fr-en"
60 | bitext_ga_en="/usr3/home/wammar/corpora/parallel/irish/DGT+EUbookshop+EUconst+GNOME+KDE4+Tatoeba+Ubuntu.tok.lc.ga-en"
61 | bitext_hu_en="/usr3/home/wammar/corpora/parallel/hungarian/kazuya.tok.lc.hu-en"
62 | bitext_it_en="/usr3/home/wammar/corpora/parallel/italian/kazuya.tok.lc.it-en"
63 | bitext_sv_en="/usr3/home/wammar/corpora/parallel/swedish/kazuya.tok.lc.sv-en"
64 |
65 | # wikipedia dump files needed to extract parallel wikipedia titles as detailed in
66 | # https://github.com/clab/wikipedia-parallel-titles
67 | wikipedia_base_cs="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/cswiki-20150602"
68 | wikipedia_base_de="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/dewiki-20150602"
69 | wikipedia_base_es="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/eswiki-20150602"
70 | wikipedia_base_fi="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/fiwiki-20150602"
71 | wikipedia_base_fr="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/frwiki-20150602"
72 | wikipedia_base_ga="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/gawiki-20150603"
73 | wikipedia_base_hu="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/huwiki-20150602"
74 | wikipedia_base_it="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/itwiki-20150602"
75 | wikipedia_base_sv="/usr3/home/wammar/corpora/parallel/wikipedia-parallel-titles/svwiki-20150602"
76 |
77 | # multilingual punctuation clusters
78 | # TODO: this file only has de+en+es+fr punctuation. Please add the language-specific
79 | # versions of these punctuation marks for other languages.
80 | punctuation_clusters="/usr3/home/wammar/corpora/parallel/wiktionary-bilingual-dicts/punctuation-mappings"
81 |
82 | # typological properties
83 | typological_properties="/usr0/home/wammar/git/internal-lstm-parser/typological_properties.txt"
84 |
85 | # universal dependency treebanks (train)
86 | treebank_train_cs="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/cs/cs-ud-train.conllu"
87 | treebank_train_de="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/de/de-ud-train.conllu"
88 | treebank_train_en="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/en/en-ud-train.conllu"
89 | treebank_train_es="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/es/es-ud-train.conllu"
90 | treebank_train_fi="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fi/fi-ud-train.conllu"
91 | treebank_train_fr="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fr/fr-ud-train.conllu"
92 | treebank_train_ga="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/ga/ga-ud-train.conllu"
93 | treebank_train_hu="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/hu/hu-ud-train.conllu"
94 | treebank_train_it="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/it/it-ud-train.conllu"
95 | treebank_train_sv="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/sv/sv-ud-train.conllu"
96 |
97 | # universal dependency treebanks (dev)
98 | treebank_dev_cs="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/cs/cs-ud-dev.conllu"
99 | treebank_dev_de="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/de/de-ud-dev.conllu"
100 | treebank_dev_en="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/en/en-ud-dev.conllu"
101 | treebank_dev_es="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/es/es-ud-dev.conllu"
102 | treebank_dev_fi="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fi/fi-ud-dev.conllu"
103 | treebank_dev_fr="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fr/fr-ud-dev.conllu"
104 | treebank_dev_ga="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/ga/ga-ud-dev.conllu"
105 | treebank_dev_hu="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/hu/hu-ud-dev.conllu"
106 | treebank_dev_it="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/it/it-ud-dev.conllu"
107 | treebank_dev_sv="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/sv/sv-ud-dev.conllu"
108 |
109 | # universal dependency treebanks (test)
110 | treebank_test_cs="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/cs/cs-ud-test.conllu"
111 | treebank_test_de="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/de/de-ud-test.conllu"
112 | treebank_test_en="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/en/en-ud-test.conllu"
113 | treebank_test_es="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/es/es-ud-test.conllu"
114 | treebank_test_fi="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fi/fi-ud-test.conllu"
115 | treebank_test_fr="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/fr/fr-ud-test.conllu"
116 | 
treebank_test_ga="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/ga/ga-ud-test.conllu" 117 | treebank_test_hu="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/hu/hu-ud-test.conllu" 118 | treebank_test_it="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/it/it-ud-test.conllu" 119 | treebank_test_sv="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.1/sv/sv-ud-test.conllu" 120 | 121 | # the CCA word embeddings used by Jiang in his ACL15 paper, available at 122 | # https://drive.google.com/file/d/0B1z04ix6jD_Db3REdHlnREpjMmc/view?usp=sharing 123 | jiangacl15_cca_ende_en="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-de/en.50.w2v" 124 | jiangacl15_cca_ende_de="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-de/de.50.w2v" 125 | jiangacl15_cca_enfr_en="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-fr/en.50.w2v" 126 | jiangacl15_cca_enfr_fr="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-fr/fr.50.w2v" 127 | jiangacl15_cca_enes_en="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-es/en.50.w2v" 128 | jiangacl15_cca_enes_es="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/cca/en-es/es.50.w2v" 129 | 130 | # the PROJ word embeddings used by Jiang in his ACL15 paper, available at 131 | # https://drive.google.com/file/d/0B1z04ix6jD_Db3REdHlnREpjMmc/view?usp=sharing 132 | jiangacl15_proj_en="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/projected/en.50" 133 | jiangacl15_proj_de="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/projected/de.50" 134 | jiangacl15_proj_fr="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/projected/fr.50" 135 | jiangacl15_proj_es="/usr3/home/wammar/corpora/monolingual/universal/acl15-cl-wemb/projected/es.50" 136 | 137 | ######### 138 | # TOOLS # 139 | ######### 140 | 141 | lstm_parser="/usr0/home/wammar/git/internal-lstm-parser/" 142 | incremental_word2vec="/usr0/home/wammar/git/incremental-word2vec/" 143 | conll_eval="/usr0/home/wammar/wammar-utils/conllx-eval.v1_8.pl" 144 | wammar_utils="/usr0/home/wammar/wammar-utils/" 145 | parse_oracle_arc_std_swap_jar="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.0/treebanks/ParserOracleArcStdWithSwap.jar" 146 | #parse_oracle_arc_std_swap_jar="/usr3/home/wammar/corpora/treebanks/universal-dependencies-1.0/treebanks/ParseOracleArcStd.jar" 147 | wikipedia_parallel_titles="/usr0/home/wammar/git/wikipedia-parallel-titles/" 148 | fast_align="use_giza" #"/usr0/home/wammar/git/fast_align/" 149 | cdec="/usr0/home/wammar/cdec/" 150 | } 151 | 152 | # TODO: controlled experiments between delex and lex 153 | # TODO: controlled experiments to decide which subset of typological properties are best 154 | plan Full { 155 | #reach TrainEmbeddingsOfWordClusters 156 | reach ProcessIndividualTreebanks 157 | reach AggregateTrainTreebanks 158 | reach AggregateDevTreebanks 159 | } 160 | 161 | task EvaluateParserOnTest 162 | :: conll_eval=@ 163 | :: use_typology=@ 164 | < train=(TrainLang: cs=$treebank_train_cs_arcstdswap@ProcessIndividualTreebanks 165 | de=$treebank_train_de_arcstdswap@ProcessIndividualTreebanks 166 | en=$treebank_train_en_arcstdswap@ProcessIndividualTreebanks 167 | es=$treebank_train_es_arcstdswap@ProcessIndividualTreebanks 168 | fi=$treebank_train_fi_arcstdswap@ProcessIndividualTreebanks 169 | fr=$treebank_train_fr_arcstdswap@ProcessIndividualTreebanks 170 | 
ga=$treebank_train_ga_arcstdswap@ProcessIndividualTreebanks 171 | hu=$treebank_train_hu_arcstdswap@ProcessIndividualTreebanks 172 | it=$treebank_train_it_arcstdswap@ProcessIndividualTreebanks 173 | sv=$treebank_train_sv_arcstdswap@ProcessIndividualTreebanks 174 | v1_but_en=$v1_but_en@AggregateTrainTreebanks 175 | v1_but_cs=$v1_but_cs@AggregateTrainTreebanks 176 | v1_but_de=$v1_but_de@AggregateTrainTreebanks 177 | v1_but_es=$v1_but_es@AggregateTrainTreebanks 178 | v1_but_fi=$v1_but_fi@AggregateTrainTreebanks 179 | v1_but_fr=$v1_but_fr@AggregateTrainTreebanks 180 | v1_but_ga=$v1_but_ga@AggregateTrainTreebanks 181 | v1_but_hu=$v1_but_hu@AggregateTrainTreebanks 182 | v1_but_it=$v1_but_it@AggregateTrainTreebanks 183 | v1_but_sv=$v1_but_sv@AggregateTrainTreebanks 184 | v1=$v1@AggregateTrainTreebanks 185 | ) 186 | < embeddings=(EmbeddingsType: 187 | jiangacl15_cca_ende=$ende_embeddings@PrepareJiangAcl15CcaEmbeddings 188 | jiangacl15_cca_enes=$enes_embeddings@PrepareJiangAcl15CcaEmbeddings 189 | jiangacl15_cca_enfr=$enfr_embeddings@PrepareJiangAcl15CcaEmbeddings 190 | jiangacl15_proj=$embeddings@PrepareJiangAcl15ProjEmbeddings 191 | word_clusters=$cluster_embeddings@TrainEmbeddingsOfWordClusters 192 | none="") 193 | :: use_pretrained=@ 194 | :: lstm_parser=@ 195 | < model=$latest_model@TrainParser 196 | < test=(EvalLang: cs=$treebank_test_cs_arcstdswap@ProcessIndividualTreebanks 197 | de=$treebank_test_de_arcstdswap@ProcessIndividualTreebanks 198 | en=$treebank_test_en_arcstdswap@ProcessIndividualTreebanks 199 | es=$treebank_test_es_arcstdswap@ProcessIndividualTreebanks 200 | fi=$treebank_test_fi_arcstdswap@ProcessIndividualTreebanks 201 | fr=$treebank_test_fr_arcstdswap@ProcessIndividualTreebanks 202 | ga=$treebank_test_ga_arcstdswap@ProcessIndividualTreebanks 203 | hu=$treebank_test_hu_arcstdswap@ProcessIndividualTreebanks 204 | it=$treebank_test_it_arcstdswap@ProcessIndividualTreebanks 205 | sv=$treebank_test_sv_arcstdswap@ProcessIndividualTreebanks) 206 | < test_conll=(EvalLang: 207 | cs=$treebank_test_cs_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 208 | de=$treebank_test_de_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 209 | en=$treebank_test_en_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 210 | es=$treebank_test_es_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 211 | fi=$treebank_test_fi_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 212 | fr=$treebank_test_fr_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 213 | ga=$treebank_test_ga_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 214 | hu=$treebank_test_hu_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 215 | it=$treebank_test_it_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 216 | sv=$treebank_test_sv_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks) 217 | > predictions 218 | > score 219 | > stderr 220 | :: pos_dim=@ 221 | :: hidden_dim=@ 222 | :: lstm_input_dim=@ 223 | :: rel_dim=@ 224 | :: action_dim=@ 225 | :: unk_prob=@ 226 | :: layers=@ 227 | :: input_dim=@ 228 | { 229 | # rebuild the parser 230 | pushd $lstm_parser 231 | make -j2 232 | popd 233 | 234 | # construct the command 235 | command="$lstm_parser/parser/lstm-parse -T $train --model $model -d $test -P" 236 | 237 | if [ $input_dim ]; then 238 | command="$command --input_dim $input_dim " 239 | fi 240 | 241 | if [ $use_pretrained ]; then 242 | command="$command -w $embeddings --pretrained_dim 100 " 243 
| fi 244 | 245 | if [ $use_typology ]; then 246 | command="$command -y $lstm_parser/typological_properties.txt " 247 | fi 248 | 249 | if [ $pos_dim ]; then 250 | command="$command --pos_dim $pos_dim " 251 | fi 252 | 253 | if [ $hidden_dim ]; then 254 | command="$command --hidden_dim $hidden_dim " 255 | fi 256 | 257 | if [ $lstm_input_dim ]; then 258 | command="$command --lstm_input_dim $lstm_input_dim " 259 | fi 260 | 261 | if [ $rel_dim ]; then 262 | command="$command --rel_dim $rel_dim " 263 | fi 264 | 265 | if [ $action_dim ]; then 266 | command="$command --action_dim $action_dim " 267 | fi 268 | 269 | if [ $unk_prob ]; then 270 | command="$command --unk_prob $unk_prob " 271 | fi 272 | 273 | if [ $layers ]; then 274 | command="$command --layers $layers " 275 | fi 276 | 277 | # execute command 278 | echo "executing $command..." 279 | $command 2> $stderr > $predictions 280 | 281 | # evaluate 282 | perl $conll_eval -g $test_conll -s $predictions -q > $score 283 | } 284 | 285 | task EvaluateParserOnDev 286 | :: conll_eval=@ 287 | < train=(TrainLang: cs=$treebank_train_cs_arcstdswap@ProcessIndividualTreebanks 288 | de=$treebank_train_de_arcstdswap@ProcessIndividualTreebanks 289 | en=$treebank_train_en_arcstdswap@ProcessIndividualTreebanks 290 | es=$treebank_train_es_arcstdswap@ProcessIndividualTreebanks 291 | fi=$treebank_train_fi_arcstdswap@ProcessIndividualTreebanks 292 | fr=$treebank_train_fr_arcstdswap@ProcessIndividualTreebanks 293 | ga=$treebank_train_ga_arcstdswap@ProcessIndividualTreebanks 294 | hu=$treebank_train_hu_arcstdswap@ProcessIndividualTreebanks 295 | it=$treebank_train_it_arcstdswap@ProcessIndividualTreebanks 296 | sv=$treebank_train_sv_arcstdswap@ProcessIndividualTreebanks 297 | v1_but_en=$v1_but_en@AggregateTrainTreebanks 298 | v1_but_cs=$v1_but_cs@AggregateTrainTreebanks 299 | v1_but_de=$v1_but_de@AggregateTrainTreebanks 300 | v1_but_es=$v1_but_es@AggregateTrainTreebanks 301 | v1_but_fi=$v1_but_fi@AggregateTrainTreebanks 302 | v1_but_fr=$v1_but_fr@AggregateTrainTreebanks 303 | v1_but_ga=$v1_but_ga@AggregateTrainTreebanks 304 | v1_but_hu=$v1_but_hu@AggregateTrainTreebanks 305 | v1_but_it=$v1_but_it@AggregateTrainTreebanks 306 | v1_but_sv=$v1_but_sv@AggregateTrainTreebanks 307 | v1=$v1@AggregateTrainTreebanks 308 | ) 309 | < embeddings=(EmbeddingsType: 310 | jiangacl15_cca_ende=$ende_embeddings@PrepareJiangAcl15CcaEmbeddings 311 | jiangacl15_cca_enes=$enes_embeddings@PrepareJiangAcl15CcaEmbeddings 312 | jiangacl15_cca_enfr=$enfr_embeddings@PrepareJiangAcl15CcaEmbeddings 313 | jiangacl15_proj=$embeddings@PrepareJiangAcl15ProjEmbeddings 314 | word_clusters=$cluster_embeddings@TrainEmbeddingsOfWordClusters 315 | none="") 316 | :: use_pretrained=@ 317 | :: lstm_parser=@ 318 | < model=$latest_model@TrainParser 319 | < dev=(EvalLang: cs=$treebank_dev_cs_arcstdswap@ProcessIndividualTreebanks 320 | de=$treebank_dev_de_arcstdswap@ProcessIndividualTreebanks 321 | en=$treebank_dev_en_arcstdswap@ProcessIndividualTreebanks 322 | es=$treebank_dev_es_arcstdswap@ProcessIndividualTreebanks 323 | fi=$treebank_dev_fi_arcstdswap@ProcessIndividualTreebanks 324 | fr=$treebank_dev_fr_arcstdswap@ProcessIndividualTreebanks 325 | ga=$treebank_dev_ga_arcstdswap@ProcessIndividualTreebanks 326 | hu=$treebank_dev_hu_arcstdswap@ProcessIndividualTreebanks 327 | it=$treebank_dev_it_arcstdswap@ProcessIndividualTreebanks 328 | sv=$treebank_dev_sv_arcstdswap@ProcessIndividualTreebanks) 329 | < dev_conll=(EvalLang: cs=$treebank_dev_cs_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 330 | 
de=$treebank_dev_de_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 331 | en=$treebank_dev_en_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 332 | es=$treebank_dev_es_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 333 | fi=$treebank_dev_fi_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 334 | fr=$treebank_dev_fr_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 335 | ga=$treebank_dev_ga_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 336 | hu=$treebank_dev_hu_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 337 | it=$treebank_dev_it_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks 338 | sv=$treebank_dev_sv_conllx_lc_coarse_langprefix_clusterid@ProcessIndividualTreebanks) 339 | > predictions 340 | :: use_typology=@ 341 | > score 342 | > stderr 343 | :: pos_dim=@ 344 | :: hidden_dim=@ 345 | :: lstm_input_dim=@ 346 | :: rel_dim=@ 347 | :: action_dim=@ 348 | :: unk_prob=@ 349 | :: layers=@ 350 | :: input_dim=@ 351 | 352 | { 353 | # rebuild the parser 354 | pushd $lstm_parser 355 | make -j2 356 | popd 357 | 358 | # construct the command 359 | command="$lstm_parser/parser/lstm-parse -T $train --model $model -d $dev -P" 360 | 361 | if [ $input_dim ]; then 362 | command="$command --input_dim $input_dim " 363 | fi 364 | 365 | if [ $use_pretrained ]; then 366 | command="$command -w $embeddings --pretrained_dim 100 " 367 | fi 368 | 369 | if [ $use_typology ]; then 370 | command="$command -y $lstm_parser/typological_properties.txt " 371 | fi 372 | 373 | if [ $pos_dim ]; then 374 | command="$command --pos_dim $pos_dim " 375 | fi 376 | 377 | if [ $hidden_dim ]; then 378 | command="$command --hidden_dim $hidden_dim " 379 | fi 380 | 381 | if [ $lstm_input_dim ]; then 382 | command="$command --lstm_input_dim $lstm_input_dim " 383 | fi 384 | 385 | if [ $rel_dim ]; then 386 | command="$command --rel_dim $rel_dim " 387 | fi 388 | 389 | if [ $action_dim ]; then 390 | command="$command --action_dim $action_dim " 391 | fi 392 | 393 | if [ $unk_prob ]; then 394 | command="$command --unk_prob $unk_prob " 395 | fi 396 | 397 | if [ $layers ]; then 398 | command="$command --layers $layers " 399 | fi 400 | 401 | # execute command 402 | echo "executing $command..." 
403 | $command 2> $stderr > $predictions 404 | 405 | # evaluate 406 | perl $conll_eval -g $dev_conll -s $predictions -q > $score 407 | } 408 | 409 | task TrainParser 410 | :: use_typology=@ 411 | :: lstm_parser=@ 412 | < global_clustermap=$global_clustermap@ProcessIndividualTreebanks 413 | < train=(TrainLang: cs=$treebank_train_cs_arcstdswap@ProcessIndividualTreebanks 414 | de=$treebank_train_de_arcstdswap@ProcessIndividualTreebanks 415 | en=$treebank_train_en_arcstdswap@ProcessIndividualTreebanks 416 | es=$treebank_train_es_arcstdswap@ProcessIndividualTreebanks 417 | fi=$treebank_train_fi_arcstdswap@ProcessIndividualTreebanks 418 | fr=$treebank_train_fr_arcstdswap@ProcessIndividualTreebanks 419 | ga=$treebank_train_ga_arcstdswap@ProcessIndividualTreebanks 420 | hu=$treebank_train_hu_arcstdswap@ProcessIndividualTreebanks 421 | it=$treebank_train_it_arcstdswap@ProcessIndividualTreebanks 422 | sv=$treebank_train_sv_arcstdswap@ProcessIndividualTreebanks 423 | v1_but_en=$v1_but_en@AggregateTrainTreebanks 424 | v1_but_cs=$v1_but_cs@AggregateTrainTreebanks 425 | v1_but_de=$v1_but_de@AggregateTrainTreebanks 426 | v1_but_es=$v1_but_es@AggregateTrainTreebanks 427 | v1_but_fi=$v1_but_fi@AggregateTrainTreebanks 428 | v1_but_fr=$v1_but_fr@AggregateTrainTreebanks 429 | v1_but_ga=$v1_but_ga@AggregateTrainTreebanks 430 | v1_but_hu=$v1_but_hu@AggregateTrainTreebanks 431 | v1_but_it=$v1_but_it@AggregateTrainTreebanks 432 | v1_but_sv=$v1_but_sv@AggregateTrainTreebanks 433 | v1=$v1@AggregateTrainTreebanks 434 | ) 435 | < dev=(TrainLang: cs=$treebank_dev_cs_arcstdswap@ProcessIndividualTreebanks 436 | de=$treebank_dev_de_arcstdswap@ProcessIndividualTreebanks 437 | en=$treebank_dev_en_arcstdswap@ProcessIndividualTreebanks 438 | es=$treebank_dev_es_arcstdswap@ProcessIndividualTreebanks 439 | fi=$treebank_dev_fi_arcstdswap@ProcessIndividualTreebanks 440 | fr=$treebank_dev_fr_arcstdswap@ProcessIndividualTreebanks 441 | ga=$treebank_dev_ga_arcstdswap@ProcessIndividualTreebanks 442 | hu=$treebank_dev_hu_arcstdswap@ProcessIndividualTreebanks 443 | it=$treebank_dev_it_arcstdswap@ProcessIndividualTreebanks 444 | sv=$treebank_dev_sv_arcstdswap@ProcessIndividualTreebanks 445 | v1_but_en=$v1_but_en@AggregateDevTreebanks 446 | v1_but_cs=$v1_but_cs@AggregateDevTreebanks 447 | v1_but_de=$v1_but_de@AggregateDevTreebanks 448 | v1_but_es=$v1_but_es@AggregateDevTreebanks 449 | v1_but_fi=$v1_but_fi@AggregateDevTreebanks 450 | v1_but_fr=$v1_but_fr@AggregateDevTreebanks 451 | v1_but_ga=$v1_but_ga@AggregateDevTreebanks 452 | v1_but_hu=$v1_but_hu@AggregateDevTreebanks 453 | v1_but_it=$v1_but_it@AggregateDevTreebanks 454 | v1_but_sv=$v1_but_sv@AggregateDevTreebanks 455 | v1=$v1@AggregateDevTreebanks 456 | ) 457 | < embeddings=(EmbeddingsType: 458 | jiangacl15_cca_ende=$ende_embeddings@PrepareJiangAcl15CcaEmbeddings 459 | jiangacl15_cca_enes=$enes_embeddings@PrepareJiangAcl15CcaEmbeddings 460 | jiangacl15_cca_enfr=$enfr_embeddings@PrepareJiangAcl15CcaEmbeddings 461 | jiangacl15_proj=$embeddings@PrepareJiangAcl15ProjEmbeddings 462 | word_clusters=$cluster_embeddings@TrainEmbeddingsOfWordClusters 463 | none="") 464 | :: use_pretrained=@ 465 | > stderr 466 | > stdout 467 | > latest_model 468 | :: pos_dim=@ 469 | :: hidden_dim=@ 470 | :: lstm_input_dim=@ 471 | :: rel_dim=@ 472 | :: action_dim=@ 473 | :: unk_prob=@ 474 | :: layers=@ 475 | :: input_dim=@ 476 | :: use_spelling=@ 477 | { 478 | # rebuild the parser 479 | pushd $lstm_parser 480 | make -j2 481 | popd 482 | 483 | # construct the command 484 | 
command="$lstm_parser/parser/lstm-parse -T $train -d $dev -t -P --clusters $global_clustermap" 485 | 486 | if [ $input_dim ]; then 487 | command="$command --input_dim $input_dim " 488 | fi 489 | 490 | if [ $use_pretrained ]; then 491 | command="$command -w $embeddings --pretrained_dim 100 " 492 | fi 493 | 494 | if [ $use_typology ]; then 495 | command="$command -y $lstm_parser/typological_properties.txt " 496 | fi 497 | 498 | if [ $pos_dim ]; then 499 | command="$command --pos_dim $pos_dim " 500 | fi 501 | 502 | if [ $hidden_dim ]; then 503 | command="$command --hidden_dim $hidden_dim " 504 | fi 505 | 506 | if [ $lstm_input_dim ]; then 507 | command="$command --lstm_input_dim $lstm_input_dim " 508 | fi 509 | 510 | if [ $rel_dim ]; then 511 | command="$command --rel_dim $rel_dim " 512 | fi 513 | 514 | if [ $action_dim ]; then 515 | command="$command --action_dim $action_dim " 516 | fi 517 | 518 | if [ $unk_prob ]; then 519 | command="$command --unk_prob $unk_prob " 520 | fi 521 | 522 | if [ $use_spelling ]; then 523 | command="$command --use_spelling " 524 | fi 525 | 526 | if [ $layers ]; then 527 | command="$command --layers $layers " 528 | fi 529 | 530 | # execute command 531 | echo "executing $command..." 532 | $command 2> $stderr > $stdout 533 | } 534 | 535 | # Aggregate treebanks of two or more source languages 536 | # arrow #14, #15 537 | task AggregateTrainTreebanks 538 | ## individual files (arcstdswap) 539 | < en=$treebank_train_en_arcstdswap@ProcessIndividualTreebanks 540 | < cs=$treebank_train_cs_arcstdswap@ProcessIndividualTreebanks 541 | < de=$treebank_train_de_arcstdswap@ProcessIndividualTreebanks 542 | < es=$treebank_train_es_arcstdswap@ProcessIndividualTreebanks 543 | < fi=$treebank_train_fi_arcstdswap@ProcessIndividualTreebanks 544 | < fr=$treebank_train_fr_arcstdswap@ProcessIndividualTreebanks 545 | < ga=$treebank_train_ga_arcstdswap@ProcessIndividualTreebanks 546 | < hu=$treebank_train_hu_arcstdswap@ProcessIndividualTreebanks 547 | < it=$treebank_train_it_arcstdswap@ProcessIndividualTreebanks 548 | < sv=$treebank_train_sv_arcstdswap@ProcessIndividualTreebanks 549 | 550 | # outputs files (arcstdswp) 551 | 552 | > v1_but_en 553 | > v1_but_sv 554 | > v1_but_cs 555 | > v1_but_de 556 | > v1_but_es 557 | > v1_but_fi 558 | > v1_but_fr 559 | > v1_but_ga 560 | > v1_but_hu 561 | > v1_but_it 562 | > v1 563 | 564 | { 565 | # train with nine treebanks (arcstdswp) 566 | cat $cs $de $es $fi $fr $ga $hu $it $sv > $v1_but_en # target = any but en 567 | cat $en $cs $de $es $fi $fr $ga $hu $it > $v1_but_sv # target = any but cs, de, es, fi, fr, ga, hu, it 568 | cat $en $de $es $fi $fr $ga $hu $it $sv > $v1_but_cs # target = cs 569 | cat $en $cs $es $fi $fr $ga $hu $it $sv > $v1_but_de # target = de 570 | cat $en $cs $de $fi $fr $ga $hu $it $sv > $v1_but_es # target = es 571 | cat $en $cs $de $es $fr $ga $hu $it $sv > $v1_but_fi # target = fi 572 | cat $en $cs $de $es $fi $ga $hu $it $sv > $v1_but_fr # target = fr 573 | cat $en $cs $de $es $fi $fr $hu $it $sv > $v1_but_ga # target = ga 574 | cat $en $cs $de $es $fi $fr $ga $it $sv > $v1_but_hu # target = hu 575 | cat $en $cs $de $es $fi $fr $ga $hu $sv > $v1_but_it # target = it 576 | 577 | # train with ten treebanks (arcstdswp) 578 | cat $en $cs $de $es $fi $fr $ga $hu $it $sv > $v1 579 | 580 | cat v1_but_cs | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_cs_nolabel 581 | cat v1_but_de | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_de_nolabel 582 | cat v1_but_en | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_en_nolabel 583 | cat 
584 | cat $v1_but_fi | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_fi_nolabel 585 | cat $v1_but_fr | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_fr_nolabel 586 | cat $v1_but_ga | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_ga_nolabel 587 | cat $v1_but_hu | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_hu_nolabel 588 | cat $v1_but_it | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_it_nolabel 589 | cat $v1_but_sv | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_sv_nolabel 590 | cat $v1 | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_nolabel 591 | } 592 | 593 | 594 | # Aggregate dev sections of the treebanks of two or more source languages 595 | # arrow #14, #15 596 | task AggregateDevTreebanks 597 | ## input files (arcstdswap) 598 | < en=$treebank_dev_en_arcstdswap@ProcessIndividualTreebanks 599 | < cs=$treebank_dev_cs_arcstdswap@ProcessIndividualTreebanks 600 | < de=$treebank_dev_de_arcstdswap@ProcessIndividualTreebanks 601 | < es=$treebank_dev_es_arcstdswap@ProcessIndividualTreebanks 602 | < fi=$treebank_dev_fi_arcstdswap@ProcessIndividualTreebanks 603 | < fr=$treebank_dev_fr_arcstdswap@ProcessIndividualTreebanks 604 | < ga=$treebank_dev_ga_arcstdswap@ProcessIndividualTreebanks 605 | < hu=$treebank_dev_hu_arcstdswap@ProcessIndividualTreebanks 606 | < it=$treebank_dev_it_arcstdswap@ProcessIndividualTreebanks 607 | < sv=$treebank_dev_sv_arcstdswap@ProcessIndividualTreebanks 608 | 609 | ## output files (arcstdswap) 610 | > v1_but_sv 611 | > v1_but_cs 612 | > v1_but_de 613 | > v1_but_en 614 | > v1_but_es 615 | > v1_but_fi 616 | > v1_but_fr 617 | > v1_but_ga 618 | > v1_but_hu 619 | > v1_but_it 620 | > v1 621 | 622 | :: wammar_utils=@ 623 | { 624 | # only use the first 150 sentences from each language (arcstdswap) 625 | python $wammar_utils/clip-file-after-kth-match.py -i $cs -o cs.150 -k 150 -p "\n\n" 626 | python $wammar_utils/clip-file-after-kth-match.py -i $de -o de.150 -k 150 -p "\n\n" 627 | python $wammar_utils/clip-file-after-kth-match.py -i $en -o en.150 -k 150 -p "\n\n" 628 | python $wammar_utils/clip-file-after-kth-match.py -i $es -o es.150 -k 150 -p "\n\n" 629 | python $wammar_utils/clip-file-after-kth-match.py -i $fi -o fi.150 -k 150 -p "\n\n" 630 | python $wammar_utils/clip-file-after-kth-match.py -i $fr -o fr.150 -k 150 -p "\n\n" 631 | python $wammar_utils/clip-file-after-kth-match.py -i $ga -o ga.150 -k 150 -p "\n\n" 632 | python $wammar_utils/clip-file-after-kth-match.py -i $hu -o hu.150 -k 150 -p "\n\n" 633 | python $wammar_utils/clip-file-after-kth-match.py -i $it -o it.150 -k 150 -p "\n\n" 634 | python $wammar_utils/clip-file-after-kth-match.py -i $sv -o sv.150 -k 150 -p "\n\n" 635 | 636 | # dev with nine treebanks 637 | cat cs.150 de.150 es.150 fi.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1_but_en # target = en 638 | cat en.150 cs.150 de.150 es.150 fi.150 fr.150 ga.150 hu.150 it.150 > $v1_but_sv # target = sv 639 | cat en.150 de.150 es.150 fi.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1_but_cs # target = cs 640 | cat en.150 cs.150 es.150 fi.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1_but_de # target = de 641 | cat en.150 cs.150 de.150 fi.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1_but_es # target = es 642 | cat en.150 cs.150 de.150 es.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1_but_fi # target = fi 643 | cat en.150 cs.150 de.150 es.150 fi.150 ga.150 hu.150 it.150 sv.150 > $v1_but_fr # target = fr 644 | cat en.150 cs.150 de.150 es.150 fi.150 fr.150 hu.150 it.150 sv.150 > $v1_but_ga # target = ga 645 | cat en.150 cs.150 de.150 es.150 fi.150 fr.150 ga.150 it.150 sv.150 > $v1_but_hu # target = hu
646 | cat en.150 cs.150 de.150 es.150 fi.150 fr.150 ga.150 hu.150 sv.150 > $v1_but_it # target = it 647 | 648 | # dev with ten treebanks 649 | cat en.150 cs.150 de.150 es.150 fi.150 fr.150 ga.150 hu.150 it.150 sv.150 > $v1 650 | 651 | # no-label versions 652 | cat $v1_but_cs | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_cs_nolabel 653 | cat $v1_but_de | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_de_nolabel 654 | cat $v1_but_en | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_en_nolabel 655 | cat $v1_but_es | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_es_nolabel 656 | cat $v1_but_fi | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_fi_nolabel 657 | cat $v1_but_fr | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_fr_nolabel 658 | cat $v1_but_ga | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_ga_nolabel 659 | cat $v1_but_hu | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_hu_nolabel 660 | cat $v1_but_it | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_it_nolabel 661 | cat $v1_but_sv | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_but_sv_nolabel 662 | cat $v1 | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > v1_nolabel 663 | 664 | } 665 | 666 | # Process individual conllu files in the universal dependencies treebanks, transforming each into 667 | # the arc-standard format (with swap actions). 668 | # arrow #12 669 | task ProcessIndividualTreebanks 670 | :: wammar_utils=@ 671 | :: parse_oracle_arc_std_swap_jar=@ 672 | 673 | :: treebank_train_cs=@ 674 | :: treebank_train_de=@ 675 | :: treebank_train_en=@ 676 | :: treebank_train_es=@ 677 | :: treebank_train_fi=@ 678 | :: treebank_train_fr=@ 679 | :: treebank_train_ga=@ 680 | :: treebank_train_hu=@ 681 | :: treebank_train_it=@ 682 | :: treebank_train_sv=@ 683 | 684 | :: treebank_dev_cs=@ 685 | :: treebank_dev_de=@ 686 | :: treebank_dev_en=@ 687 | :: treebank_dev_es=@ 688 | :: treebank_dev_fi=@ 689 | :: treebank_dev_fr=@ 690 | :: treebank_dev_ga=@ 691 | :: treebank_dev_hu=@ 692 | :: treebank_dev_it=@ 693 | :: treebank_dev_sv=@ 694 | 695 | :: treebank_test_cs=@ 696 | :: treebank_test_de=@ 697 | :: treebank_test_en=@ 698 | :: treebank_test_es=@ 699 | :: treebank_test_fi=@ 700 | :: treebank_test_fr=@ 701 | :: treebank_test_ga=@ 702 | :: treebank_test_hu=@ 703 | :: treebank_test_it=@ 704 | :: treebank_test_sv=@ 705 | 706 | < word_clusters=$augmented_word_clusters@TrainEmbeddingsOfWordClusters 707 | 708 | > treebank_train_cs_arcstdswap 709 | > treebank_train_de_arcstdswap 710 | > treebank_train_en_arcstdswap 711 | > treebank_train_es_arcstdswap 712 | > treebank_train_fi_arcstdswap 713 | > treebank_train_fr_arcstdswap 714 | > treebank_train_ga_arcstdswap 715 | > treebank_train_hu_arcstdswap 716 | > treebank_train_it_arcstdswap 717 | > treebank_train_sv_arcstdswap 718 | 719 | > treebank_dev_cs_arcstdswap 720 | > treebank_dev_de_arcstdswap 721 | > treebank_dev_en_arcstdswap 722 | > treebank_dev_es_arcstdswap 723 | > treebank_dev_fi_arcstdswap 724 | > treebank_dev_fr_arcstdswap 725 | > treebank_dev_ga_arcstdswap 726 | > treebank_dev_hu_arcstdswap 727 | > treebank_dev_it_arcstdswap 728 | > treebank_dev_sv_arcstdswap 729 | 730 | > treebank_test_cs_arcstdswap 731 | > treebank_test_de_arcstdswap 732 | > treebank_test_en_arcstdswap 733 | > treebank_test_es_arcstdswap 734 | > treebank_test_fi_arcstdswap 735 | > treebank_test_fr_arcstdswap 736 | > treebank_test_ga_arcstdswap 737 | > treebank_test_hu_arcstdswap 738 | > treebank_test_it_arcstdswap 739 | > treebank_test_sv_arcstdswap
740 | 741 | ## clustermap 742 | > global_clustermap 743 | 744 | ## clusterids 745 | > treebank_dev_cs_conllx_lc_coarse_langprefix_clusterid 746 | > treebank_dev_de_conllx_lc_coarse_langprefix_clusterid 747 | > treebank_dev_en_conllx_lc_coarse_langprefix_clusterid 748 | > treebank_dev_es_conllx_lc_coarse_langprefix_clusterid 749 | > treebank_dev_fi_conllx_lc_coarse_langprefix_clusterid 750 | > treebank_dev_fr_conllx_lc_coarse_langprefix_clusterid 751 | > treebank_dev_ga_conllx_lc_coarse_langprefix_clusterid 752 | > treebank_dev_hu_conllx_lc_coarse_langprefix_clusterid 753 | > treebank_dev_it_conllx_lc_coarse_langprefix_clusterid 754 | > treebank_dev_sv_conllx_lc_coarse_langprefix_clusterid 755 | 756 | > treebank_test_cs_conllx_lc_coarse_langprefix_clusterid 757 | > treebank_test_de_conllx_lc_coarse_langprefix_clusterid 758 | > treebank_test_en_conllx_lc_coarse_langprefix_clusterid 759 | > treebank_test_es_conllx_lc_coarse_langprefix_clusterid 760 | > treebank_test_fi_conllx_lc_coarse_langprefix_clusterid 761 | > treebank_test_fr_conllx_lc_coarse_langprefix_clusterid 762 | > treebank_test_ga_conllx_lc_coarse_langprefix_clusterid 763 | > treebank_test_hu_conllx_lc_coarse_langprefix_clusterid 764 | > treebank_test_it_conllx_lc_coarse_langprefix_clusterid 765 | > treebank_test_sv_conllx_lc_coarse_langprefix_clusterid 766 | 767 | { 768 | # Setting the class path didn't work for me. Just copy the jar file to the current directory. 769 | cp $parse_oracle_arc_std_swap_jar ./ParseOracleArcStdSwap.jar 770 | 771 | # For each language and each of train/dev/test: 772 | # * Remove phrases and comments, which were recently introduced in the conllu format and are 773 | # supported by neither the arc-std oracle parse tool nor the LSTM parser. 774 | # * Lowercase everything. 775 | # * Swap the coarse vs. fine POS tag columns. We want to use the coarse one because it's universal, 776 | # but the arcstd oracle parse tool uses the fine POS. Hence the need for this step. 777 | # * Extract oracle actions for training a transition-based parser.
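# As a concrete (hypothetical) illustration, the FORM column of one French token, "Chien", evolves as follows:
#   after lowercasing:            chien
#   after the language prefix:    fr:chien
#   after cluster substitution:   en:dog_|_fr:chien   (when the word belongs to a multilingual cluster;
#                                                      see the example under MakeMultilingualWordClusters below)
# Meanwhile, the column swap moves the universal (coarse) POS tag into the fine-POS slot the oracle tool
# reads, and the oracle step turns each gold tree into the sequence of arc-standard (with swap) transitions
# that the LSTM parser is trained on.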
778 | 779 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_cs -o conllx 780 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 781 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 782 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "cs:" 783 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_cs_clustermap 784 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_cs_arcstdswap 785 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_cs_conllx_lc_coarse_langprefix_clusterid 786 | cat $treebank_train_cs_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_cs_arcstdswap_nolabel 787 | 788 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_de -o conllx 789 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 790 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 791 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "de:" 792 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_de_clustermap 793 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_de_arcstdswap 794 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_de_conllx_lc_coarse_langprefix_clusterid 795 | cat $treebank_train_de_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_de_arcstdswap_nolabel 796 | 797 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_en -o conllx 798 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 799 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 800 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "en:" 801 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_en_clustermap 802 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_en_arcstdswap 803 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_en_conllx_lc_coarse_langprefix_clusterid 804 | cat $treebank_train_en_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_en_arcstdswap_nolabel 805 | 806 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_es -o conllx 807 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 808 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 809 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "es:" 810 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_es_clustermap 811 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > 
$treebank_train_es_arcstdswap 812 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_es_conllx_lc_coarse_langprefix_clusterid 813 | cat $treebank_train_es_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_es_arcstdswap_nolabel 814 | 815 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_fi -o conllx 816 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 817 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 818 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fi:" 819 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_fi_clustermap 820 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_fi_arcstdswap 821 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_fi_conllx_lc_coarse_langprefix_clusterid 822 | cat $treebank_train_fi_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_fi_arcstdswap_nolabel 823 | 824 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_fr -o conllx 825 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 826 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 827 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fr:" 828 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_fr_clustermap 829 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_fr_arcstdswap 830 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_fr_conllx_lc_coarse_langprefix_clusterid 831 | cat $treebank_train_fr_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_fr_arcstdswap_nolabel 832 | 833 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_ga -o conllx 834 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 835 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 836 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "ga:" 837 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_ga_clustermap 838 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_ga_arcstdswap 839 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_ga_conllx_lc_coarse_langprefix_clusterid 840 | cat $treebank_train_ga_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_ga_arcstdswap_nolabel 841 | 842 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_hu -o conllx 843 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 844 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 845 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "hu:" 846 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o 
conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_hu_clustermap 847 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_hu_arcstdswap 848 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_hu_conllx_lc_coarse_langprefix_clusterid 849 | cat $treebank_train_hu_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_hu_arcstdswap_nolabel 850 | 851 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_it -o conllx 852 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 853 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 854 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "it:" 855 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_it_clustermap 856 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_it_arcstdswap 857 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_it_conllx_lc_coarse_langprefix_clusterid 858 | cat $treebank_train_it_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_it_arcstdswap_nolabel 859 | 860 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_train_sv -o conllx 861 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 862 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 863 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "sv:" 864 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_train_sv_clustermap 865 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_train_sv_arcstdswap 866 | cp conllx.lc.coarse.langprefix.clusterid treebank_train_sv_conllx_lc_coarse_langprefix_clusterid 867 | cat $treebank_train_sv_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_train_sv_arcstdswap_nolabel 868 | 869 | ########################### 870 | # process dev treebanks 871 | 872 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_cs -o conllx 873 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 874 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 875 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "cs:" 876 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_cs_clustermap 877 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_cs_arcstdswap 878 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_cs_conllx_lc_coarse_langprefix_clusterid 879 | cat $treebank_dev_cs_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_cs_arcstdswap_nolabel 880 | 881 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_de -o conllx 882 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 883 | python 
$wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 884 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "de:" 885 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_de_clustermap 886 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_de_arcstdswap 887 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_de_conllx_lc_coarse_langprefix_clusterid 888 | cat $treebank_dev_de_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_de_arcstdswap_nolabel 889 | 890 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_en -o conllx 891 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 892 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 893 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "en:" 894 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_en_clustermap 895 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_en_arcstdswap 896 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_en_conllx_lc_coarse_langprefix_clusterid 897 | cat $treebank_dev_en_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_en_arcstdswap_nolabel 898 | 899 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_es -o conllx 900 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 901 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 902 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "es:" 903 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_es_clustermap 904 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_es_arcstdswap 905 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_es_conllx_lc_coarse_langprefix_clusterid 906 | cat $treebank_dev_es_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_es_arcstdswap_nolabel 907 | 908 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_fi -o conllx 909 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 910 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 911 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fi:" 912 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_fi_clustermap 913 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_fi_arcstdswap 914 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_fi_conllx_lc_coarse_langprefix_clusterid 915 | cat $treebank_dev_fi_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > 
treebank_dev_fi_arcstdswap_nolabel 916 | 917 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_fr -o conllx 918 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 919 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 920 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fr:" 921 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_fr_clustermap 922 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_fr_arcstdswap 923 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_fr_conllx_lc_coarse_langprefix_clusterid 924 | cat $treebank_dev_fr_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_fr_arcstdswap_nolabel 925 | 926 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_ga -o conllx 927 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 928 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 929 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "ga:" 930 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_ga_clustermap 931 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_ga_arcstdswap 932 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_ga_conllx_lc_coarse_langprefix_clusterid 933 | cat $treebank_dev_ga_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_ga_arcstdswap_nolabel 934 | 935 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_hu -o conllx 936 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 937 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 938 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "hu:" 939 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_hu_clustermap 940 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_hu_arcstdswap 941 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_hu_conllx_lc_coarse_langprefix_clusterid 942 | cat $treebank_dev_hu_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_hu_arcstdswap_nolabel 943 | 944 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_it -o conllx 945 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 946 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 947 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "it:" 948 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_it_clustermap 949 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_it_arcstdswap 
950 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_it_conllx_lc_coarse_langprefix_clusterid 951 | cat $treebank_dev_it_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_it_arcstdswap_nolabel 952 | 953 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_dev_sv -o conllx 954 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 955 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 956 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "sv:" 957 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_dev_sv_clustermap 958 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_dev_sv_arcstdswap 959 | cp conllx.lc.coarse.langprefix.clusterid $treebank_dev_sv_conllx_lc_coarse_langprefix_clusterid 960 | cat $treebank_dev_sv_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_dev_sv_arcstdswap_nolabel 961 | 962 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_cs -o conllx 963 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 964 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 965 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "cs:" 966 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_cs_clustermap 967 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_cs_arcstdswap 968 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_cs_conllx_lc_coarse_langprefix_clusterid 969 | cat $treebank_test_cs_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_cs_arcstdswap_nolabel 970 | 971 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_de -o conllx 972 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 973 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 974 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "de:" 975 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_de_clustermap 976 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_de_arcstdswap 977 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_de_conllx_lc_coarse_langprefix_clusterid 978 | cat $treebank_test_de_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_de_arcstdswap_nolabel 979 | 980 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_en -o conllx 981 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 982 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 983 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "en:" 984 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p 
-c treebank_test_en_clustermap 985 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_en_arcstdswap 986 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_en_conllx_lc_coarse_langprefix_clusterid 987 | cat $treebank_test_en_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_en_arcstdswap_nolabel 988 | 989 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_es -o conllx 990 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 991 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 992 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "es:" 993 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_es_clustermap 994 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_es_arcstdswap 995 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_es_conllx_lc_coarse_langprefix_clusterid 996 | cat $treebank_test_es_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_es_arcstdswap_nolabel 997 | 998 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_fi -o conllx 999 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1000 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1001 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fi:" 1002 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_fi_clustermap 1003 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_fi_arcstdswap 1004 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_fi_conllx_lc_coarse_langprefix_clusterid 1005 | cat $treebank_test_fi_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_fi_arcstdswap_nolabel 1006 | 1007 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_fr -o conllx 1008 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1009 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1010 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "fr:" 1011 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_fr_clustermap 1012 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_fr_arcstdswap 1013 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_fr_conllx_lc_coarse_langprefix_clusterid 1014 | cat $treebank_test_fr_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_fr_arcstdswap_nolabel 1015 | 1016 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_ga -o conllx 1017 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1018 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1019 | python 
$wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "ga:" 1020 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_ga_clustermap 1021 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_ga_arcstdswap 1022 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_ga_conllx_lc_coarse_langprefix_clusterid 1023 | cat $treebank_test_ga_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_ga_arcstdswap_nolabel 1024 | 1025 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_hu -o conllx 1026 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1027 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1028 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "hu:" 1029 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_hu_clustermap 1030 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_hu_arcstdswap 1031 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_hu_conllx_lc_coarse_langprefix_clusterid 1032 | cat $treebank_test_hu_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_hu_arcstdswap_nolabel 1033 | 1034 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_it -o conllx 1035 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1036 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1037 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "it:" 1038 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_it_clustermap 1039 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_it_arcstdswap 1040 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_it_conllx_lc_coarse_langprefix_clusterid 1041 | cat $treebank_test_it_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_it_arcstdswap_nolabel 1042 | 1043 | python $wammar_utils/convert-conllu-to-conll06.py -i $treebank_test_sv -o conllx 1044 | python $wammar_utils/lowercase.py -i conllx -o conllx.lc 1045 | python $wammar_utils/swap-conll-columns.py -i conllx.lc -o conllx.lc.coarse -j 4 -k 5 1046 | python $wammar_utils/add-prefix-to-conll-column.py -i conllx.lc.coarse -o conllx.lc.coarse.langprefix -j 2 -p "sv:" 1047 | python $wammar_utils/replace-words-in-conll-corpus.py -i conllx.lc.coarse.langprefix -o conllx.lc.coarse.langprefix.clusterid -d $word_clusters -p -c treebank_test_sv_clustermap 1048 | java -jar ParseOracleArcStdSwap.jar -t -1 -l 1 -c conllx.lc.coarse.langprefix.clusterid -i conllx.lc.coarse.langprefix.clusterid > $treebank_test_sv_arcstdswap 1049 | cp conllx.lc.coarse.langprefix.clusterid $treebank_test_sv_conllx_lc_coarse_langprefix_clusterid 1050 | cat $treebank_test_sv_arcstdswap | sed -r 's/-ARC\(.*\)$/-ARC(default)/' > treebank_test_sv_arcstdswap_nolabel 1051 | 1052 | # create 
global cluster map 1053 | cat *_clustermap > $global_clustermap 1054 | } 1055 | 1056 | # Jiang Guo used a simple heuristic, which he calls PROJ, for inducing multilingual word embeddings. 1057 | # First, he initialized English (monolingual) word embeddings using word2vec. 1058 | # Then he tuned the English embeddings towards the training objective of the English parser. 1059 | # Then he computed the embeddings of words in other languages in the same (English) vector space by computing, 1060 | # for each non-English word, the weighted average embedding of translationally-equivalent English words. 1061 | # Until Jiang makes the code for inducing these embeddings available, we can only compare to the 1062 | # final embeddings he obtained. 1063 | # arrow #11 1064 | task PrepareJiangAcl15ProjEmbeddings 1065 | :: jiangacl15_proj_en=@ 1066 | :: jiangacl15_proj_es=@ 1067 | :: jiangacl15_proj_fr=@ 1068 | :: jiangacl15_proj_de=@ 1069 | :: wammar_utils=@ 1070 | > embeddings 1071 | { 1072 | 1073 | python $wammar_utils/prefix.py -i $jiangacl15_proj_en -o jiangacl15_proj_en.langprefix -p "en:" 1074 | python $wammar_utils/prefix.py -i $jiangacl15_proj_es -o jiangacl15_proj_es.langprefix -p "es:" 1075 | python $wammar_utils/prefix.py -i $jiangacl15_proj_fr -o jiangacl15_proj_fr.langprefix -p "fr:" 1076 | python $wammar_utils/prefix.py -i $jiangacl15_proj_de -o jiangacl15_proj_de.langprefix -p "de:" 1077 | cat jiangacl15_proj_en.langprefix jiangacl15_proj_es.langprefix jiangacl15_proj_fr.langprefix jiangacl15_proj_de.langprefix > $embeddings 1078 | rm jiangacl15_proj_* 1079 | } 1080 | 1081 | # Using bilingual CCA (Faruqui and Dyer 2014), Jiang Guo obtained embeddings for three language pairs. 1082 | # This task just puts each of these embeddings in the same format as our home-grown multilingual embeddings. 1083 | # Note that these embeddings are only good for transferring from one source language to one target language. 1084 | # arrow #11(b) 1085 | task PrepareJiangAcl15CcaEmbeddings 1086 | :: jiangacl15_cca_ende_en=@ 1087 | :: jiangacl15_cca_ende_de=@ 1088 | :: jiangacl15_cca_enfr_en=@ 1089 | :: jiangacl15_cca_enfr_fr=@ 1090 | :: jiangacl15_cca_enes_en=@ 1091 | :: jiangacl15_cca_enes_es=@ 1092 | :: wammar_utils=@ 1093 | > ende_embeddings 1094 | > enfr_embeddings 1095 | > enes_embeddings 1096 | { 1097 | python $wammar_utils/prefix.py -i $jiangacl15_cca_ende_en -o jiangacl15_cca_ende_en.langprefix -p "en:" 1098 | python $wammar_utils/prefix.py -i $jiangacl15_cca_ende_de -o jiangacl15_cca_ende_de.langprefix -p "de:" 1099 | cat jiangacl15_cca_ende_en.langprefix jiangacl15_cca_ende_de.langprefix > $ende_embeddings 1100 | python $wammar_utils/prefix.py -i $jiangacl15_cca_enfr_en -o jiangacl15_cca_enfr_en.langprefix -p "en:" 1101 | python $wammar_utils/prefix.py -i $jiangacl15_cca_enfr_fr -o jiangacl15_cca_enfr_fr.langprefix -p "fr:" 1102 | cat jiangacl15_cca_enfr_en.langprefix jiangacl15_cca_enfr_fr.langprefix > $enfr_embeddings 1103 | python $wammar_utils/prefix.py -i $jiangacl15_cca_enes_en -o jiangacl15_cca_enes_en.langprefix -p "en:" 1104 | python $wammar_utils/prefix.py -i $jiangacl15_cca_enes_es -o jiangacl15_cca_enes_es.langprefix -p "es:" 1105 | cat jiangacl15_cca_enes_en.langprefix jiangacl15_cca_enes_es.langprefix > $enes_embeddings 1106 | rm jiangacl15_cca_* 1107 | } 1108 | 
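# A minimal sketch of Guo's PROJ heuristic described above (task PrepareJiangAcl15ProjEmbeddings): embed each
# non-English word as the weighted average of the English vectors of its translations. The helper below is
# hypothetical (Guo's code is unavailable); translation weights would typically come from word alignment
# probabilities, and dim=100 matches the --pretrained_dim used elsewhere in this workflow.

def proj_embedding(translations, en_vecs, dim=100):
    """translations: {english_word: weight}; en_vecs: {english_word: list of dim floats}."""
    acc, total = [0.0] * dim, 0.0
    for en_word, weight in translations.items():
        vec = en_vecs.get(en_word)
        if vec is None:
            continue  # skip translations without a pretrained English vector
        acc = [a + weight * v for a, v in zip(acc, vec)]
        total += weight
    # dividing by the total weight makes this a proper weighted average (a centroid
    # in the English embedding space when the weights are normalized)
    return [a / total for a in acc] if total > 0 else acc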
1109 | # First replace each word in the monolingual corpora with the string that identifies its multilingual word cluster. 1110 | # Then concatenate all monolingual corpora into the same file and train incremental_word2vec structured skipgram embeddings for 1111 | # the word clusters. Finally, for each cluster of size K words, repeat its embedding K times, once for each of the 1112 | # K words. 1113 | # arrow #7, #8, #9, and #10 1114 | task TrainEmbeddingsOfWordClusters 1115 | :: corpus_cs=@ 1116 | :: corpus_de=@ 1117 | :: corpus_en=@ 1118 | :: corpus_es=@ 1119 | :: corpus_fi=@ 1120 | :: corpus_fr=@ 1121 | :: corpus_ga=@ 1122 | :: corpus_hu=@ 1123 | :: corpus_it=@ 1124 | :: corpus_sv=@ 1125 | :: wammar_utils=@ 1126 | :: incremental_word2vec=@ 1127 | :: cdec=@ 1128 | < word_clusters=$word_clusters@MakeMultilingualWordClusters 1129 | > cluster_embeddings 1130 | > augmented_word_clusters 1131 | :: pretrained_dim=@ 1132 | { 1133 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d $word_clusters -l "cs:" -i $corpus_cs -o corpus_cs.langprefix -od word_clusters_cs 1134 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs -l "de:" -i $corpus_de -o corpus_de.langprefix -od word_clusters_cs+de 1135 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de -l "en:" -i $corpus_en -o corpus_en.langprefix -od word_clusters_cs+de+en 1136 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en -l "es:" -i $corpus_es -o corpus_es.langprefix -od word_clusters_cs+de+en+es 1137 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es -l "fi:" -i $corpus_fi -o corpus_fi.langprefix -od word_clusters_cs+de+en+es+fi 1138 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es+fi -l "fr:" -i $corpus_fr -o corpus_fr.langprefix -od word_clusters_cs+de+en+es+fi+fr 1139 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es+fi+fr -l "ga:" -i $corpus_ga -o corpus_ga.langprefix -od word_clusters_cs+de+en+es+fi+fr+ga 1140 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es+fi+fr+ga -l "hu:" -i $corpus_hu -o corpus_hu.langprefix -od word_clusters_cs+de+en+es+fi+fr+ga+hu 1141 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es+fi+fr+ga+hu -l "it:" -i $corpus_it -o corpus_it.langprefix -od word_clusters_cs+de+en+es+fi+fr+ga+hu+it 1142 | python $wammar_utils/replace-words-in-monolingual-corpus.py -d word_clusters_cs+de+en+es+fi+fr+ga+hu+it -l "sv:" -i $corpus_sv -o corpus_sv.langprefix -od $augmented_word_clusters 1143 | cat corpus_cs.langprefix corpus_de.langprefix corpus_en.langprefix corpus_es.langprefix corpus_fi.langprefix corpus_fr.langprefix corpus_ga.langprefix corpus_hu.langprefix corpus_it.langprefix corpus_sv.langprefix > corpus.langprefix 1144 | 1145 | # train cluster embeddings for all languages 1146 | $incremental_word2vec/word2vec -train corpus.langprefix -min-count 40 -output $cluster_embeddings -threads 32 -size $pretrained_dim -iter 20 1147 | 1148 | } 1149 | 1150 | # combine all bilingual dictionaries made available by the tasks MakeBilingualDictionaries* 1151 | # and compute the transitive closure of translationally equivalent words across languages. 1152 | # For example, if "dog ||| chien" and "dog ||| male" appear in *.en-fr, and "dog ||| hund" appears 1153 | # in *.en-de, the resulting cluster may look like: 1154 | # en:dog_|_fr:chien_|_fr:male_|_de:hund, which is written in four separate lines in the output 1155 | # file as follows: 1156 | # en:dog ||| en:dog_|_fr:chien_|_fr:male_|_de:hund 1157 | # fr:chien ||| en:dog_|_fr:chien_|_fr:male_|_de:hund 1158 | # fr:male ||| en:dog_|_fr:chien_|_fr:male_|_de:hund 1159 | # de:hund ||| en:dog_|_fr:chien_|_fr:male_|_de:hund 1160 | # Everything should be lowercased in the output file. To avoid catastrophically large closures, 1161 | # we filter out some items from the bilingual dictionaries.
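# A minimal union-find sketch of the closure computation described above. This is an illustration only,
# not the actual map-words-to-transitive-closures.py: it assumes each input file is named like
# "wiktionary.en-fr" and holds "src ||| tgt" lines, and it omits the dictionary filtering mentioned above.

import sys
from collections import defaultdict

parent = {}

def find(word):
    parent.setdefault(word, word)
    while parent[word] != word:
        parent[word] = parent[parent[word]]  # path halving keeps lookups near-constant
        word = parent[word]
    return word

for path in sys.argv[1:]:
    src_lang, tgt_lang = path.rsplit('.', 1)[-1].split('-')  # e.g. "wiktionary.en-fr" -> en, fr
    with open(path) as f:
        for line in f:
            fields = line.strip().lower().split(' ||| ')
            if len(fields) != 2:
                continue
            src, tgt = src_lang + ':' + fields[0], tgt_lang + ':' + fields[1]
            root_src, root_tgt = find(src), find(tgt)
            if root_src != root_tgt:
                parent[root_src] = root_tgt  # merge the two translation clusters

# group words by their cluster root and emit one "word ||| cluster" line per member
clusters = defaultdict(list)
for word in parent:
    clusters[find(word)].append(word)
for members in clusters.values():
    cluster_id = '_|_'.join(sorted(members))  # member order within the cluster id is arbitrary
    for word in sorted(members):
        print(word + ' ||| ' + cluster_id)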
1162 | # arrow #4, #5, #18 1163 | task MakeMultilingualWordClusters 1164 | < wiktionary_bi_dicts=$wiktionary_bi_dicts@MakeBilingualDictionariesFromWiktionary 1165 | < parallel_bi_dicts=$parallel_bi_dicts@MakeBilingualDictionariesFromBitext 1166 | < jiangacl15_bi_dicts=$jiangacl15_bi_dicts@MakeBilingualDictionariesFromJiangAcl15 1167 | :: wammar_utils=@ 1168 | # > punctuation_clusters 1169 | > word_clusters 1170 | { 1171 | python $wammar_utils/map-words-to-transitive-closures.py -i $wiktionary_bi_dicts/wiktionary.* $parallel_bi_dicts/parallel.* $jiangacl15_bi_dicts/jiangacl15.* -o $word_clusters 1172 | # python $wammar_utils/generate-crosslingual-punctuation-mappings.py -o $punctuation_clusters 1173 | # cat $punctuation_clusters word_clusters_without_punc > $word_clusters 1174 | # rm word_clusters_without_punc 1175 | } 1176 | 1177 | # read the wiktionary translations dump, extract word pair translations for each language pair 1178 | # (with 10 languages, there are 45 language pairs), and write a bilingual dictionary file for each pair. 1179 | # the output directory wiktionary_bi_dicts should contain files named like wiktionary.en-fr, which 1180 | # contain lines such as "dog ||| chien". Bilingual dictionaries should be lowercased. 1181 | # arrow #1 1182 | task MakeBilingualDictionariesFromWiktionary 1183 | :: wiktionary_dump=@ 1184 | :: wammar_utils=@ 1185 | > wiktionary_bi_dicts 1186 | { 1187 | python $wammar_utils/wiktionary-multilingual-to-bilingual-dictionaries.py -rawfile $wiktionary_dump -outdir $wiktionary_bi_dicts -langs "czech|german|english|spanish|finnish|french|irish|hungarian|italian|swedish" -lowercase 1188 | } 1189 | 1190 | # Inputs are the alignment files Jiang Guo used to induce word embeddings. This is convenient because it lets us 1191 | # compare against his multilingual embeddings using the same word pairs. This task just does some filtering and 1192 | # reformats the bilingual dictionary files to be consistent with other sources of bilingual dictionaries. The output 1193 | # file jiangacl15.fr-en should contain lines such as "chien ||| dog" 1194 | # arrow #19 1195 | task MakeBilingualDictionariesFromJiangAcl15 1196 | :: jiangacl15_alignments_de_en=@ 1197 | :: jiangacl15_alignments_es_en=@ 1198 | :: jiangacl15_alignments_fr_en=@ 1199 | :: wammar_utils=@ 1200 | > jiangacl15_bi_dicts 1201 | { 1202 | mkdir $jiangacl15_bi_dicts 1203 | python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_de_en -o $jiangacl15_bi_dicts/jiangacl15.de-en 1204 | python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_es_en -o $jiangacl15_bi_dicts/jiangacl15.es-en 1205 | python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_fr_en -o $jiangacl15_bi_dicts/jiangacl15.fr-en 1206 | } 1207 | 1208 | # read in parallel data (tokenized and lowercased). Process the dump files necessary for extracting
# read the wiktionary translations dump, extract word pair translations for each language pair
# (with 10 languages, there are 45 language pairs), and write a bilingual dictionary file for
# each pair. The output directory wiktionary_bi_dicts should contain files titled similar to
# wiktionary.en-fr which contain lines such as "dog ||| chien". Bilingual dictionaries should
# be lowercased.
# arrow #1
task MakeBilingualDictionariesFromWiktionary
  :: wiktionary_dump=@
  :: wammar_utils=@
  > wiktionary_bi_dicts
{
  # ensure the output directory exists (the other dictionary tasks create theirs explicitly)
  mkdir -p $wiktionary_bi_dicts
  python $wammar_utils/wiktionary-multilingual-to-bilingual-dictionaries.py -rawfile $wiktionary_dump -outdir $wiktionary_bi_dicts -langs "czech|german|english|spanish|finnish|french|irish|hungarian|italian|swedish" -lowercase
}

# Inputs are the alignment files Jiang Guo used to induce word embeddings. This is nice because
# it lets us compare against his multilingual embeddings, which are based on the same word pairs.
# This task just does some filtering and reformats the bilingual dictionary files to be consistent
# with other sources of bilingual dictionaries. The output file jiangacl15.fr-en should contain
# "chien ||| dog".
# arrow #19
task MakeBilingualDictionariesFromJiangAcl15
  :: jiangacl15_alignments_de_en=@
  :: jiangacl15_alignments_es_en=@
  :: jiangacl15_alignments_fr_en=@
  :: wammar_utils=@
  > jiangacl15_bi_dicts
{
  mkdir $jiangacl15_bi_dicts
  python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_de_en -o $jiangacl15_bi_dicts/jiangacl15.de-en
  python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_es_en -o $jiangacl15_bi_dicts/jiangacl15.es-en
  python $wammar_utils/convert-to-one-target-per-line.py -i $jiangacl15_alignments_fr_en -o $jiangacl15_bi_dicts/jiangacl15.fr-en
}

# read in parallel data (tokenized and lowercased), process the wikipedia dump files to extract
# parallel titles, and append those titles to the parallel data. Then align the aggregate
# (parallel+wikipedia) corpus to obtain high quality word pair translations.
# The output file parallel_bi_dicts/parallel.fr-en should contain lines that look like "chien ||| dog".
# arrow #6 and #2
task MakeBilingualDictionariesFromBitext
  :: bitext_cs_en=@
  :: bitext_de_en=@
  :: bitext_es_en=@
  :: bitext_fi_en=@
  :: bitext_fr_en=@
  :: bitext_ga_en=@
  :: bitext_hu_en=@
  :: bitext_it_en=@
  :: bitext_sv_en=@
  :: wikipedia_parallel_titles=@
  :: wikipedia_base_cs=@
  :: wikipedia_base_de=@
  :: wikipedia_base_es=@
  :: wikipedia_base_fi=@
  :: wikipedia_base_fr=@
  :: wikipedia_base_ga=@
  :: wikipedia_base_hu=@
  :: wikipedia_base_it=@
  :: wikipedia_base_sv=@
  :: wammar_utils=@
  :: fast_align=@
  :: cdec=@
  > parallel_bi_dicts
{
  mkdir $parallel_bi_dicts

  # extract parallel titles from wikipedia
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_cs > wikipedia_titles.cs-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_de > wikipedia_titles.de-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_es > wikipedia_titles.es-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_fi > wikipedia_titles.fi-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_fr > wikipedia_titles.fr-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_ga > wikipedia_titles.ga-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_hu > wikipedia_titles.hu-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_it > wikipedia_titles.it-en
  $wikipedia_parallel_titles/build-corpus.sh en $wikipedia_base_sv > wikipedia_titles.sv-en

  # tokenize and lowercase
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.cs-en > wikipedia_titles.tok.cs-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.cs-en -o wikipedia_titles.tok.lc.cs-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.de-en > wikipedia_titles.tok.de-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.de-en -o wikipedia_titles.tok.lc.de-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.es-en > wikipedia_titles.tok.es-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.es-en -o wikipedia_titles.tok.lc.es-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.fi-en > wikipedia_titles.tok.fi-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.fi-en -o wikipedia_titles.tok.lc.fi-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.fr-en > wikipedia_titles.tok.fr-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.fr-en -o wikipedia_titles.tok.lc.fr-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.ga-en > wikipedia_titles.tok.ga-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.ga-en -o wikipedia_titles.tok.lc.ga-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.hu-en > wikipedia_titles.tok.hu-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.hu-en -o wikipedia_titles.tok.lc.hu-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.it-en > wikipedia_titles.tok.it-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.it-en -o wikipedia_titles.tok.lc.it-en
  python $cdec/corpus/tokenize-parallel.py wikipedia_titles.sv-en > wikipedia_titles.tok.sv-en
  python $wammar_utils/lowercase.py -i wikipedia_titles.tok.sv-en -o wikipedia_titles.tok.lc.sv-en

  # aggregate parallel data of each language
  cat wikipedia_titles.tok.lc.cs-en $bitext_cs_en > aggregate.cs-en
  cat wikipedia_titles.tok.lc.de-en $bitext_de_en > aggregate.de-en
  cat wikipedia_titles.tok.lc.es-en $bitext_es_en > aggregate.es-en
  cat wikipedia_titles.tok.lc.fi-en $bitext_fi_en > aggregate.fi-en
  cat wikipedia_titles.tok.lc.fr-en $bitext_fr_en > aggregate.fr-en
  cat wikipedia_titles.tok.lc.ga-en $bitext_ga_en > aggregate.ga-en
  cat wikipedia_titles.tok.lc.hu-en $bitext_hu_en > aggregate.hu-en
  cat wikipedia_titles.tok.lc.it-en $bitext_it_en > aggregate.it-en
  cat wikipedia_titles.tok.lc.sv-en $bitext_sv_en > aggregate.sv-en

  # filter the aggregate parallel data by length, align it in both directions, then filter the alignment parameters
  $cdec/corpus/filter-length.pl aggregate.cs-en > aggregate.filtered.cs-en
  $fast_align/fast_align -i aggregate.filtered.cs-en -v -d -o -c params.cs-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.cs-en -v -d -o -r -c params.en-cs >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.cs-en -ir params.en-cs -d $parallel_bi_dicts/parallel.cs-en

  $cdec/corpus/filter-length.pl aggregate.de-en > aggregate.filtered.de-en
  $fast_align/fast_align -i aggregate.filtered.de-en -v -d -o -c params.de-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.de-en -v -d -o -r -c params.en-de >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.de-en -ir params.en-de -d $parallel_bi_dicts/parallel.de-en

  $cdec/corpus/filter-length.pl aggregate.es-en > aggregate.filtered.es-en
  $fast_align/fast_align -i aggregate.filtered.es-en -v -d -o -c params.es-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.es-en -v -d -o -r -c params.en-es >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.es-en -ir params.en-es -d $parallel_bi_dicts/parallel.es-en

  $cdec/corpus/filter-length.pl aggregate.fi-en > aggregate.filtered.fi-en
  $fast_align/fast_align -i aggregate.filtered.fi-en -v -d -o -c params.fi-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.fi-en -v -d -o -r -c params.en-fi >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.fi-en -ir params.en-fi -d $parallel_bi_dicts/parallel.fi-en

  $cdec/corpus/filter-length.pl aggregate.fr-en > aggregate.filtered.fr-en
  $fast_align/fast_align -i aggregate.filtered.fr-en -v -d -o -c params.fr-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.fr-en -v -d -o -r -c params.en-fr >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.fr-en -ir params.en-fr -d $parallel_bi_dicts/parallel.fr-en

  $cdec/corpus/filter-length.pl aggregate.ga-en > aggregate.filtered.ga-en
  $fast_align/fast_align -i aggregate.filtered.ga-en -v -d -o -c params.ga-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.ga-en -v -d -o -r -c params.en-ga >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.ga-en -ir params.en-ga -d $parallel_bi_dicts/parallel.ga-en
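
  # NOTE: filter-word-alignment-parameters.py is not included in this dump; the following is a
  # minimal sketch of what the filtering could look like -- an assumption, not the actual script.
  # It keeps a word pair only when its translation score passes the threshold in BOTH alignment
  # directions, assuming the files written via fast_align's -c option hold one
  # "src_word trg_word score" triple per line, with the score being a probability:
  #
  #   def load_params(path):
  #       table = {}
  #       for line in open(path):
  #           src, trg, score = line.split()
  #           table[(src, trg)] = float(score)
  #       return table
  #
  #   forward = load_params("params.ga-en")
  #   reverse = load_params("params.en-ga")  # the field order in each direction is also an assumption
  #   with open("parallel.ga-en", "w") as out:
  #       for (src, trg), score in forward.items():
  #           if score > 0.05 and reverse.get((trg, src), 0.0) > 0.05:
  #               out.write(src + " ||| " + trg + "\n")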

  $cdec/corpus/filter-length.pl aggregate.hu-en > aggregate.filtered.hu-en
  $fast_align/fast_align -i aggregate.filtered.hu-en -v -d -o -c params.hu-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.hu-en -v -d -o -r -c params.en-hu >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.hu-en -ir params.en-hu -d $parallel_bi_dicts/parallel.hu-en

  $cdec/corpus/filter-length.pl aggregate.it-en > aggregate.filtered.it-en
  $fast_align/fast_align -i aggregate.filtered.it-en -v -d -o -c params.it-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.it-en -v -d -o -r -c params.en-it >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.it-en -ir params.en-it -d $parallel_bi_dicts/parallel.it-en

  $cdec/corpus/filter-length.pl aggregate.sv-en > aggregate.filtered.sv-en
  $fast_align/fast_align -i aggregate.filtered.sv-en -v -d -o -c params.sv-en >/dev/null
  $fast_align/fast_align -i aggregate.filtered.sv-en -v -d -o -r -c params.en-sv >/dev/null
  python $wammar_utils/filter-word-alignment-parameters.py -t 0.05 -if params.sv-en -ir params.en-sv -d $parallel_bi_dicts/parallel.sv-en

  # remove intermediate files
  rm aggregate.filtered.cs-en aggregate.cs-en wikipedia_titles.tok.lc.cs-en wikipedia_titles.tok.cs-en wikipedia_titles.cs-en params.cs-en params.en-cs
  rm aggregate.filtered.de-en aggregate.de-en wikipedia_titles.tok.lc.de-en wikipedia_titles.tok.de-en wikipedia_titles.de-en params.de-en params.en-de
  rm aggregate.filtered.es-en aggregate.es-en wikipedia_titles.tok.lc.es-en wikipedia_titles.tok.es-en wikipedia_titles.es-en params.es-en params.en-es
  rm aggregate.filtered.fi-en aggregate.fi-en wikipedia_titles.tok.lc.fi-en wikipedia_titles.tok.fi-en wikipedia_titles.fi-en params.fi-en params.en-fi
  rm aggregate.filtered.fr-en aggregate.fr-en wikipedia_titles.tok.lc.fr-en wikipedia_titles.tok.fr-en wikipedia_titles.fr-en params.fr-en params.en-fr
  rm aggregate.filtered.ga-en aggregate.ga-en wikipedia_titles.tok.lc.ga-en wikipedia_titles.tok.ga-en wikipedia_titles.ga-en params.ga-en params.en-ga
  rm aggregate.filtered.hu-en aggregate.hu-en wikipedia_titles.tok.lc.hu-en wikipedia_titles.tok.hu-en wikipedia_titles.hu-en params.hu-en params.en-hu
  rm aggregate.filtered.it-en aggregate.it-en wikipedia_titles.tok.lc.it-en wikipedia_titles.tok.it-en wikipedia_titles.it-en params.it-en params.en-it
  rm aggregate.filtered.sv-en aggregate.sv-en wikipedia_titles.tok.lc.sv-en wikipedia_titles.tok.sv-en wikipedia_titles.sv-en params.sv-en params.en-sv
}
--------------------------------------------------------------------------------
/typological_properties/typological_properties-almostall.txt:
--------------------------------------------------------------------------------
1 | cs 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0
2 | de 1 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 
1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 3 | en 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 4 | es 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 5 | fr 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 6 | fi 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 7 | ga 0 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 8 | hu 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 9 | it 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 10 | sv 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 1 0 1 0 0 1 0 1 0 1 2 | de 0 1 0 1 0 0 1 0 1 0 1 0 1 3 | en 1 0 0 1 0 1 0 0 1 0 1 0 1 4 | es 1 0 0 1 0 0 1 0 0 1 1 0 1 5 | fr 1 0 0 1 0 0 1 0 0 1 1 0 1 6 | fi 1 0 0 0 1 0 0 1 1 0 1 0 1 7 | ga 0 0 1 1 0 0 1 0 0 1 0 1 1 8 | hu 0 1 0 0 1 0 0 1 1 0 1 0 1 9 | it 1 0 0 1 0 0 1 0 0 1 1 0 1 10 | sv 1 0 0 1 0 0 0 1 1 0 1 0 1 11 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.all: -------------------------------------------------------------------------------- 1 | cs 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 
1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 2 | de 1 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 3 | en 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 0 0 1 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 4 | es 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 5 | fr 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 6 | fi 1 1 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 7 | ga 0 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 8 | hu 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 9 | it 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 10 | sv 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.all.sept15: -------------------------------------------------------------------------------- 1 | cs 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 0 2 | de 1 0 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 1 0 3 | en 1 0 0 1 0 0 0 1 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 1 
0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 4 | es 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 5 | fi 1 1 0 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 6 | fr 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 7 | ga 1 1 1 0 0 1 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 8 | hu 1 1 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 9 | it 1 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 10 | sv 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 0 11 | __ 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.first22: -------------------------------------------------------------------------------- 1 | fr 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 2 | EN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3 | en 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 4 | de 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 5 | ES 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6 | es 1.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 1.0 7 | cs 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 8 | fi 1.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 1.0 9 | ga 0.0 1.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 0.5 0.5 10 | hu 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 11 | it 1.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 12 | sv 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 1.0 1.0 0.0 13 | __ 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 14 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.naseem12: -------------------------------------------------------------------------------- 1 | cs 1 0 0 1 0 1 0 0 1 0 1 0 1 2 | de 0 1 0 1 0 0 1 0 1 0 1 0 1 3 | en 1 0 0 1 0 1 0 0 1 0 1 0 1 4 | es 1 0 0 1 0 0 1 0 0 1 1 0 1 5 | fr 1 0 0 1 0 0 1 0 0 1 1 0 1 6 | fi 1 0 0 0 1 0 0 1 1 0 1 0 1 7 | ga 0 0 1 1 0 0 1 0 0 1 0 1 1 8 | hu 0 1 0 0 1 0 0 1 1 0 1 0 1 9 | it 1 0 0 1 0 0 1 0 0 1 1 0 1 10 | sv 1 0 0 1 0 0 0 1 1 0 1 0 1 11 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.pat_genus_avg: -------------------------------------------------------------------------------- 1 | de -1 1 -1 1 -1 0 0 -1 0.7 -1 0 -1 -1 -0.7 -1 -1 -1 -1 -1 1 -1 -1 1 1 1 1 0 1 1 0 1 -1 0.25 0.5 1 -1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -1 -1 -1 -1 -1 1 -1 -1 1 -1 -1 -1 1 1 0.7 -0.3 -1 -1 1 1 -1 -1 0.3 -1 -1 1 1 0.3 -1 -1 -1 -1 -0.3 1 1 -1 -1 0.3 1 1 1 -1 -1 -0.3 -1 0 0 0 -0.3 -1 1 -1 1 -1 1 1 1 1 -1 -1 1 0 0 0 2 | en -1 1 -1 1 -1 0 0 -1 0.7 -1 0 -1 -1 -0.7 -1 -1 -1 -1 -1 1 -1 -1 1 1 1 1 0 1 1 0 1 -1 0.25 0.5 1 -1 
1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -1 -1 -1 -1 -1 1 -1 -1 1 -1 -1 -1 1 1 0.7 -0.3 -1 -1 1 1 -1 -1 0.3 -1 -1 1 1 0.3 -1 -1 -1 -1 -0.3 1 1 -1 -1 0.3 1 1 1 -1 -1 -0.3 -1 0 0 0 -0.3 -1 1 -1 1 -1 1 1 1 1 -1 -1 1 0 0 0 3 | es -1 1 -1 1 -1 -0.6 0.6 -1 1 -1 -0.6 -0.6 -1 -1 -1 -1 -1 1 1 1 0 -1 1 1 1 1 1 1 1 0 1 -1 -1 1 -1 1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -0.2 -1 -1 -0.2 -1 1 -1 -1 -0.6 0.6 -1 -1 1 0.7 -0.25 0.7 -0.7 -1 1 1 -1 -1 -1 -1 -1 1 0.6 1 -1 -1 -1 -1 -0.6 1 1 -1 0 -0.6 1 1 1 -1 -1 -0.6 -1 -1 1 -1 -1 0 0 -1 -1 -1 1 1 0 -1 -1 -1 1 0 0 0 4 | fr -1 1 -1 1 -1 -0.6 0.6 -1 1 -1 -0.6 -0.6 -1 -1 -1 -1 -1 1 1 1 0 -1 1 1 1 1 1 1 1 0 1 -1 -1 1 -1 1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -0.2 -1 -1 -0.2 -1 1 -1 -1 -0.6 0.6 -1 -1 1 0.7 -0.25 0.7 -0.7 -1 1 1 -1 -1 -1 -1 -1 1 0.6 1 -1 -1 -1 -1 -0.6 1 1 -1 0 -0.6 1 1 1 -1 -1 -0.6 -1 -1 1 -1 -1 0 0 -1 -1 -1 1 1 0 -1 -1 -1 1 0 0 0 5 | it -1 1 -1 1 -1 -0.6 0.6 -1 1 -1 -0.6 -0.6 -1 -1 -1 -1 -1 1 1 1 0 -1 1 1 1 1 1 1 1 0 1 -1 -1 1 -1 1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -0.2 -1 -1 -0.2 -1 1 -1 -1 -0.6 0.6 -1 -1 1 0.7 -0.25 0.7 -0.7 -1 1 1 -1 -1 -1 -1 -1 1 0.6 1 -1 -1 -1 -1 -0.6 1 1 -1 0 -0.6 1 1 1 -1 -1 -0.6 -1 -1 1 -1 -1 0 0 -1 -1 -1 1 1 0 -1 -1 -1 1 0 0 0 6 | pt -1 1 -1 1 -1 -0.6 0.6 -1 1 -1 -0.6 -0.6 -1 -1 -1 -1 -1 1 1 1 0 -1 1 1 1 1 1 1 1 0 1 -1 -1 1 -1 1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -0.2 -1 -1 -0.2 -1 1 -1 -1 -0.6 0.6 -1 -1 1 0.7 -0.25 0.7 -0.7 -1 1 1 -1 -1 -1 -1 -1 1 0.6 1 -1 -1 -1 -1 -0.6 1 1 -1 0 -0.6 1 1 1 -1 -1 -0.6 -1 -1 1 -1 -1 0 0 -1 -1 -1 1 1 0 -1 -1 -1 1 0 0 0 7 | sv -1 1 -1 1 -1 0 0 -1 0.7 -1 0 -1 -1 -0.7 -1 -1 -1 -1 -1 1 -1 -1 1 1 1 1 0 1 1 0 1 -1 0.25 0.5 1 -1 1 -1 -1 -1 1 -1 -1 1 -1 1 -1 -1 -1 -1 -1 -1 1 -1 -1 1 -1 -1 -1 1 1 0.7 -0.3 -1 -1 1 1 -1 -1 0.3 -1 -1 1 1 0.3 -1 -1 -1 -1 -0.3 1 1 -1 -1 0.3 1 1 1 -1 -1 -0.3 -1 0 0 0 -0.3 -1 1 -1 1 -1 1 1 1 1 -1 -1 1 0 0 0 8 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.txt.zhang15: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 1 1 0 1 0 2 | de 1 0 1 0 0 1 0 1 0 3 | en 1 0 0 0 1 1 0 1 0 4 | es 0 0 1 0 1 1 0 0 1 5 | fi 1 0 0 1 1 0 1 1 0 6 | fr 1 0 1 0 1 1 0 0 1 7 | ga 0 1 1 0 1 1 0 0 1 8 | hu 1 0 0 1 1 0 1 1 0 9 | it 0 0 1 0 1 1 0 0 1 10 | pt 1 0 1 0 1 1 0 0 1 11 | sv 1 0 0 1 1 1 0 1 0 12 | __ 0 0 0 0 0 0 0 0 0 13 | -------------------------------------------------------------------------------- /typological_properties/typological_properties.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clab/language-universal-parser/6012899f8c6b7018c52f4eac9de6b3486720c568/typological_properties/typological_properties.xlsx -------------------------------------------------------------------------------- /typological_properties/typological_properties_id.txt: -------------------------------------------------------------------------------- 1 | ar 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 | bg 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | cs 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | da 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | de 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | el 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 7 | en 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | es 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | et 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | eu 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | fa 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | fi 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | fr 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | he 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | hi 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | hr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | hu 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 | id 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 19 | it 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 20 | la 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 21 | nl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 22 | no 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 23 | pl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 24 | pt 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 25 | qu 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 26 | ro 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 27 | sl 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 28 | sr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 29 | sv 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 30 | ta 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 31 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 32 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_cs.txt: -------------------------------------------------------------------------------- 1 | cs 0 0.1127272727 0.1163636364 0.1054545455 0.1309090909 0.1018181818 0.08363636364 0.1054545455 0.1163636364 0.1272727273 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_de.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0.1135531136 0 0.1245421245 0.1135531136 0.08791208791 0.1172161172 0.08424908425 0.1062271062 0.1245421245 0.1282051282 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_en.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0.1146953405 0.1218637993 0 0.1218637993 0.09677419355 0.1254480287 0.07885304659 0.08602150538 0.1254480287 0.1290322581 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | 
fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_es.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0.09830508475 0.1050847458 0.1152542373 0 0.09491525424 0.1559322034 0.09830508475 0.07796610169 0.1423728814 0.1118644068 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_fi.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fi 0.140077821 0.09338521401 0.1050583658 0.1089494163 0 0.1050583658 0.07003891051 0.140077821 0.1050583658 0.1322957198 0 6 | fr 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_fr.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0.09556313993 0.1092150171 0.1194539249 0.156996587 0.09215017065 0 0.09556313993 0.07508532423 0.1399317406 0.1160409556 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_ga.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0.1069767442 0.1069767442 0.1023255814 0.1348837209 0.08372093023 0.1302325581 0 0.08837209302 0.1395348837 0.1069767442 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_hu.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0.1244635193 0.1244635193 0.1030042918 0.09871244635 0.1545064378 0.09442060086 0.08154506438 0 0.1030042918 0.1158798283 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- 
/typological_properties/typological_properties_id_tgt_it.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0.1063122924 0.1129568106 0.1162790698 0.1395348837 0.08970099668 0.1362126246 0.09966777409 0.07973421927 0 0.1196013289 0 10 | sv 0 0 0 0 0 0 0 0 0 1 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_id_tgt_sv.txt: -------------------------------------------------------------------------------- 1 | cs 1 0 0 0 0 0 0 0 0 0 0 2 | de 0 1 0 0 0 0 0 0 0 0 0 3 | en 0 0 1 0 0 0 0 0 0 0 0 4 | es 0 0 0 1 0 0 0 0 0 0 0 5 | fr 0 0 0 0 1 0 0 0 0 0 0 6 | fi 0 0 0 0 0 1 0 0 0 0 0 7 | ga 0 0 0 0 0 0 1 0 0 0 0 8 | hu 0 0 0 0 0 0 0 1 0 0 0 9 | it 0 0 0 0 0 0 0 0 1 0 0 10 | sv 0.1194539249 0.1194539249 0.1228668942 0.1126279863 0.1160409556 0.1160409556 0.07849829352 0.09215017065 0.1228668942 0 0 11 | __ 0 0 0 0 0 0 0 0 0 0 1 12 | -------------------------------------------------------------------------------- /typological_properties/typological_properties_none.txt: -------------------------------------------------------------------------------- 1 | cs 0 0 0 0 0 0 0 0 0 0 0 0 0 2 | de 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | en 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | es 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | fr 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | fi 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | ga 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | hu 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | it 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | pt 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | sv 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | __ 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | --------------------------------------------------------------------------------
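
# NOTE: a minimal sketch (an assumption, not code from this repo) of how the typological
# property files above can be consumed: each line maps a language code to a feature vector,
# and the "__" row appears to serve as a fallback for languages without a typology entry.
#
#   def load_typology(path):
#       table = {}
#       for line in open(path):
#           fields = line.split()
#           if fields:
#               table[fields[0]] = [float(v) for v in fields[1:]]
#       return table
#
#   typology = load_typology("typological_properties/typological_properties.txt")
#   vector = typology.get("pt", typology["__"])  # fall back to the "__" row for unlisted languages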