├── .gitignore ├── .idea ├── ant.xml ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── encodings.xml ├── libraries │ ├── Maven__cc_mallet_mallet_2_0_7.xml │ ├── Maven__com_google_guava_guava_18_0.xml │ ├── Maven__com_googlecode_matrix_toolkits_java_mtj_0_9_14.xml │ ├── Maven__com_googlecode_netlib_java_netlib_java_0_9_3.xml │ ├── Maven__com_miglayout_miglayout_core_4_2.xml │ ├── Maven__com_miglayout_miglayout_swing_4_2.xml │ ├── Maven__commons_cli_commons_cli_1_2.xml │ ├── Maven__commons_io_commons_io_1_3_2.xml │ ├── Maven__commons_logging_commons_logging_1_1_1.xml │ ├── Maven__edu_nyu_cs_proteus_Jet_1_8_2_depfix_20.xml │ ├── Maven__edu_nyu_cs_proteus_opennlp_maxent_customized_3_0_0_Mallet_3.xml │ ├── Maven__edu_umass_cs_mallet_crf_0_1.xml │ ├── Maven__fanseparser_fanseparser_0_2_2.xml │ ├── Maven__jgrapht_jgrapht_0_6_0.xml │ ├── Maven__joda_time_joda_time_1_2_1.xml │ ├── Maven__junit_junit_4_10.xml │ ├── Maven__net_sf_jwordnet_jwnl_1_4_rc3.xml │ ├── Maven__net_sf_trove4j_trove4j_2_0_2.xml │ ├── Maven__net_sourceforge_f2j_arpack_combined_all_0_1.xml │ ├── Maven__org_apache_commons_commons_exec_1_1.xml │ ├── Maven__org_apache_commons_commons_math3_3_2.xml │ ├── Maven__org_beanshell_bsh_2_0b4.xml │ ├── Maven__org_hamcrest_hamcrest_core_1_1.xml │ ├── Maven__org_jdom_jdom_1_1.xml │ ├── Maven__org_jyaml_jyaml_1_2.xml │ ├── Maven__org_la4j_la4j_0_6_0.xml │ ├── Maven__org_slf4j_slf4j_api_1_7_5.xml │ ├── Maven__parser_stub_parser_stub_0_1.xml │ └── Maven__pnuts_pnuts_1_2.xml ├── misc.xml ├── modules.xml ├── uiDesigner.xml └── vcs.xml ├── COPYRIGHT ├── ICE.iml ├── LICENSE ├── README.md ├── build.xml ├── docs ├── ICE_Design.md ├── ice1.png ├── ice2.png ├── ice3.png ├── ice4.png ├── ice5.png ├── iceman.md ├── rankentities.png └── rankrelations.png ├── ice-release-script ├── lib ├── commons-cli-1.2.jar ├── commons-exec-1.1.jar ├── commons-io-1.3.2.jar ├── commons-math3-3.2.jar ├── fest-assert-1.2.jar ├── fest-reflect-1.2.jar ├── fest-swing-1.2.jar ├── 
fest-util-1.1.2.jar ├── guava-18.0.jar ├── jet-all-1.8.1.jar ├── jet-all-1.9.0.jar ├── jet-all.jar ├── junit-4.0.jar ├── la4j-0.6.0.jar ├── miglayout-core-4.2.jar └── miglayout-swing-4.2.jar ├── pom.xml ├── release-script └── prepare_ice.sh └── src ├── main ├── java │ └── edu │ │ └── nyu │ │ └── jet │ │ ├── aceJet │ │ ├── AceEntityContext.java │ │ ├── AnchoredTreeSet.java │ │ ├── ArgEmbeddingAnchoredPathSet.java │ │ ├── EventTrees.java │ │ ├── RelaxedDepPathRelationTagger.java │ │ ├── SimAnchoredPathSet.java │ │ ├── SimAnchoredTreeSet.java │ │ └── TypedRelationExtractorScorer.java │ │ └── ice │ │ ├── controllers │ │ ├── Nice.java │ │ └── NiceBootstrapper.java │ │ ├── entityset │ │ ├── EmbeddingEntitySetExpander.java │ │ ├── Entity.java │ │ ├── EntityIndexerBox.java │ │ ├── EntitySetExpander.java │ │ ├── EntitySetIndexer.java │ │ ├── EntitySetRankThread.java │ │ ├── EntitySetRerankThread.java │ │ ├── MIRAEntitySetExpander.java │ │ ├── RankChoiceEntity.java │ │ ├── Simulation.java │ │ └── TypelessEntitySetIndexer.java │ │ ├── events │ │ ├── .DS_Store │ │ ├── DepTreeMap.java │ │ ├── EventBootstrap.java │ │ ├── EventBuilderFrame.java │ │ ├── EventBuilderThread.java │ │ ├── EventFinder.java │ │ ├── IceEvent.java │ │ ├── IceTree.java │ │ ├── IceTreeFactory.java │ │ ├── IceTreeSet.java │ │ ├── PhraseLemmatizer.java │ │ ├── SwingEventsPanel.java │ │ └── icetreeset │ │ ├── models │ │ ├── Corpus.java │ │ ├── DepPath.java │ │ ├── DepPathMap.java │ │ ├── DepPathRegularizer.java │ │ ├── DepPaths.java │ │ ├── DepRecord.java │ │ ├── IceEntitySet.java │ │ ├── IcePath.java │ │ ├── IcePathFactory.java │ │ ├── IcePreprocessor.java │ │ ├── IceRelation.java │ │ ├── JetEngineBuilder.java │ │ ├── MatcherNode.java │ │ ├── MatcherPath.java │ │ ├── PathMatcher.java │ │ ├── RelationFinder.java │ │ ├── WordEmbedding.java │ │ └── Words.java │ │ ├── package-info.java │ │ ├── relation │ │ ├── Bootstrap.java │ │ └── PathRelationExtractor.java │ │ ├── terminology │ │ ├── Term.java │ │ ├── 
TermCounter.java │ │ └── TermRanker.java │ │ ├── uicomps │ │ ├── EntitySetEditorFrame.java │ │ ├── EntitySetRankerFrame.java │ │ ├── Ice.java │ │ ├── IceCellRenderer.java │ │ ├── ListFilter.java │ │ ├── RelationBuilderFrame.java │ │ ├── RelationBuilderThread.java │ │ ├── RelationFilter.java │ │ └── TermFilter.java │ │ ├── utils │ │ ├── AnnotationStartComparator.java │ │ ├── FileNameSchema.java │ │ ├── IceUtils.java │ │ ├── LexUtils.java │ │ ├── ProcessFarm.java │ │ ├── ProgressMonitorI.java │ │ ├── Ratio.java │ │ └── SwingProgressMonitor.java │ │ └── views │ │ ├── Refreshable.java │ │ ├── cli │ │ ├── IceCLI.java │ │ └── package.html │ │ └── swing │ │ ├── SwingCorpusPanel.java │ │ ├── SwingEntitiesPanel.java │ │ ├── SwingEntitySetPanel.java │ │ ├── SwingIceStatusPanel.java │ │ ├── SwingPathsPanel.java │ │ ├── SwingRelationsPanel.java │ │ └── package.html └── python │ ├── extract_field.py │ └── weight_gold.py ├── models └── data │ ├── QuantifierPatterns.txt │ └── apf.v5.1.1.dtd ├── props ├── ice.yml ├── iceprops ├── onomaprops ├── parseprops └── props ├── retired ├── ActiveLearner.java ├── ArgEmbeddingBootstrap.java ├── BatchMaeToApf.java ├── BunescuMooneyFeatureExtractor.java ├── DepPathFeatureExtractor.java ├── DepPathSameConstitsFeatureExtractor.java ├── DepPathTypeFeatureExtractor.java ├── EventItem.java ├── IceCLI6.java ├── LexicalSimilarityBootstrap.java ├── MaeToApf.java ├── RelationBuilder.java ├── RelationEditorFrame.java ├── RelationFeatureExtractor.java ├── RelationOracle.java ├── RichBootstrap.java ├── SameConstitFeatureExtractor.java ├── TokenFeatureExtractor.java └── TokenTypeFeatureExtractor.java ├── scripts ├── icecli ├── icecli6 ├── runice.sh └── runtagger.sh └── test ├── java └── edu │ └── nyu │ └── jet │ └── ice │ ├── FestTest.java │ └── models │ └── DepPathsTest.java ├── resources ├── tinyCorpus2 │ ├── doc1.txt │ ├── doc2.txt │ ├── doc3.txt │ └── doc4.txt ├── tinyCorpus3 │ ├── doc1.txt │ ├── doc2.txt │ ├── doc3.txt │ └── doc4.txt └── tinyCorpus4 │ 
├── doc1.txt │ ├── doc2.txt │ ├── doc3.txt │ └── doc4.txt └── scripts ├── checkCount ├── checkLength └── validateCLI /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipr 2 | *.iws 3 | .idea/workspace.xml 4 | .idea/tasks.xml 5 | .idea/artifacts/** 6 | config/** 7 | target/ 8 | *.bak 9 | *.b 10 | *.save 11 | tmp/ 12 | .DS_Store/ 13 | build/ 14 | report/ 15 | docs/api 16 | ice-all-0.2.1.jar 17 | ice-all.jar -------------------------------------------------------------------------------- /.idea/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__cc_mallet_mallet_2_0_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_guava_18_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /.idea/libraries/Maven__com_googlecode_matrix_toolkits_java_mtj_0_9_14.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_googlecode_netlib_java_netlib_java_0_9_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_miglayout_miglayout_core_4_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_miglayout_miglayout_swing_4_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_io_commons_io_1_3_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_logging_commons_logging_1_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /.idea/libraries/Maven__edu_nyu_cs_proteus_Jet_1_8_2_depfix_20.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__edu_nyu_cs_proteus_opennlp_maxent_customized_3_0_0_Mallet_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__edu_umass_cs_mallet_crf_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__fanseparser_fanseparser_0_2_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__jgrapht_jgrapht_0_6_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__joda_time_joda_time_1_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__junit_junit_4_10.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sf_jwordnet_jwnl_1_4_rc3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sf_trove4j_trove4j_2_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sourceforge_f2j_arpack_combined_all_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_exec_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_math3_3_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_beanshell_bsh_2_0b4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /.idea/libraries/Maven__org_jdom_jdom_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_jyaml_jyaml_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_la4j_la4j_0_6_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__parser_stub_parser_stub_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__pnuts_pnuts_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/modules.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | ICE is made possible by the following open source projects: 2 | 3 | Jet: The Java Extraction Toolkit 4 | 5 | Jet Copyright (c) 1999-2014 Ralph Grishman 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the 8 | License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 . Unless required by 9 | applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 10 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language 11 | governing permissions and limitations under the License. 12 | 13 | la4j (Linear Algebra for Java) 14 | 15 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with 16 | the License. 17 | 18 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 19 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific 20 | language governing permissions and limitations under the License. 
21 | 22 | Mig Layout 23 | 24 | Licensed under the New BSD License: https://code.google.com/p/miglayout/ 25 | 26 | Apache Commons (io, math3) 27 | 28 | Licensed under The Apache Software License, Version 2.0: http://commons.apache.org/ 29 | -------------------------------------------------------------------------------- /ICE.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015 Ralph Grishman 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ICE: Integrated Customization Environment for Information Extraction 2 | 3 | Licensed under the Apache 2.0 license. 4 | 5 | # Information Extraction 6 | 7 | Information extraction is the process of identifying in text all the instances of 8 | specified types of entities, relations, and events. Building an extraction 9 | system for a new domain involves a substantial effort in text analysis and design. 
10 | ICE, the Integrated Customization Environment for Information Extraction, 11 | is designed to ease this task by providing an integrated set of analysis tools. 12 | ICE is built on top of JET, NYU's Java Extraction Toolkit. 13 | 14 | # Running Ice Using Binary Release 15 | 16 | JET and ICE are available as GitHub repositories (rgrishman/jet and rgrishman/ice) 17 | and as binary distribution tar files. To use the binary distributions, simply 18 | download JET and ICE to separate directories and untar them. Set the environment variables 19 | ICE_HOME and JET_HOME to point to the root directories of the distributions and put the 20 | bin directory for ICE on the path. 21 | 22 | Then ICE can be invoked with 23 | 24 | runice.sh 25 | 26 | Note that ICE requires that you add two corpora to ICE before it will 27 | let you do anything else. 28 | 29 | ICE requires quite a few files, listed in [Files](docs/Files.txt), all accessed 30 | through the ICE_HOME and JET_HOME shell variables. These files should all be set 31 | up by the binary distribution. 32 | 33 | # Running the Ice Tagger 34 | 35 | Using ICE you can build up a set of patterns to capture the information you 36 | want to extract from the text. For example, if you want to extract data 37 | on the employment of corporate executives, you might have patterns such 38 | as *person* joins *company* and *company* promoted *person*. After you 39 | have accumulated an initial set of patterns, you can *export* them to JET. 40 | You can then use the JET tagger to extract comparable information from new, 41 | previously unseen text. 42 | 43 | To run the tagger, use the command 44 | 45 | runtagger.sh props txtFileList keyFileList apfFileList 46 | 47 | where *props* is a JET properties file provided as part of the ICE distribution; 48 | *txtFileList* is the list of text input files (one per line), 49 | *keyFileList* is the corresponding list of keys, 50 | and *apfFileList* is the list of output files in Ace apf format. 
51 | 52 | The tagger uses 'perfect entities', which are obtained from the key files, and 53 | extracts relations based on the patterns exported from ICE. 54 | 55 | # Building Ice from Source Using ant 56 | 57 | We assume that you have git and ant installed on your system. 58 | 59 | ICE uses JET to do much of the low-level linguistic processing, and so a copy of JET 60 | is compiled into ICE. This necessitates a 2-step process whenever JET is updated: 61 | first rebuild JET, then build ICE. 62 | 63 | ## Building JET 64 | 65 | Create an empty directory called *export* under the JET_HOME directory. 66 | Get a copy of *jet-release-script* from the JET git repository and run it. It will 67 | produce a JET binary distribution (a tar file named jet-all.jar). 68 | 69 | ## Building ICE 70 | 71 | Create another empty directory called *export* under the ICE_HOME directory. 72 | Get a copy of *ice-release-script* from the ICE git repository and run it. It will 73 | produce an ICE binary distribution (a tar file named ice-all.jar). 74 | 75 | # Running maven 76 | 77 | For those who prefer *maven*, we also provide the necessary *pom.xml* files. These 78 | build and install JET in the local repository and then build ICE. 79 | Maven can be invoked with 80 | 81 | mvn package 82 | 83 | If everything works, you should find 84 | ICE-0.2.1-jar-with-dependencies.jar (the fatjar) in target/. This 85 | should be renamed ice-all.jar and moved to the ICE_HOME directory. 86 | 87 | # User Manual 88 | 89 | Please refer to [Iceman](docs/iceman.md) for usage of ICE. 90 | -------------------------------------------------------------------------------- /docs/ICE_Design.md: -------------------------------------------------------------------------------- 1 | 9 | 10 | # ICE Design Document 11 | 12 | *Oct. 4, 2015* 13 | 14 | ## OVERVIEW 15 | 16 | ICE relies on a detailed analysis of the corpora to provide guidance to 17 | the user in building entity sets and relations. 
In the initial design for ICE, 18 | users initiated various steps of this analysis through the ICE console after 19 | adding a new corpus. This made for OK demos but proved to be unrealistic for 20 | large corpora because each step took so long (several hours). It made more 21 | sense to compute as much as possible in advance (through a batch job). 22 | 23 | This led us to consider how much could be precomputed, even at the cost of some 24 | additional complexity. For example, the parser operates on sequences of lexical 25 | items, some of which are multi-word terms identified by the user in the process 26 | of building an extraction model. The parses are needed in turn for defining new 27 | relations. The simplest system structure would reparse the corpus each time the 28 | user added some terms, but this would introduce an unacceptable delay into the 29 | user's session each time some terms are added. 30 | 31 | We break this dependency by precomputing dependency parses with only single-token 32 | lexical items. If we want to find the dependency path between multi-word 33 | entities, we use the head word to represent the multi-word entity. The head word 34 | is determined by the IcePreprocessor.findTermHead() function. 35 | 36 | ## WHAT IS PRECOMPUTED 37 | 38 | All the information precomputed for corpus X is stored in directory cache/X. 39 | 40 | This includes 5 files. 
41 | 42 | * __ENAMEX tags for each document__: 43 | stored in file documentName.names, 44 | one name per line, format: type \t start \t end 45 | 46 | * __POS tags for each document__: 47 | stored in file documentName.pos, 48 | one token per line, format: POS \t start \t end 49 | 50 | * __the extent of each entity mention in a document__: 51 | stored in file documentName.jetExtent, 52 | one entity mention per line, format: MENTION_ID \t start \t end 53 | Note that we will use names (captured by name tagger), nouns, 54 | and pronouns for bootstrapping, so it is necessary to keep track 55 | of all name mentions. 56 | 57 | * __dependency parse of each document__: 58 | stored in file documentName.dep, one dependency relation 59 | per line. These parses do not include any transformations (??). 60 | 61 | * __Ace document produced by AceJet__: 62 | stored in file documentName.ace, in APF format. 63 | 64 | ## WHAT IS COMPUTED BASED ON PRECOMPUTED INFORMATION 65 | 66 | The following files are computed based on the precomputed information. 67 | Note that after preprocessing, ICE will try to generate initial versions 68 | of these files. However, unlike precomputed files that never change after 69 | preprocessing, the user can regenerate the following files any time. 
70 | 71 | * __the count of each possible term in each document__: 72 | 73 | * __aggregate term counts over the corpus__: 74 | stored in file counts 75 | 76 | * __dependency paths over the corpus__: 77 | 78 | - file Relations: dependency paths between entities, 79 | including endpoints, with frequency count 80 | 81 | - file RelationRepr: typed dependency paths with linearization 82 | and single example; represented in memory as an 83 | instance of the DepPathMap class 84 | 85 | - file Relationtypes: typed dependency paths with frequencies, ranked 86 | 87 | - file Relationtypes.source.dict: typed dependency paths with 88 | frequency and single example, ranked 89 | 90 | all four files with dependency paths are generated by DepPaths.main(), 91 | called from RelationFinder.run(). RelationFinder is called from 92 | IcePreprocessor (after generating the 'Precomputed' data) to create 93 | an initial version of these files. The user can update the files 94 | after some entity sets have been added by pushing one of the buttons 95 | on the Paths panel and then asking for recompilation of the paths. 
96 | 97 | 107 | 108 | 111 | 112 | ## WHAT IS REPRESENTED INTERNALLY (AND WHEN IT IS COMPUTED) 113 | 114 | -------------------------------------------------------------------------------- /docs/ice1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/ice1.png -------------------------------------------------------------------------------- /docs/ice2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/ice2.png -------------------------------------------------------------------------------- /docs/ice3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/ice3.png -------------------------------------------------------------------------------- /docs/ice4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/ice4.png -------------------------------------------------------------------------------- /docs/ice5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/ice5.png -------------------------------------------------------------------------------- /docs/rankentities.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/rankentities.png -------------------------------------------------------------------------------- /docs/rankrelations.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/docs/rankrelations.png -------------------------------------------------------------------------------- /ice-release-script: -------------------------------------------------------------------------------- 1 | #!/bin/tcsh 2 | # 3 | # tcsh shell script which makes a copy (clone) of the ICE project in github 4 | # and constructs a distribution tar for ICE 5 | # 6 | # Run this script in a fresh directory 7 | # 8 | setenv JET_PACKAGE $JET_HOME/export/jet-bin/jet-190128.tar.gz 9 | setenv JAVA_TOOL_OPTIONS "-Dfile.encoding=UTF-8" 10 | rm -rf ice-bin 11 | mkdir ice-bin 12 | cd ice-bin 13 | curl http://daringfireball.net/projects/downloads/Markdown_1.0.1.zip > Markdown_1.0.1.zip 14 | unzip Markdown_1.0.1.zip 15 | echo "Unpacking Jet package..." 16 | tar zxvf $JET_PACKAGE 17 | mv props jet-props 18 | git clone https://github.com/rgrishman/ice.git clone-dir 19 | cp jet-all.jar clone-dir/lib 20 | pushd clone-dir 21 | git checkout newmaster 22 | ant dist-all-jar 23 | popd 24 | cp clone-dir/ice-all.jar . 
25 | perl Markdown_1.0.1/Markdown.pl clone-dir/README.md > README.html 26 | perl Markdown_1.0.1/Markdown.pl clone-dir/docs/iceman.md > docs/iceman.html 27 | perl Markdown_1.0.1/Markdown.pl clone-dir/docs/ICE_Design.md > docs/ICE_Design.html 28 | cp clone-dir/docs/*.png docs/ 29 | cp clone-dir/LICENSE ./ 30 | cp clone-dir/COPYRIGHT ./ 31 | # scripts 32 | cp clone-dir/src/scripts/runice.sh ./bin 33 | cp clone-dir/src/scripts/runtagger.sh ./bin 34 | cp clone-dir/src/scripts/icecli ./bin 35 | cp clone-dir/src/scripts/icecli6 ./bin 36 | # ice.yml, iceprops, onomaprops, parseprops, props 37 | cp clone-dir/src/props/* ./ 38 | # quantifierPatterns and ACE DTD 39 | cp clone-dir/src/models/data/* ./data/ 40 | # files for export from ICE 41 | touch acedata/ice_onoma.dict 42 | touch acedata/EDTypesFromUser.dict 43 | touch acedata/iceRelationModel 44 | chmod u+x bin/runice.sh 45 | chmod u+x bin/runtagger.sh 46 | chmod u+x bin/icecli6 47 | chmod u+x bin/icecli 48 | rm -rf Markdown* 49 | rm -rf clone-dir 50 | cd .. 
51 | echo "Building ICE tar" 52 | set date = `date +'%y%m%d'` 53 | tar zcvf ice-$date.tar.gz ice-bin/ 54 | -------------------------------------------------------------------------------- /lib/commons-cli-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/commons-cli-1.2.jar -------------------------------------------------------------------------------- /lib/commons-exec-1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/commons-exec-1.1.jar -------------------------------------------------------------------------------- /lib/commons-io-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/commons-io-1.3.2.jar -------------------------------------------------------------------------------- /lib/commons-math3-3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/commons-math3-3.2.jar -------------------------------------------------------------------------------- /lib/fest-assert-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/fest-assert-1.2.jar -------------------------------------------------------------------------------- /lib/fest-reflect-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/fest-reflect-1.2.jar 
-------------------------------------------------------------------------------- /lib/fest-swing-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/fest-swing-1.2.jar -------------------------------------------------------------------------------- /lib/fest-util-1.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/fest-util-1.1.2.jar -------------------------------------------------------------------------------- /lib/guava-18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/guava-18.0.jar -------------------------------------------------------------------------------- /lib/jet-all-1.8.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/jet-all-1.8.1.jar -------------------------------------------------------------------------------- /lib/jet-all-1.9.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/jet-all-1.9.0.jar -------------------------------------------------------------------------------- /lib/jet-all.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/jet-all.jar -------------------------------------------------------------------------------- /lib/junit-4.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/junit-4.0.jar -------------------------------------------------------------------------------- /lib/la4j-0.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/la4j-0.6.0.jar -------------------------------------------------------------------------------- /lib/miglayout-core-4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/miglayout-core-4.2.jar -------------------------------------------------------------------------------- /lib/miglayout-swing-4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/lib/miglayout-swing-4.2.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | edu.nyu.cs.proteus 8 | ICE 9 | 0.2.1 10 | 11 | 12 | 13 | 14 | org.apache.maven.plugins 15 | maven-compiler-plugin 16 | 2.2 17 | 18 | 8 19 | 8 20 | 21 | 22 | 23 | org.apache.maven.plugins 24 | maven-assembly-plugin 25 | 2.5.5 26 | 27 | 28 | jar-with-dependencies 29 | 30 | 31 | 32 | 33 | assemble-all 34 | package 35 | 36 | single 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | org.la4j 47 | la4j 48 | 0.6.0 49 | 50 | 51 | com.google.guava 52 | guava 53 | 18.0 54 | 55 | 56 | com.miglayout 57 | miglayout-swing 58 | 4.2 59 | 60 | 61 | org.apache.commons 62 | commons-math3 63 | 3.2 64 | 65 | 66 | commons-io 67 | commons-io 68 | 1.3.2 69 | 70 | 71 | edu.nyu 72 | Jet 73 | 1.9.5 74 | 75 | 76 | commons-cli 77 | commons-cli 78 | 1.2 79 | 80 | 81 | org.apache.commons 82 | commons-exec 83 
| 1.1 84 | 85 | 86 | net.sf.trove4j 87 | trove4j 88 | 3.0.3 89 | 90 | 91 | 92 | junit 93 | junit 94 | 4.0 95 | test 96 | 97 | 98 | 99 | org.easytesting 100 | fest-assert 101 | 1.4 102 | test 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /release-script/prepare_ice.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/tcsh 2 | # 3 | # input: JET distribution tar 4 | # clone-dir 5 | # output: ICE distribution tar 6 | # 7 | # Run this script in a fresh directory 8 | # 9 | setenv JET_PACKAGE $JET_HOME/jet-170101.tar.gz 10 | setenv JAVA_TOOL_OPTIONS "-Dfile.encoding=UTF-8" 11 | rm -rf ice-bin 12 | mkdir ice-bin 13 | cd ice-bin 14 | curl http://daringfireball.net/projects/downloads/Markdown_1.0.1.zip > Markdown_1.0.1.zip 15 | unzip Markdown_1.0.1.zip 16 | echo "Unpacking Jet package..." 17 | tar zxvf $JET_PACKAGE 18 | mv props jet-props 19 | git clone https://github.com/rgrishman/ice.git clone-dir 20 | cp jet-all.jar clone-dir/lib 21 | pushd clone-dir 22 | git checkout newmaster 23 | ant dist-all-jar 24 | popd 25 | cp clone-dir/ice-all.jar . 
26 | perl Markdown_1.0.1/Markdown.pl clone-dir/README.md > README.html 27 | perl Markdown_1.0.1/Markdown.pl clone-dir/docs/iceman.md > docs/iceman.html 28 | perl Markdown_1.0.1/Markdown.pl clone-dir/docs/ICE_Design.md > docs/ICE_Design.html 29 | cp clone-dir/docs/*.png docs/ 30 | cp clone-dir/LICENSE ./ 31 | cp clone-dir/COPYRIGHT ./ 32 | # scripts 33 | cp clone-dir/src/scripts/runice.sh ./bin 34 | cp clone-dir/src/scripts/runtagger.sh ./bin 35 | cp clone-dir/src/scripts/icecli ./bin 36 | cp clone-dir/src/scripts/icecli6 ./bin 37 | # ice.yml, iceprops, onomaprops, parseprops, props 38 | cp clone-dir/src/props/* ./ 39 | # quantifierPatterns and ACE DTD 40 | cp clone-dir/src/models/data/* ./data/ 41 | # files for export from ICE 42 | touch acedata/ice_onoma.dict 43 | touch acedata/EDTypesFromUser.dict 44 | touch acedata/iceRelationModel 45 | chmod u+x bin/runice.sh 46 | chmod u+x bin/runtagger.sh 47 | chmod u+x bin/icecli6 48 | chmod u+x bin/icecli 49 | rm -rf Markdown* 50 | rm -rf clone-dir 51 | cd .. 
52 | echo "Building ICE tar" 53 | set date = `date +'%y%m%d'` 54 | tar zcvf ice-$date.tar.gz ice-bin/ 55 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/aceJet/AnchoredTreeSet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.aceJet; 2 | 3 | import edu.nyu.jet.ice.events.IceTree; 4 | import edu.nyu.jet.ice.events.IceTreeFactory; 5 | import edu.nyu.jet.ice.models.WordEmbedding; 6 | import java.util.*; 7 | import java.io.*; 8 | 9 | public class AnchoredTreeSet implements Iterable { 10 | 11 | ArrayList paths = new ArrayList(); 12 | Map> pathIndex = new HashMap>(); 13 | Map> argIndex = new HashMap>(); 14 | int count; 15 | 16 | public AnchoredTreeSet (String fileName) throws IOException { 17 | BufferedReader reader = new BufferedReader (new FileReader (fileName)); 18 | String line; 19 | count = 0; 20 | while ((line = reader.readLine()) != null) { 21 | add (line); 22 | } 23 | System.out.println ("Loaded " + count + " trees."); 24 | } 25 | 26 | public void add (String line) { 27 | int j = line.indexOf("\t"); 28 | if (j >= 0) { 29 | line = line.substring(j + 1); 30 | } 31 | IceTree p = IceTreeFactory.getIceTree(line); 32 | if (p == null) 33 | return; 34 | paths.add(p); 35 | String trigger = p.getTrigger(); 36 | if (pathIndex.get(trigger) == null) 37 | pathIndex.put(trigger, new ArrayList()); 38 | pathIndex.get(trigger).add(p); 39 | String args = p.getArgValueForRole("nsubj") + ":" + p.getArgValueForRole("dobj"); 40 | if (argIndex.get(args) == null) 41 | argIndex.put(args, new ArrayList()); 42 | argIndex.get(args).add(p); 43 | count++; 44 | } 45 | 46 | public List getByTree (String path) { 47 | return pathIndex.get(path); 48 | } 49 | 50 | public List getByArgs (String arg1, String arg2) { 51 | return getByArgs (arg1 + ":" + arg2); 52 | } 53 | 54 | public List getByArgs (String args) { 55 | return argIndex.get(args); 56 | }static public double[] embed 
(List paths) { 57 | int dim = WordEmbedding.getDim(); 58 | for (IceTree ip : paths) 59 | ip.embed(); 60 | double[] result = new double[dim]; 61 | for (int j=0; j < dim; j++) { 62 | result[j] = paths.get(0).embed()[j]; 63 | } 64 | for (int i=1; i < paths.size(); i++) { 65 | for (int j=0; j < dim; j++) { 66 | result[i] += paths.get(i).embed()[j]; 67 | } 68 | } 69 | return result; 70 | } 71 | 72 | /** 73 | * returns an Iterator over the paths in the AnchoredTreeSet. 74 | */ 75 | 76 | @Override 77 | public Iterator iterator() { 78 | return paths.iterator(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/aceJet/ArgEmbeddingAnchoredPathSet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.aceJet; 2 | 3 | import edu.nyu.jet.ice.models.PathMatcher; 4 | import edu.nyu.jet.ice.utils.IceUtils; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Set; 11 | 12 | /** 13 | * Find similar paths. Unlike AnchorPathSet, which requires exact match between args, 14 | * this version uses PathMatcher to find similar paths. 
15 | * 16 | * @author yhe 17 | * @version 1.0 18 | */ 19 | public class ArgEmbeddingAnchoredPathSet extends AnchoredPathSet { 20 | 21 | private Map embedding = null; 22 | public static final int DIM_SIZE = 200; 23 | private double threshold = 0.6; 24 | 25 | public ArgEmbeddingAnchoredPathSet(String fileName, 26 | Map embedding, 27 | double threshold) 28 | throws IOException { 29 | super(fileName); 30 | this.embedding = embedding; 31 | this.threshold = threshold; 32 | } 33 | 34 | public ArgEmbeddingAnchoredPathSet(String fileName) throws IOException { 35 | super(fileName); 36 | } 37 | 38 | public List similarPaths(AnchoredPath p, 39 | Set exclusionSet) { 40 | List result = new ArrayList(); 41 | double[] pVec = argVector(p); 42 | 43 | for (AnchoredPath path : paths) { 44 | if (exclusionSet.contains(path.path)) { 45 | continue; 46 | } 47 | double[] pathVec = argVector(path); 48 | double score = IceUtils.innerProduct(pVec, pathVec)/2; 49 | if (score > threshold) { 50 | System.err.println(path.path + " : " + p + " = " + score); 51 | exclusionSet.add(path.path); 52 | result.add(path); 53 | } 54 | } 55 | return result; 56 | } 57 | 58 | private double[] argVector(AnchoredPath p) { 59 | double[] pVec = new double[DIM_SIZE * 2]; 60 | if (embedding.containsKey(p.arg1)) { 61 | double[] vec = embedding.get(p.arg1); 62 | for (int i = 0; i < vec.length; i++) { 63 | pVec[i] = vec[i]; 64 | } 65 | } 66 | if (embedding.containsKey(p.arg2)) { 67 | double[] vec = embedding.get(p.arg2); 68 | for (int i = 0; i < vec.length; i++) { 69 | pVec[i + DIM_SIZE] = vec[i]; 70 | } 71 | } 72 | return pVec; 73 | } 74 | 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/aceJet/SimAnchoredPathSet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.aceJet; 2 | 3 | import edu.nyu.jet.ice.models.PathMatcher; 4 | import edu.nyu.jet.ice.models.WordEmbedding; 5 | import 
edu.nyu.jet.ice.models.IcePath; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * Find similar paths. Unlike AnchorPathSet, which requires exact match between args, 13 | * this version uses PathMatcher or WordEmbeddings to find similar paths. 14 | * 15 | * @author yhe 16 | * @version 1.0 17 | */ 18 | public class SimAnchoredPathSet extends AnchoredPathSet { 19 | 20 | private PathMatcher matcher = null; 21 | private double threshold = 0.3; 22 | private boolean useWE = false; 23 | 24 | public SimAnchoredPathSet(String fileName, PathMatcher matcher, double threshold) 25 | throws IOException { 26 | super(fileName); 27 | this.matcher = matcher; 28 | this.threshold = threshold; 29 | useWE = WordEmbedding.isLoaded(); 30 | } 31 | 32 | public SimAnchoredPathSet(String fileName) throws IOException { 33 | super(fileName); 34 | } 35 | 36 | public List similarPaths(IcePath centroid) { 37 | if (useWE) { 38 | String[] x = centroid.getPathString().split("--"); 39 | if (x.length > 1) centroid = new IcePath(x[1].trim()); 40 | } 41 | List result = new ArrayList(); 42 | for (AnchoredPath path : paths) { 43 | double score = 0.; 44 | if (useWE) { 45 | String string1 = centroid.getPathString(); 46 | String string2 = path.toString(); 47 | score = WordEmbedding.pathSimilarity(string1, string2); 48 | } else { 49 | ; // XXX score = 1 - (matcher.matchPaths("UNK -- " + path.path + " -- UNK", 50 | // XXX "UNK -- " + p + " -- UNK") / (p.getPath().split(":").length + 1)); 51 | } 52 | if (score > threshold) { 53 | // XXX System.err.println(path.path + " : " + p + " = " + score); 54 | result.add(path); 55 | } 56 | } 57 | return result; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/aceJet/SimAnchoredTreeSet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.aceJet; 2 | 3 | import 
edu.nyu.jet.ice.events.IceTree; 4 | import edu.nyu.jet.ice.models.PathMatcher; 5 | import edu.nyu.jet.ice.models.WordEmbedding; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * Find similar paths. Unlike AnchorPathSet, which requires exact match between args, 13 | * this version uses PathMatcher or WordEmbeddings to find similar paths. 14 | * 15 | * @author yhe 16 | * @version 1.0 17 | */ 18 | public class SimAnchoredTreeSet extends AnchoredTreeSet { 19 | 20 | private PathMatcher matcher = null; 21 | private double threshold = 0.3; 22 | private boolean useWE = false; 23 | 24 | public SimAnchoredTreeSet(String fileName, PathMatcher matcher, double threshold) 25 | throws IOException { 26 | super(fileName); 27 | this.matcher = matcher; 28 | this.threshold = threshold; 29 | useWE = WordEmbedding.isLoaded(); 30 | } 31 | 32 | public SimAnchoredTreeSet(String fileName) throws IOException { 33 | super(fileName); 34 | } 35 | 36 | public List similarPaths(String p) { 37 | if (useWE) { 38 | String[] x = p.split("--"); 39 | if (x.length > 1) p = x[1].trim(); 40 | } 41 | List result = new ArrayList(); 42 | /* 43 | for (IceTree path : paths) { 44 | double score; 45 | if (useWE) { 46 | score = WordEmbedding.pathSimilarity(p, path.path); 47 | } else { 48 | score = 1 - (matcher.matchPaths("UNK -- " + path.path + " -- UNK", 49 | "UNK -- " + p + " -- UNK") / (p.split(":").length + 1)); 50 | } 51 | if (score > threshold) { 52 | System.err.println(path.path + " : " + p + " = " + score); 53 | result.add(path); 54 | } 55 | } 56 | */ 57 | return result; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/EmbeddingEntitySetExpander.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | import org.la4j.Vector; 4 | import org.la4j.vector.dense.BasicVector; 5 | 
import org.la4j.vector.DenseVector; 6 | import org.la4j.vector.sparse.CompressedVector; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.HashMap; 12 | import java.util.HashSet; 13 | import java.util.List; 14 | import java.util.StringTokenizer; 15 | 16 | /** 17 | * Expand entity sets according to distributional similarity. Each noun group is represented by 18 | * its skip-gram embedding trained by word2vec. 19 | * 20 | * This class is not used by default by the ICE GUI, but can be activated easily. 21 | */ 22 | public class EmbeddingEntitySetExpander extends EntitySetExpander { 23 | 24 | public EmbeddingEntitySetExpander() { 25 | 26 | } 27 | 28 | public EmbeddingEntitySetExpander(String indexFileName, List seeds) { 29 | used = new HashSet(); 30 | entityFeatureDict = new HashMap(); 31 | String line; 32 | try { 33 | BufferedReader br = new BufferedReader(new FileReader(indexFileName)); 34 | line = br.readLine(); 35 | int featureSize = Integer.valueOf(line.split(" ")[1]); 36 | centroid = new BasicVector(featureSize); 37 | negativeCentroid = new BasicVector(featureSize); 38 | 39 | while ((line = br.readLine()) != null) { 40 | StringTokenizer tok = new StringTokenizer(line); 41 | String word = tok.nextToken().replaceAll("_", " "); 42 | Vector v = new BasicVector(featureSize); //BasicVector 43 | int i = 0; 44 | while (tok.hasMoreTokens()) { 45 | v.set(i, Double.valueOf(tok.nextToken())); 46 | i++; 47 | } 48 | entityFeatureDict.put(word, v); 49 | } 50 | br.close(); 51 | } catch (IOException e) { 52 | e.printStackTrace(); 53 | } 54 | if (seeds != null) { 55 | for (String seed : seeds) { 56 | if (entityFeatureDict.containsKey(seed)) { 57 | centroid = centroid.add(entityFeatureDict.get(seed)); 58 | } 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/Entity.java: 
-------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | /** 4 | * Java bean for entities in entity set expansion. equality is based on 5 | * text and type, NOT score. 6 | * 7 | * @author yhe 8 | * @version 1.0 9 | */ 10 | public class Entity { 11 | private String text; 12 | private String type; 13 | private double score; 14 | 15 | public Entity(String text) { 16 | this.text = text; 17 | } 18 | 19 | public Entity(String text, String type, double score) { 20 | this.score = score; 21 | this.text = text; 22 | this.type = type; 23 | } 24 | 25 | public String getText() { 26 | return text; 27 | } 28 | 29 | public void setText(String text) { 30 | this.text = text; 31 | } 32 | 33 | public String getType() { 34 | return type; 35 | } 36 | 37 | public void setType(String type) { 38 | this.type = type; 39 | } 40 | 41 | public double getScore() { 42 | return score; 43 | } 44 | 45 | public void setScore(double score) { 46 | this.score = score; 47 | } 48 | 49 | public static Entity fromString(String line) { 50 | // StringTokenizer t = new StringTokenizer(line, " \t\n\r"); 51 | // double score = Double.valueOf(t.nextToken()); 52 | // String[] parts = t.nextToken().split("/"); 53 | String[] parts = line.trim().split("\\t"); 54 | if (parts.length == 2) { 55 | double score = Double.valueOf(parts[0]); 56 | String[] smallParts = parts[1].split("/"); 57 | if (smallParts.length == 2 && 58 | smallParts[0].length() > 1 && 59 | Character.isLetter(smallParts[0].charAt(0))) { 60 | return new Entity(smallParts[0], smallParts[1], score); 61 | } 62 | } 63 | return null; 64 | } 65 | 66 | 67 | @Override 68 | public boolean equals(Object o) { 69 | if (this == o) return true; 70 | if (o == null || getClass() != o.getClass()) return false; 71 | 72 | Entity entity = (Entity) o; 73 | 74 | if (text != null ? !text.equals(entity.text) : entity.text != null) return false; 75 | if (type != null ? 
!type.equals(entity.type) : entity.type != null) return false; 76 | 77 | return true; 78 | } 79 | 80 | @Override 81 | public int hashCode() { 82 | int result = text != null ? text.hashCode() : 0; 83 | result = 31 * result + (type != null ? type.hashCode() : 0); 84 | return result; 85 | } 86 | 87 | @Override 88 | public String toString() { 89 | return text; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/EntityIndexerBox.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset;// -*- tab-width: 4 -*- 2 | //Title: JET-ICE 3 | //Version: 1.72 4 | //Copyright: Copyright (c) 2014 5 | //Author: Ralph Grishman 6 | //Description: A Java-based Information Extraction Tool -- Customization Environment 7 | 8 | import edu.nyu.jet.ice.models.Corpus; 9 | import edu.nyu.jet.ice.models.IceEntitySet; 10 | import edu.nyu.jet.ice.utils.FileNameSchema; 11 | import edu.nyu.jet.ice.utils.IceUtils; 12 | import edu.nyu.jet.ice.utils.ProgressMonitorI; 13 | import edu.nyu.jet.ice.utils.SwingProgressMonitor; 14 | import edu.nyu.jet.ice.entityset.*; 15 | import edu.nyu.jet.ice.uicomps.Ice; 16 | import net.miginfocom.swing.MigLayout; 17 | 18 | import javax.swing.*; 19 | import javax.swing.border.*; 20 | import java.awt.*; 21 | import java.awt.event.ActionEvent; 22 | import java.awt.event.ActionListener; 23 | import java.io.File; 24 | import java.io.FileReader; 25 | import java.io.IOException; 26 | import java.util.*; 27 | import java.util.List; 28 | 29 | /** 30 | * Button for initiating entity indexing and selecting cutoff. 
31 | */ 32 | 33 | public class EntityIndexerBox { 34 | 35 | public static JFrame currentFrame = Ice.mainFrame; 36 | 37 | public void buildIndex(double cutoff, String inType) { 38 | 39 | Corpus selectedCorpus = Ice.selectedCorpus; 40 | EntitySetIndexerThread indexer = new EntitySetIndexerThread( 41 | FileNameSchema.getTermsFileName(selectedCorpus.getName()), 42 | inType, 43 | String.valueOf(cutoff), 44 | "onomaprops", 45 | selectedCorpus.getDocListFileName(), 46 | selectedCorpus.getDirectory(), 47 | selectedCorpus.getFilter(), 48 | FileNameSchema.getEntitySetIndexFileName(selectedCorpus.getName(), inType) 49 | ); 50 | 51 | indexer.start(); 52 | } 53 | 54 | public Box makeSwingBox() { 55 | Box box = Box.createVerticalBox(); 56 | box.setOpaque(false); 57 | TitledBorder border = new TitledBorder("Index Entities"); 58 | box.setMinimumSize(new Dimension(480, 32)); 59 | box.setBorder(border); 60 | JPanel indexBox = new JPanel(); 61 | indexBox.setOpaque(false); 62 | indexBox.setLayout(new MigLayout()); 63 | 64 | JLabel cutoffLabel = new JLabel("Cutoff"); 65 | final JTextField cutoffField = new JTextField(); 66 | cutoffField.setColumns(5); 67 | cutoffField.setText("3"); 68 | indexBox.add(cutoffLabel); 69 | indexBox.add(cutoffField); 70 | 71 | JButton buildIndexButton = new JButton("Index"); 72 | indexBox.add(buildIndexButton); 73 | 74 | box.add(indexBox); 75 | 76 | buildIndexButton.addActionListener(new ActionListener() { 77 | public void actionPerformed(ActionEvent actionEvent) { 78 | double cutoff = 0.0; 79 | try { 80 | cutoff = Double.valueOf(cutoffField.getText()); 81 | if (cutoff < 0.0) { 82 | throw new Exception("Wrong cutoff value"); 83 | } 84 | } 85 | catch (Exception e) { 86 | JOptionPane.showMessageDialog(Ice.mainFrame, 87 | "Cutoff is a number larger than 0.0", 88 | "Cutoff value error", 89 | JOptionPane.ERROR_MESSAGE); 90 | e.printStackTrace(); 91 | return; 92 | } 93 | 94 | int docCount = Ice.selectedCorpus.getNumberOfDocs(); 95 | ProgressMonitorI 
progressMonitor = new SwingProgressMonitor(Ice.mainFrame, "Building index...", 96 | "Initializing Jet", 0, docCount + 5); 97 | EntitySetIndexer.setDefaultProgressMonitor(progressMonitor); 98 | 99 | File termFile = new File(FileNameSchema.getTermsFileName(Ice.selectedCorpusName)); 100 | if (!termFile.exists()) { 101 | JOptionPane.showMessageDialog(Ice.mainFrame, 102 | "Entities file does not exist. Please run find entities first.", 103 | "Find entities first", 104 | JOptionPane.ERROR_MESSAGE); 105 | return; 106 | } 107 | 108 | buildIndex(cutoff, "nn"); 109 | } 110 | }); 111 | return box; 112 | } 113 | 114 | } 115 | 116 | class EntitySetIndexerThread extends Thread { 117 | String[] args; 118 | 119 | EntitySetIndexerThread (String countFile, String type, String cutoff, 120 | String propsFile, String docList, String inputDir, 121 | String inputSuffix, String outputFile) { 122 | args = new String[8]; 123 | args[0] = countFile; 124 | args[1] = type; 125 | args[2] = cutoff; 126 | args[3] = propsFile; 127 | args[4] = docList; 128 | args[5] = inputDir; 129 | args[6] = inputSuffix; 130 | args[7] = outputFile; 131 | } 132 | 133 | public void run() { 134 | try { 135 | Thread.sleep(1000); 136 | EntitySetIndexer.main(args); 137 | } catch (Exception e) { 138 | System.err.println ("Exception in EntitySetIndexer:\n"); 139 | e.printStackTrace(); 140 | } 141 | } 142 | } 143 | 144 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/EntitySetRankThread.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | import edu.nyu.jet.ice.uicomps.EntitySetRankerFrame; 4 | 5 | /** 6 | * Wrapper of the EntitySetExpander to run it in a separate Thread 7 | * 8 | * @author yhe 9 | * @version 1.0 10 | */ 11 | public class EntitySetRankThread extends Thread { 12 | private EntitySetExpander expander; 13 | private EntitySetRankerFrame frame; 14 | private 
boolean showWindow; 15 | 16 | public EntitySetRankThread (EntitySetExpander expander, EntitySetRankerFrame frame, boolean showWindow) { 17 | this.expander = expander; 18 | this.frame = frame; 19 | this.showWindow = showWindow; 20 | } 21 | 22 | public EntitySetRankThread (EntitySetExpander expander, EntitySetRankerFrame frame) { 23 | this.expander = expander; 24 | this.frame = frame; 25 | this.showWindow = false; 26 | } 27 | 28 | @Override 29 | public void run() { 30 | try { 31 | Thread.sleep(1000); 32 | showWindow = expander.rank(); 33 | frame.updateList(); 34 | //frame.updateLists(expander.getPositives(), expander.getNegatives()); 35 | if (showWindow) { 36 | // frame.setSize(400, 525); 37 | frame.setAlwaysOnTop(true); 38 | frame.setLocationRelativeTo(null); 39 | frame.setVisible(true); 40 | frame.listPane.revalidate(); 41 | frame.listPane.repaint(); 42 | frame.rankedList.revalidate(); 43 | frame.rankedList.repaint(); 44 | // frame.positiveList.revalidate(); 45 | // frame.negativeList.revalidate(); 46 | // frame.positiveList.repaint(); 47 | // frame.negativeList.repaint(); 48 | } 49 | //count++; 50 | } catch (Exception e) { 51 | System.err.println ("Exception in EntitySetExpander:\n"); 52 | e.printStackTrace(); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/EntitySetRerankThread.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | import edu.nyu.jet.ice.uicomps.EntitySetRankerFrame; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Wrapper of the EntitySetExpander to run entity reranking in a separate thread 9 | * 10 | * @author yhe 11 | * @version 1.0 12 | */ 13 | public class EntitySetRerankThread extends Thread { 14 | private EntitySetExpander expander; 15 | private EntitySetRankerFrame frame; 16 | private List entities; 17 | 18 | 19 | public EntitySetRerankThread(EntitySetExpander 
expander, EntitySetRankerFrame frame, List entities) { 20 | this.expander = expander; 21 | this.frame = frame; 22 | this.entities = entities; 23 | } 24 | 25 | @Override 26 | public void run() { 27 | try { 28 | Thread.sleep(1000); 29 | expander.rerank(entities); 30 | frame.updateList(); 31 | frame.listPane.validate(); 32 | frame.listPane.repaint(); 33 | } catch (Exception e) { 34 | System.err.println ("Exception in EntitySetExpander:\n"); 35 | e.printStackTrace(); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/MIRAEntitySetExpander.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | import org.la4j.Vector; 4 | import org.la4j.vector.sparse.CompressedVector; 5 | 6 | import java.util.*; 7 | 8 | /** 9 | * Entity set expansion using the MIRA-trained perceptron 10 | * 11 | * This is an alternative to the default distance based algorithm in EntitySetExpander. It has 12 | * similar performance as the default algorithm, so it is not activated by default in the current 13 | * system. 14 | * 15 | * Using the binary version of MIRA: 16 | * Koby Crammer and Yoram Singer. 
Ultraconservative Online Algorithms for Multiclass Problems 17 | * in JMLR 3 (2003): 965 18 | * 19 | * @author yhe 20 | * @version 1.0 21 | */ 22 | public class MIRAEntitySetExpander extends EntitySetExpander { 23 | 24 | private Vector parameters; 25 | public static final int EPOCHS = 50; 26 | 27 | public MIRAEntitySetExpander(String indexFileName, List seeds) { 28 | super(indexFileName, seeds); 29 | parameters = new CompressedVector(this.getFeatureSize()); 30 | } 31 | 32 | private double tao(double y, Vector wbar, Vector xbar) { 33 | double x = - y * wbar.innerProduct(xbar) / xbar.innerProduct(xbar); 34 | if (x < 0) return 0; 35 | if (x <= 1) return x; 36 | return 1; 37 | } 38 | 39 | 40 | @Override 41 | public void updateParameters() { 42 | List positives = getPositives(); 43 | List negatives = getNegatives(); 44 | List examples = new ArrayList(positives.size() + negatives.size()); 45 | Map featureDict = getEntityFeatureDict(); 46 | Vector parameters = new CompressedVector(this.parameters.length()); 47 | for (String positive : positives) { 48 | examples.add(new Entity(positive, "noun", 1)); 49 | Vector p = featureDict.get(positive); 50 | p = p.divide(p.sum()); 51 | parameters = parameters.add(p); 52 | } 53 | for (String negative : negatives) { 54 | examples.add(new Entity(negative, "noun", -1)); 55 | Vector n = featureDict.get(negative); 56 | n = n.divide(n.sum()); 57 | parameters = parameters.subtract(n); 58 | } 59 | List parametersHistory = new ArrayList(); 60 | 61 | // Random r = new Random(); 62 | // for (int i = 0; i < parameters.length(); i++) { 63 | // double v = r.nextBoolean() ? 
r.nextDouble() : -r.nextDouble(); 64 | // parameters.set(i, v); 65 | // } 66 | for (int i = 0; i < EPOCHS; i++) { 67 | Collections.shuffle(examples); 68 | for (int t = 0; t < examples.size(); t++) { 69 | Entity x = examples.get(t); 70 | Vector xt = featureDict.get(x.getText()); 71 | double xtSum = xt.sum(); 72 | xt = xt.divide(xtSum); 73 | double yhat = xt.innerProduct(parameters); 74 | if (yhat * examples.get(t).getScore() <= 0) { 75 | // should update 76 | Vector delta = xt.multiply(tao(x.getScore(), parameters, xt) * x.getScore()); 77 | //delta = delta.divide(i); 78 | parameters = parameters.add(delta); 79 | } 80 | } 81 | Vector history = new CompressedVector(parameters.length()); 82 | for (int j = 0; j < parameters.length(); j++) { 83 | if (parameters.get(j) != 0) { 84 | history.set(j, parameters.get(j)); 85 | } 86 | } 87 | parametersHistory.add(history); 88 | } 89 | this.parameters = new CompressedVector(this.parameters.length()); 90 | for (int i = 0; i < parametersHistory.size(); i++) { 91 | this.parameters = this.parameters.add(parametersHistory.get(i)); 92 | } 93 | } 94 | 95 | public void rerank(List entities) { 96 | Map entityFeatureDict = getEntityFeatureDict(); 97 | 98 | if (progressMonitor != null) { 99 | progressMonitor.setNote("Calculating similarity..."); 100 | progressMonitor.setProgress(0); 101 | progressMonitor.setMaximum(entityFeatureDict.size() + 5); 102 | try { 103 | Thread.sleep(200); 104 | } 105 | catch (InterruptedException e) { 106 | e.printStackTrace(); 107 | } 108 | } 109 | int count = 0; 110 | boolean isCanceled = false; 111 | for (Entity e : entities) { 112 | Vector v = entityFeatureDict.get(e.getText()); 113 | // double score = sim.measureSimilarity(centroid, v); 114 | // score -= GAMMA * sim.measureSimilarity(negativeCentroid, v); 115 | double vSum = v.sum(); 116 | v = v.divide(vSum); 117 | double score = parameters.innerProduct(v); 118 | //Entity e = new Entity(k, "", -score); 119 | e.setScore(-score); 120 | count++; 121 | 
//System.out.println(count); 122 | if (progressMonitor != null) { 123 | if (progressMonitor.isCanceled()) { 124 | isCanceled = true; 125 | break; 126 | } 127 | progressMonitor.setProgress(count); 128 | } 129 | } 130 | if (progressMonitor != null) { 131 | progressMonitor.setNote("Sorting..."); 132 | } 133 | Collections.sort(entities, new SimilarityComparator()); 134 | if (progressMonitor != null && !isCanceled) { 135 | progressMonitor.setNote("Done."); 136 | progressMonitor.setProgress(progressMonitor.getMaximum()); 137 | } 138 | if (!isCanceled) { 139 | rankedEntities = entities; 140 | } 141 | } 142 | 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/entityset/RankChoiceEntity.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.entityset; 2 | 3 | /** 4 | * Java bean to record user decision on whether an entity belongs to an entity set. 5 | */ 6 | public class RankChoiceEntity extends Entity { 7 | public enum EntityDecision { 8 | YES, NO, UNDECIDED; 9 | } 10 | 11 | private EntityDecision decision; 12 | 13 | public RankChoiceEntity(String text) { 14 | super(text); 15 | decision = EntityDecision.UNDECIDED; 16 | } 17 | 18 | public RankChoiceEntity(String text, String type, double score) { 19 | super(text, type, score); 20 | decision = EntityDecision.UNDECIDED; 21 | } 22 | 23 | public EntityDecision getDecision() { 24 | return decision; 25 | } 26 | 27 | public void setDecision(EntityDecision decision) { 28 | this.decision = decision; 29 | } 30 | 31 | @Override 32 | public String toString() { 33 | return decision == EntityDecision.UNDECIDED ? 
getText() : getText() + " / " + getDecision(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rgrishman/ice/c9ce8c012a9f0b8600e3fb71583f85067172b3bd/src/main/java/edu/nyu/jet/ice/events/.DS_Store -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/EventBuilderThread.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import edu.nyu.jet.ice.relation.Bootstrap; 4 | import edu.nyu.jet.ice.events.SwingEventsPanel; 5 | 6 | /** 7 | * When the 'expand' button om the relation frame is pushed, the SwingEventsPanel 8 | * creates a EventBuilderThread to perform the computations required to 9 | * generate a list of candidate relation patterns (which are then reviewed 10 | * by the user). 11 | * 12 | * Created by yhe on 10/14/14. 
13 | */ 14 | public class EventBuilderThread extends Thread { 15 | 16 | String[] args; 17 | // RelationBuilder builder; 18 | EventBootstrap bootstrap; 19 | String arg1; 20 | String arg2; 21 | EventBuilderFrame frame; 22 | SwingEventsPanel swingEventsPanel; 23 | 24 | public EventBuilderThread (String seed, 25 | String eventInstanceFileName, 26 | String pathListFileName, 27 | /* RelationBuildera*/ Object builder, 28 | EventBootstrap bootstrap, 29 | EventBuilderFrame frame, 30 | SwingEventsPanel swingEventsPanel) { 31 | args = new String[3]; 32 | args[0] = seed; 33 | String[] parts = seed.trim().toLowerCase().split(" "); 34 | if (parts.length > 1) { 35 | arg1 = parts[0].toUpperCase(); 36 | arg2 = parts[parts.length - 1].toUpperCase(); 37 | } 38 | args[1] = eventInstanceFileName; 39 | args[2] = pathListFileName; 40 | // this.builder = builder; 41 | this.bootstrap = bootstrap; 42 | this.frame = frame; 43 | this.swingEventsPanel = swingEventsPanel; 44 | } 45 | 46 | public void run() { 47 | try { 48 | bootstrap.initialize(args[0], args[1]); 49 | frame.updateList(); 50 | frame.setLocationRelativeTo(null); 51 | frame.setVisible(true); 52 | frame.listPane.revalidate(); 53 | frame.listPane.repaint(); 54 | frame.rankedList.revalidate(); 55 | frame.rankedList.repaint(); 56 | } catch (Exception e) { 57 | System.err.println("Exception in Jet.RelationAL.Bootstrap: "); 58 | e.printStackTrace(); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/EventFinder.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import edu.nyu.jet.ice.models.*; 4 | import edu.nyu.jet.ice.uicomps.Ice; 5 | import edu.nyu.jet.ice.utils.ProgressMonitorI; 6 | import edu.nyu.jet.ice.utils.SwingProgressMonitor; 7 | 8 | import javax.swing.*; 9 | import java.io.IOException; 10 | 11 | /** 12 | * Counts all IceTrees in corpus. 
13 | */ 14 | 15 | public class EventFinder extends Thread { 16 | 17 | String[] args; 18 | String types; 19 | JTextArea area; 20 | int numberOfDocs; 21 | ProgressMonitorI eventProgressMonitor = null; 22 | 23 | public EventFinder(String docListFileName, String directory, String filter, 24 | String instances, String types, JTextArea area, int numberOfDocs, 25 | ProgressMonitorI eventProgressMonitor) { 26 | args = new String[4]; 27 | args[0] = "parseprops"; 28 | args[1] = docListFileName; 29 | args[2] = directory; 30 | args[3] = filter; 31 | this.types = types; 32 | this.area = area; 33 | this.numberOfDocs = numberOfDocs; 34 | this.eventProgressMonitor = eventProgressMonitor; 35 | } 36 | 37 | public void run() { 38 | try { 39 | // force monitor to display during long initialization 40 | try { 41 | Thread.sleep(1000); 42 | } catch (InterruptedException ignore) { 43 | } 44 | if (null != eventProgressMonitor) { 45 | eventProgressMonitor.setProgress(2); 46 | } 47 | DepTreeMap depTreeMap = DepTreeMap.getInstance(); 48 | depTreeMap.unpersist(); 49 | DepPaths.progressMonitor = eventProgressMonitor; 50 | DepPaths.main(args); 51 | // Corpus.sort("event-temp", types); // <===== 52 | depTreeMap.loadTrees(true); 53 | // if(area != null) { 54 | // Corpus.displayTerms(types, 40, area, Corpus.eventFilter); 55 | // } 56 | } catch (IOException e) { 57 | System.out.println("IOException in DepTrees " + e); 58 | e.printStackTrace(System.err); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/IceEvent.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import edu.nyu.jet.Logger; 4 | import edu.nyu.jet.LoggerFactory; 5 | import edu.nyu.jet.ice.models.*; 6 | 7 | import java.util.*; 8 | 9 | /** 10 | * An event defined using ICE. It consists of the types of its 11 | * arguments and a set of dependency trees. 
12 | */ 13 | 14 | public class IceEvent { 15 | 16 | static final Logger logger = LoggerFactory.getLogger(IceEvent.class); 17 | 18 | private String name = ""; 19 | 20 | private List reprs = new ArrayList(); 21 | 22 | private List trees = new ArrayList(); 23 | 24 | private List negTrees = new ArrayList(); 25 | 26 | 27 | // ---- property methods ----- 28 | 29 | /** 30 | * Returns the name of the event. 31 | */ 32 | 33 | public String getName() {return name;} 34 | 35 | /** 36 | * Set the name of the event. 37 | */ 38 | 39 | public void setName (String s) {name = s;} 40 | 41 | /** 42 | * Add 'repr' to the set of English phrases representing the relation. 43 | */ 44 | 45 | public void addRepr (String repr) { 46 | if (! reprs.contains(repr)) 47 | reprs.add(repr); 48 | } 49 | 50 | public void removeRepr (String repr) { 51 | reprs.remove(repr); 52 | } 53 | 54 | public List getReprs() { 55 | return reprs; 56 | } 57 | 58 | public void updateTrees() { 59 | DepTreeMap depTreeMap = DepTreeMap.getInstance(); 60 | trees = new ArrayList(); 61 | depTreeMap.loadTrees(); 62 | for (String repr : reprs) { 63 | List trees = depTreeMap.findTree(repr); 64 | addTrees(trees); 65 | } 66 | } 67 | 68 | /** 69 | * Returns the list of trees associated with this event. 70 | */ 71 | 72 | public List getTrees() { 73 | return trees; 74 | } 75 | 76 | public void setTrees (List it) { 77 | trees = it; 78 | } 79 | 80 | public void removeTree (IceTree tree) { 81 | trees.remove(tree); 82 | } 83 | 84 | /** 85 | * Add tree as one of the trees for this event. 
86 | */ 87 | 88 | public void addTrees (List trees) { 89 | if (trees != null) 90 | for (IceTree p : trees) 91 | addTree(p); 92 | } 93 | 94 | public List getNegTrees() { 95 | return negTrees; 96 | } 97 | 98 | public void setNegTrees(List negTrees) { 99 | this.negTrees = negTrees; 100 | } 101 | 102 | public void addNegTree (IceTree negTree) { 103 | negTrees.add(negTree); 104 | } 105 | 106 | // ---- constructors ----- 107 | 108 | public IceEvent (String name) { 109 | this.name = name; 110 | } 111 | 112 | public IceEvent () { 113 | this ("?"); 114 | } 115 | 116 | @Override 117 | public String toString() { 118 | return name; 119 | } 120 | 121 | public String report () { 122 | String r = name + " (event)\n"; 123 | for (IceTree tree : trees) 124 | r += tree + "\n"; 125 | return r; 126 | } 127 | 128 | public void addTree (IceTree it) { 129 | trees.add(it); 130 | logger.info ("Added tree {} to event {}", it, name); 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/IceTreeFactory.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import java.util.*; 4 | 5 | public class IceTreeFactory { 6 | 7 | public static Map iceTrees = new HashMap< String, IceTree>(); 8 | 9 | /** 10 | * performs 4 functions: 11 | * - parses the String 's' into a role-based strucure 12 | * - standardizes rhe argument order with 'sortArgs' 13 | * - generates string repr with normalized role order and spacing 14 | * - creates unique copy with this normalized string 15 | */ 16 | 17 | public static IceTree getIceTree (String s) { 18 | // analyze String into Tree 19 | String[] triggerAndArgs = s.split(" "); 20 | String trigger; 21 | if (triggerAndArgs.length < 2) { 22 | trigger = "?"; 23 | System.out.println("invalid event: " + s); 24 | // Thread.dumpStack(); 25 | return null; 26 | } else { 27 | trigger = triggerAndArgs[0]; 28 | List 
argRoleList = new ArrayList(); 29 | List argValueList = new ArrayList(); 30 | List entityTypeList = new ArrayList(); 31 | List mentionType = new ArrayList(); 32 | 33 | for (int i = 1; i < triggerAndArgs.length; i++) { 34 | String arg = triggerAndArgs[i]; 35 | String role; 36 | String type; 37 | String value; 38 | int colon = arg.indexOf(":"); 39 | if (colon >= 0) { 40 | role = arg.substring(0, colon); 41 | String typeAndValue = arg.substring(colon + 1); 42 | int equal = typeAndValue.indexOf("="); 43 | if (equal >= 0) { 44 | type = typeAndValue.substring(0, equal); 45 | value = typeAndValue.substring(equal + 1); 46 | } else { 47 | type = typeAndValue; 48 | value = null; 49 | } 50 | } else { 51 | int equal = arg.indexOf("="); 52 | if (equal >= 0) { 53 | role = arg.substring(0, equal); 54 | type = null; 55 | value = arg.substring(equal + 1); 56 | } else { 57 | System.out.println("invalid event: " + s); 58 | Thread.dumpStack(); 59 | break; 60 | } 61 | } 62 | argRoleList.add(role); 63 | argValueList.add(value); 64 | entityTypeList.add(type); 65 | mentionType.add(IceTree.MentionType.UNKNOWN); 66 | } 67 | String[] argRole = argRoleList.toArray(new String[0]); 68 | String[] argValue = argValueList.toArray(new String[0]); 69 | String[] entityType = entityTypeList.toArray(new String[0]); 70 | sortArgs (argRole, entityType, argValue); 71 | return IceTreeFactory.getIceTree (trigger, argRole, entityType, argValue); 72 | } 73 | } 74 | 75 | public static IceTree getIceTree (String trigger, String[] argRole, 76 | String[] entityType, String[] argValue) { 77 | sortArgs (argRole, entityType, argValue); 78 | String s = IceTree.core(trigger, argRole, entityType, argValue); 79 | IceTree it = iceTrees.get(s); 80 | if (it == null) { 81 | it = new IceTree(trigger, argRole, entityType, argValue); 82 | iceTrees.put(s, it);; 83 | } 84 | return it; 85 | } 86 | 87 | /** 88 | * Creates a normal form for IceTrees in which the roles are 89 | * stored in lexicographic order. 
90 | */ 91 | public static void sortArgs (String[] argRole, String[] argValue, String[] argType) { 92 | TreeMap valueMap = new TreeMap(); 93 | TreeMap typeMap = new TreeMap(); 94 | for (int i = 0; i < argRole.length; i++) { 95 | valueMap.put(argRole[i], argValue[i]); 96 | typeMap.put(argRole[i], argType[i]); 97 | } 98 | int j = 0; 99 | for (String role : valueMap.keySet() ) { 100 | argRole[j] = role; 101 | argValue[j] = valueMap.get(role); 102 | argType[j] = typeMap.get(role); 103 | j++; 104 | } 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/IceTreeSet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import java.util.*; 4 | import java.io.*; 5 | 6 | public class IceTreeSet { 7 | 8 | List list = new ArrayList(); 9 | int count; 10 | int numTrees = 0; 11 | 12 | IceTreeSet (String fileName, int threshold) { 13 | try { 14 | BufferedReader reader = new BufferedReader (new FileReader (fileName)); 15 | String line; 16 | while ((line = reader.readLine()) != null) { 17 | int j = line.indexOf("\t"); 18 | if (j >= 0) { 19 | count = Integer.parseInt(line.substring(0, j)); 20 | if (count < threshold) 21 | continue;; 22 | line = line.substring(j + 1); 23 | } 24 | IceTree iceTree = IceTreeFactory.getIceTree(line); 25 | iceTree.count = count; 26 | list.add(iceTree); 27 | numTrees++; 28 | } 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } catch (NumberFormatException e) { 32 | e.printStackTrace(); 33 | } 34 | System.out.println("loaded " + numTrees + " entries from " + fileName); 35 | } 36 | 37 | IceTreeSet (String fileName) { 38 | this (fileName, 1); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/PhraseLemmatizer.java: 
-------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.events; 2 | 3 | import java.io.*; 4 | import java.util.*; 5 | import edu.nyu.jet.JetTest; 6 | import edu.nyu.jet.lex.Tokenizer; 7 | import edu.nyu.jet.lex.Stemmer; 8 | import edu.nyu.jet.tipster.*; 9 | import edu.nyu.jet.hmm.HMMTagger; 10 | 11 | public class PhraseLemmatizer { 12 | 13 | public static void main (String[] args) throws IOException { 14 | JetTest.tagger = new HMMTagger(); 15 | JetTest.tagger.load("../jet/data/pos_hmm.txt"); 16 | // reads line 17 | BufferedReader reader = new BufferedReader 18 | (new InputStreamReader (System.in)); 19 | String line; 20 | while ((line = reader.readLine()) != null) { 21 | if (line.startsWith("x")) 22 | return; 23 | System.out.println("result : " + lemmatize(line)); 24 | } 25 | } 26 | 27 | public static String lemmatize (String phrase) { 28 | Stemmer stemmer = Stemmer.getDefaultStemmer(); 29 | Document doc = new Document(phrase); 30 | Span span = doc.fullSpan(); 31 | Tokenizer.tokenize (doc, span); 32 | Vector tokens = doc.annotationsOfType("token"); 33 | StringBuilder sb = new StringBuilder(); 34 | if (JetTest.tagger == null) { 35 | for (int i = 0; i < tokens.size(); i++) { 36 | String word = doc.text(tokens.get(i)).trim(); 37 | String stem = stemmer.getStem (word, "?"); 38 | sb.append(stem); 39 | sb.append(" "); 40 | } 41 | } else { 42 | JetTest.tagger.tagPenn (doc, span); 43 | Vector posvec = doc.annotationsOfType("constit"); 44 | for (int i = 0; i < tokens.size(); i++) { 45 | String word = doc.text(tokens.get(i)).trim(); 46 | String pos = (String) posvec.get(i).get("cat"); 47 | String stem = stemmer.getStem (word, pos); 48 | sb.append(stem); 49 | sb.append(" "); 50 | } 51 | } 52 | return sb.toString().trim(); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/events/icetreeset: 
-------------------------------------------------------------------------------- 1 | public class IceTreeSet implements Iterable { 2 | 3 | List list = new ArrayList(); 4 | int count; 5 | 6 | IceTreeSet (String fileName) { 7 | try { 8 | BufferedReader reader = new BufferedReader (new FileReader (fileName)); 9 | String line; 10 | while ((line = reader.readLine()) != null) { 11 | int j = line.indexOf("\t"); 12 | if (j >= 0) { 13 | count = Integer.parseInt(line.substring(0, j)); 14 | line = line.substring(j + 1); 15 | } 16 | IceTree iceTree = IceTreeFactory.getIceTree(line); 17 | iceTree.count = count; 18 | list.add(iceTree); 19 | } 20 | } catch (IOException e) { 21 | e.printStackTrace(); 22 | } catch (NumberFormatException e) { 23 | e.printStackTrace(); 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/DepPathRegularizer.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import edu.nyu.jet.lex.Stemmer; 4 | import edu.nyu.jet.parser.SyntacticRelation; 5 | 6 | import java.util.HashSet; 7 | import java.util.Set; 8 | 9 | /** 10 | * Regularizer for DepPath: deletes quantity constructs such as 'pound of X' 11 | * from dependency paths. 12 | * 13 | * @author yhe 14 | * @version 1.0 15 | */ 16 | public class DepPathRegularizer { 17 | 18 | /** 19 | * set of quantity words 20 | */ 21 | 22 | public Set quantifiers = new HashSet(); 23 | 24 | private Stemmer stemmer = Stemmer.getDefaultStemmer(); 25 | 26 | { 27 | quantifiers.add("ounce"); 28 | quantifiers.add("gram"); 29 | quantifiers.add("kilogram"); 30 | quantifiers.add("quantity"); 31 | quantifiers.add("kilo"); 32 | quantifiers.add("pound"); 33 | quantifiers.add("amount"); 34 | } 35 | 36 | /** 37 | * Returns dependency path 'p' with quantity phrase 'Q of X' reduced to 'X'. 
38 | */ 39 | 40 | public DepPath regularize(DepPath p) { 41 | DepPath result = p.copy(); 42 | 43 | SyntacticRelation prevRelation = null; 44 | for (SyntacticRelation r : p.getRelations()) { 45 | // prep_of: when using transformation 46 | // prep: when not using transformation 47 | if ((r.type.equals("prep_of") || r.type.equals("prep")) && 48 | quantifiers.contains( 49 | stemmer.getStem(r.sourceWord.trim().toLowerCase(), "NN")) && 50 | prevRelation != null) { 51 | prevRelation.targetPos = r.targetPos; 52 | prevRelation.targetPosn = r.targetPosn; 53 | prevRelation.targetWord = r.targetWord; 54 | } 55 | else { 56 | if (prevRelation != null) { 57 | result.append(prevRelation); 58 | } 59 | if ((r.type.equals("prep_of-1") || r.type.equals("prep-1")) && 60 | quantifiers.contains( 61 | stemmer.getStem(r.targetWord.trim().toLowerCase(), "NN"))) { 62 | prevRelation = null; 63 | } 64 | else { 65 | prevRelation = r; 66 | } 67 | } 68 | } 69 | if (prevRelation != null) { 70 | result.append(prevRelation); 71 | } 72 | return result; 73 | } 74 | 75 | public String regularize(String p) { 76 | String result = p; 77 | for (String w : quantifiers) { 78 | result = result.replaceAll(":" + w + ":prep_of", ""); 79 | } 80 | result = result.replaceAll("rcmod:\\d+:", ""); 81 | if (!p.equals(result)) { 82 | // System.err.println("Before regularization:" + p); 83 | // System.err.println("After regularization:" + result); 84 | } 85 | return result; 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/DepRecord.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import gnu.trove.map.hash.TObjectIntHashMap; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | /** 12 | * Java Bean record of a typed dependency path 13 
| * Fields include: LDP, lexicalized LDP, example, count 14 | * Created by yhe on 11/22/15. 15 | */ 16 | public class DepRecord { 17 | private String ldp; 18 | private String lexicalLdp; 19 | private String example; 20 | private int count; 21 | 22 | public DepRecord(String ldp, String lexicalLdp, String example, int count) { 23 | this.ldp = ldp; 24 | this.lexicalLdp = lexicalLdp; 25 | this.example = example; 26 | this.count = count; 27 | } 28 | 29 | public List loadFromFile(String relationReprFile, TObjectIntHashMap typedRelationCountMap) 30 | throws IOException { 31 | BufferedReader r = new BufferedReader(new FileReader(relationReprFile)); 32 | String line = null; 33 | List depRecords = new ArrayList(); 34 | while ((line = r.readLine()) != null) { 35 | String[] parts = line.split(":::"); 36 | if (parts.length == 3) { 37 | String ldp = parts[0]; 38 | String lexicalLdp = parts[1]; 39 | String example = parts[2]; 40 | int count = typedRelationCountMap.get(ldp); 41 | depRecords.add(new DepRecord(ldp, lexicalLdp, example, count)); 42 | } 43 | } 44 | r.close(); 45 | return depRecords; 46 | } 47 | 48 | public String getLdp() { 49 | return ldp; 50 | } 51 | 52 | public void setLdp(String ldp) { 53 | this.ldp = ldp; 54 | } 55 | 56 | public String getLexicalLdp() { 57 | return lexicalLdp; 58 | } 59 | 60 | public void setLexicalLdp(String lexicalLdp) { 61 | this.lexicalLdp = lexicalLdp; 62 | } 63 | 64 | public String getExample() { 65 | return example; 66 | } 67 | 68 | public void setExample(String example) { 69 | this.example = example; 70 | } 71 | 72 | public int getCount() { 73 | return count; 74 | } 75 | 76 | public void setCount(int count) { 77 | this.count = count; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/IceEntitySet.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import java.util.*; 4 | 5 | 
/**
 * An entity type defined through ICE, consisting of a set of names
 * and a set of nouns. (At present, only the nouns are used.)
 */

public class IceEntitySet {

    String type;

    List<String> nouns;

    List<String> names;

    // ----- constructors -----

    /** Creates an entity set of type 's' with empty noun and name lists. */
    public IceEntitySet (String s) {
        this.type = s;
        this.nouns = new ArrayList<String>();
        this.names = new ArrayList<String>();
    }

    /** Creates an entity set of type "?". */
    public IceEntitySet () {
        this("?");
    }

    // ----- property methods -----

    public String getType () {
        return this.type;
    }

    public void setType (String tp) {
        this.type = tp;
    }

    public List<String> getNouns () {
        return this.nouns;
    }

    public void setNouns (List<String> ls) {
        this.nouns = ls;
    }

    /** Adds 'noun' unless the noun list already contains it. */
    public void addNoun (String noun) {
        if (this.nouns.contains(noun)) {
            return;
        }
        this.nouns.add(noun);
    }

    public void removeNoun (String noun) {
        this.nouns.remove(noun);
    }

    public List<String> getNames () {
        return this.names;
    }

    public void setNames (List<String> ls) {
        this.names = ls;
    }

    /** Adds 'name' unless the name list already contains it. */
    public void addName (String name) {
        if (this.names.contains(name)) {
            return;
        }
        this.names.add(name);
    }

    public void removeName (String name) {
        this.names.remove(name);
    }

    /** Returns the type of this entity set. */
    public String toString() {
        return this.type;
    }
}
12 | * These are presented to the user, who can label each as being or not being an 13 | * instance of the curent relation. Each IcePath includes a lexicalized 14 | * dependency path ('path'), an English phrase for that path ('repr'), a full-sentence 15 | * example of that path ('example'), and a score. The path includes arguments. 16 | * 17 | * @author yhe 18 | * @version 1.0 19 | */ 20 | 21 | public class IcePath implements Comparable { 22 | 23 | static final Logger logger = LoggerFactory.getLogger(IcePath.class); 24 | 25 | public enum IcePathChoice { 26 | NO, YES, UNDECIDED 27 | } 28 | private String path; 29 | private String repr; 30 | private String example; 31 | private double score; 32 | public TObjectDoubleHashMap subScores; 33 | private IcePathChoice choice; 34 | 35 | public IcePath() { 36 | this.path = "?"; 37 | } 38 | 39 | public IcePath (String path) { 40 | this.path = path; 41 | } 42 | 43 | public IcePath(String path, String repr, String example, double score, IcePathChoice choice) { 44 | this.path = path; 45 | this.repr = repr; 46 | this.example = example; 47 | this.score = score; 48 | this.choice = choice; 49 | logger.debug ("created IcePath with path {}", path); 50 | } 51 | 52 | public IcePath(String path, String repr, String example, double score) { 53 | this.path = path; 54 | this.repr = repr; 55 | this.example = example; 56 | this.score = score; 57 | this.choice = IcePathChoice.UNDECIDED; 58 | logger.debug ("created IcePath with path {}", path); 59 | } 60 | 61 | public IcePath(String path, String repr, String example, double score, TObjectDoubleHashMap subScores) { 62 | this.path = path; 63 | this.repr = repr; 64 | this.example = example; 65 | this.score = score; 66 | this.choice = IcePathChoice.UNDECIDED; 67 | this.subScores = subScores; 68 | logger.debug ("created IcePath with path {}", path); 69 | } 70 | 71 | public String getPath() { 72 | return path; 73 | } 74 | 75 | public String getPathString() { 76 | return path; 77 | } 78 | 79 | public 
String getBarePath () { 80 | String[] pp = getPathString().split("--"); 81 | if (pp.length != 3) { 82 | logger.error ("Attempting to add invalid path {} to relation", path); 83 | return null; 84 | } 85 | return pp[1].trim(); 86 | } 87 | 88 | /* 89 | public IcePath getFullPath (String arg1, String arg2) { 90 | if (path.indexOf(" -- ") >= 0) { 91 | return this; 92 | } else { 93 | String fullPath = arg1 + " -- " + path + " -- " + arg2; 94 | IcePath fullIp = IcePathFactory.getIcePath(fullPath); 95 | return fullIp; 96 | } 97 | } 98 | */ 99 | public void setPath(String path) { 100 | this.path = path; 101 | } 102 | 103 | public String getRepr() { 104 | if (repr == null) 105 | return "nullRepr"; 106 | else 107 | return repr; 108 | } 109 | 110 | public void setRepr(String repr) { 111 | this.repr = repr; 112 | } 113 | 114 | public String getExample() { 115 | return example; 116 | } 117 | 118 | public void setExample(String example) { 119 | this.example = example; 120 | } 121 | 122 | public double getScore() { 123 | return score; 124 | } 125 | 126 | public void setScore(double score) { 127 | this.score = score; 128 | } 129 | 130 | public IcePathChoice getChoice() { 131 | return choice; 132 | } 133 | 134 | public void setChoice(IcePathChoice choice) { 135 | this.choice = choice; 136 | } 137 | 138 | public int compareTo(IcePath icePath) { 139 | if (this.score < icePath.score) return 1; 140 | if (this.score > icePath.score) return -1; 141 | return 0; 142 | } 143 | 144 | @Override 145 | public boolean equals (Object other) { 146 | boolean result = false; 147 | if (other instanceof IcePath) { 148 | IcePath icePath = (IcePath) other; 149 | result = this.getPathString().equals(icePath.getPathString()); 150 | } 151 | return result; 152 | } 153 | public String toString() { 154 | return path + " ["+repr+"] "; 155 | } 156 | 157 | public IcePath(AnchoredPath ap) { 158 | path = ap.toString(); 159 | } 160 | 161 | /** 162 | * Returns the embedding of the IcePath, which is the 163 | * 
embedding of its English-like phrase (repr). 164 | */ 165 | 166 | public double[] embed () { 167 | return WordEmbedding.embed(getRepr().split(" ")); 168 | } 169 | 170 | /** 171 | * Returns the embedding of a list of IcePaths, 172 | * which is the component-by-component sum of 173 | * the embeddings of the constituent IcePaths. 174 | */ 175 | 176 | static public double[] embed (List paths) { 177 | int dim = WordEmbedding.getDim(); 178 | for (IcePath ip : paths) 179 | ip.embed(); 180 | double[] result = new double[dim]; 181 | for (int j=0; j < dim; j++) { 182 | result[j] = paths.get(0).embed()[j]; 183 | } 184 | for (int i=1; i < paths.size(); i++) { 185 | for (int j=0; j < dim; j++) { 186 | result[j] += paths.get(i).embed()[j]; 187 | } 188 | } 189 | return result; 190 | } 191 | 192 | } 193 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/IcePathFactory.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * Generates instances of IcePath so that there is a unique instance for each 7 | * value of the path string. 8 | */ 9 | 10 | public class IcePathFactory { 11 | 12 | public static Map icePaths = new HashMap< String, IcePath>(); 13 | 14 | public static IcePath getIcePath (String s) { 15 | if (icePaths.get(s) == null) { 16 | icePaths.put(s, new IcePath(s)); 17 | } 18 | return icePaths.get(s); 19 | // System.out.println("creating IcePath " + s); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/IceRelation.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import edu.nyu.jet.Logger; 4 | import edu.nyu.jet.LoggerFactory; 5 | 6 | import java.util.*; 7 | 8 | /** 9 | * a relation defined using ICE. 
It consists of the types of its 10 | * arguments and a set of dependency paths. The argumenta may 11 | * include subscripts; if the subscripts have been removed, it 12 | * is a 'bare path'; if the arguments have been instantiated, it is 13 | * an anchored path (see the AnchoredPath class).` 14 | */ 15 | 16 | public class IceRelation { 17 | 18 | static final Logger logger = LoggerFactory.getLogger(IceRelation.class); 19 | 20 | private String name = ""; 21 | 22 | private String arg1type = ""; 23 | 24 | private String arg2type = ""; 25 | 26 | private List icePaths = new ArrayList(); 27 | 28 | private List negPaths = new ArrayList(); 29 | 30 | // ---- property methods ----- 31 | 32 | /** 33 | * Returns the name of the relation. 34 | */ 35 | 36 | public String getName() {return name;} 37 | 38 | /** 39 | * Set the name of the relation. 40 | */ 41 | 42 | public void setName (String s) {name = s;} 43 | 44 | public List getPaths () { 45 | return icePaths; 46 | } 47 | 48 | public void setPaths (List ip) { 49 | icePaths = ip; 50 | } 51 | 52 | /* previously called by SwingRelationsPanel 53 | 54 | public void updatePaths() { 55 | DepPathMap.load(); 56 | paths.clear(); 57 | for (String repr : reprs) { 58 | List paths = depPathMap.findPath(repr); 59 | addPaths(paths); 60 | } 61 | } 62 | */ 63 | 64 | /** 65 | * Add path as one of the paths for this relation. 66 | * path must be a full path, including relation arguments. 67 | */ 68 | 69 | public void addPath (IcePath ip) { 70 | icePaths.add(ip); 71 | logger.info ("Added path {} to relation {}", ip, name); 72 | } 73 | 74 | public void addPaths (List sip) { 75 | for (IcePath ip : sip) 76 | addPath (ip); 77 | } 78 | 79 | /** 80 | * Add path as one of the negative paths for this relation. 81 | * path must be a full path, including relation arguments. 
82 | */ 83 | 84 | public void addNegPath (IcePath p) { 85 | // checks for null as a precaution - shoud not occur 86 | if (negPaths == null) negPaths = new ArrayList(); 87 | negPaths.add(p); 88 | logger.info ("Added negated path {} to relation {}", p, name); 89 | } 90 | 91 | public boolean rejected (IcePath ip) { 92 | return negPaths.contains(ip); 93 | } 94 | 95 | /** 96 | * Deletes from the relation all paths with a given repr 97 | * (i.e., which are rendered to the user using the same text. 98 | */ 99 | 100 | public void deletePaths (String repr) { 101 | List remaining = new ArrayList(); 102 | for (IcePath ip : icePaths) 103 | if (!ip.getRepr().equals(repr)) 104 | remaining.add(ip); 105 | icePaths = remaining; 106 | } 107 | 108 | public String getArg1type() {return arg1type;} 109 | 110 | public void setArg1type (String s) {arg1type = s;} 111 | 112 | public String getArg2type() {return arg2type;} 113 | 114 | public void setArg2type (String s) {arg2type = s;} 115 | 116 | public List getNegPaths() { 117 | return negPaths; 118 | } 119 | 120 | public void setNegPaths(List np) { 121 | negPaths = np; 122 | } 123 | 124 | // ---- constructors ----- 125 | 126 | public IceRelation (String name) { 127 | this.name = name; 128 | } 129 | 130 | public IceRelation () { 131 | this ("?"); 132 | } 133 | 134 | private static boolean validPath (String s) { 135 | return (s.split("--").length == 3); 136 | } 137 | 138 | /** 139 | * Returns true if 'path' specifies an inverted relation: if the 140 | * argument which comes first in text order is the second argument 141 | * to the relation. 
142 | */ 143 | 144 | public boolean isInverted (IcePath path) { 145 | String[] pp = path.getPathString().split("--"); 146 | if (pp.length != 3) { 147 | logger.error ("Attempting to add invalid path {} to relation", path); 148 | return false; 149 | } 150 | String first = pp[0].trim(); 151 | boolean inverted = first.endsWith("(2)") || first.equals(arg2type); 152 | return inverted; 153 | } 154 | 155 | @Override 156 | public String toString() { 157 | return name; 158 | } 159 | 160 | public String report () { 161 | String r = name + "(" + arg1type + ", " + arg2type + ")\n"; 162 | for (IcePath p : icePaths) 163 | r += p.toString() + "\n"; 164 | return r; 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/MatcherNode.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | /** 4 | * A matcher node is a node on the dependency tree. It contains a dependency label and a word governed by the label. 5 | * 6 | * @author yhe 7 | * @version 1.0 8 | */ 9 | public class MatcherNode { 10 | String label; 11 | String token; 12 | 13 | public MatcherNode(String label, String token) { 14 | this.label = label; 15 | this.token = token; 16 | } 17 | 18 | @Override 19 | public boolean equals(Object o) { 20 | if (this == o) return true; 21 | if (o == null || getClass() != o.getClass()) return false; 22 | 23 | MatcherNode that = (MatcherNode) o; 24 | 25 | if (label != null ? !label.equals(that.label) : that.label != null) return false; 26 | if (token != null ? !token.equals(that.token) : that.token != null) return false; 27 | 28 | return true; 29 | } 30 | 31 | @Override 32 | public int hashCode() { 33 | int result = label != null ? label.hashCode() : 0; 34 | result = 31 * result + (token != null ? 
token.hashCode() : 0); 35 | return result; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/MatcherPath.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import edu.nyu.jet.aceJet.AnchoredPath; 4 | import edu.nyu.jet.lex.Stemmer; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * MatcherPath is a dependency path to be matched by the PathMatcher. It is an ordered list of MatcherNodes. 11 | * 12 | * @author yhe 13 | * @version 1.0 14 | */ 15 | public class MatcherPath { 16 | List nodes = new ArrayList(); 17 | String arg1Type = "UNK"; 18 | String arg2Type = "UNK"; 19 | String relationType = "NONE"; 20 | Stemmer stemmer = Stemmer.getDefaultStemmer(); 21 | 22 | public MatcherPath(String pathString) { 23 | nodes.clear(); 24 | String[] parts = pathString.split("--"); 25 | if (parts.length == 3) { 26 | arg1Type = parts[0].trim(); 27 | arg2Type = parts[2].trim(); 28 | parts = parts[1].split(":"); 29 | for (int i = 0; i < (parts.length - 1) / 2; i++) { 30 | MatcherNode node = new MatcherNode(parts[2*i], stemmer.getStem(parts[2*i + 1], 31 | "UNK")); 32 | nodes.add(node); 33 | } 34 | MatcherNode node = new MatcherNode(parts[parts.length - 1], "SYS_PATH_END"); 35 | nodes.add(node); 36 | } 37 | } 38 | 39 | public MatcherPath(AnchoredPath path) { 40 | nodes.clear(); 41 | String pathString = path.toString(); 42 | String[] parts = pathString.split("--"); 43 | if (parts.length == 3) { 44 | arg1Type = parts[0].trim(); 45 | arg2Type = parts[2].trim(); 46 | parts = parts[1].split(":"); 47 | for (int i = 0; i < (parts.length - 1) / 2; i++) { 48 | MatcherNode node = new MatcherNode(parts[2*i], stemmer.getStem(parts[2*i + 1], 49 | "UNK")); 50 | nodes.add(node); 51 | } 52 | MatcherNode node = new MatcherNode(parts[parts.length - 1], "SYS_PATH_END"); 53 | nodes.add(node); 54 | } 55 | } 56 | 57 
| public void setRelationType(String relationType) { 58 | this.relationType = relationType; 59 | } 60 | 61 | public String getRelationType() { 62 | return relationType; 63 | } 64 | 65 | public boolean isEmpty() { 66 | return nodes.isEmpty(); 67 | } 68 | 69 | public int length() { 70 | return nodes.size(); 71 | } 72 | 73 | @Override 74 | public String toString() { 75 | if (nodes.size() == 0) { 76 | return arg1Type + "-- --" + arg2Type; 77 | } 78 | StringBuilder sb = new StringBuilder(); 79 | sb.append(arg1Type).append("--"); 80 | for (int i = 0; i < nodes.size() - 1; i++) { 81 | sb.append(nodes.get(i).label).append(":"); 82 | sb.append(nodes.get(i).token).append(":"); 83 | } 84 | sb.append(nodes.get(nodes.size()-1).label); 85 | sb.append("--").append(arg2Type); 86 | return sb.toString(); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/PathMatcher.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import gnu.trove.map.hash.TObjectDoubleHashMap; 4 | 5 | import java.io.BufferedReader; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | /** 12 | * PathMatcher is a Edit-Distance-based matcher that produces an alignment and an alignment score between 13 | * two MatcherPaths using the generalized Levenshtein algorithm. It can optionally use word embeddings to 14 | * compute the substitution cost, if embeddings is set. 
15 | */ 16 | public class PathMatcher { 17 | 18 | private TObjectDoubleHashMap weights = new TObjectDoubleHashMap(); 19 | private TObjectDoubleHashMap labelWeights = new TObjectDoubleHashMap(); 20 | 21 | private static final double LABEL_MISMATCH_PENALTY = 2.5; 22 | private Map embeddings = null; 23 | 24 | public PathMatcher() { 25 | weights.put("replace", 0.5); 26 | weights.put("insert", 0.25); 27 | weights.put("delete", 1.0); 28 | labelWeights.put("nsubj-1", 1.5); 29 | labelWeights.put("dobj-1", 1.5); 30 | labelWeights.put("nsubj", 1.0); 31 | labelWeights.put("dobj", 0.5); 32 | labelWeights.put("preps_of", 0.2); 33 | labelWeights.put("preps_with", 0.2); 34 | } 35 | 36 | public void setEmbeddings(Map embeddings) { 37 | this.embeddings = embeddings; 38 | } 39 | 40 | public void updateCost(double replace, double insert, double delete) { 41 | weights.put("replace", replace); 42 | weights.put("insert", insert); 43 | weights.put("delete", delete); 44 | } 45 | 46 | public double matchPaths(String path1, String path2) { 47 | MatcherPath matcherPath1 = new MatcherPath(path1); 48 | MatcherPath matcherPath2 = new MatcherPath(path2); 49 | 50 | return matchPaths(matcherPath1, matcherPath2); 51 | } 52 | 53 | public double matchPaths(MatcherPath matcherPath1, MatcherPath matcherPath2) { 54 | int len1 = matcherPath1.nodes.size(); 55 | int len2 = matcherPath2.nodes.size(); 56 | if (len1 == 1 && len2 == 1) { 57 | return matcherPath1.nodes.get(0).label.equals(matcherPath2.nodes.get(0).label) 58 | && matcherPath1.arg1Type.equals(matcherPath2.arg1Type) 59 | && matcherPath1.arg2Type.equals(matcherPath2.arg2Type) ? 
60 | 0 : 1; 61 | } 62 | 63 | double[][] dp = new double[len1 + 1][len2 + 1]; 64 | 65 | for (int i = 0; i <= len1; i++) { 66 | dp[i][0] = i; 67 | } 68 | 69 | for (int j = 0; j <= len2; j++) { 70 | dp[0][j] = j; 71 | } 72 | 73 | //iterate though, and check last char 74 | for (int i = 0; i < len1; i++) { 75 | MatcherNode c1 = matcherPath1.nodes.get(i); 76 | for (int j = 0; j < len2; j++) { 77 | MatcherNode c2 = matcherPath2.nodes.get(j); 78 | 79 | //if last two chars equal 80 | if (c1.equals(c2)) { 81 | //update dp value for +1 length 82 | dp[i + 1][j + 1] = dp[i][j]; 83 | } else { 84 | double labelWeight = labelWeights.containsKey(c2.label) ? 85 | labelWeights.get(c2.label) : 1; 86 | double insertLabelWeight = labelWeights.containsKey(c1.label) ? 87 | labelWeights.get(c1.label) : 1; 88 | double replacePenalty = c1.label.equals(c2.label) ? 89 | 1 : LABEL_MISMATCH_PENALTY; 90 | double replaceCost = 1 - WordEmbedding.similarity(c1.token, c2.token); 91 | // if (c1.token.equals("distribute") || c2.token.equals("distribute")) { 92 | // System.err.println("[LOG] " + c1.token + " " + c2.token + " " + replaceCost); 93 | // } 94 | double replace = dp[i][j] + weights.get("replace") * replacePenalty 95 | * replaceCost 96 | * labelWeight; 97 | double insert = dp[i][j + 1] + weights.get("insert") * insertLabelWeight; 98 | double delete = dp[i + 1][j] + weights.get("delete") * labelWeight; 99 | 100 | double min = replace > insert ? insert : replace; 101 | min = delete > min ? min : delete; 102 | dp[i + 1][j + 1] = min; 103 | } 104 | } 105 | } 106 | 107 | return matcherPath1.arg1Type.equals(matcherPath2.arg1Type) && 108 | matcherPath1.arg2Type.equals(matcherPath2.arg2Type) ? 
109 | dp[len1][len2] : Math.max(matcherPath1.length(), matcherPath2.length()); 110 | } 111 | 112 | } 113 | 114 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/models/RelationFinder.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.models; 2 | 3 | import edu.nyu.jet.ice.uicomps.Ice; 4 | import edu.nyu.jet.ice.utils.ProgressMonitorI; 5 | import edu.nyu.jet.ice.utils.SwingProgressMonitor; 6 | 7 | import javax.swing.*; 8 | import java.io.IOException; 9 | 10 | /** 11 | * A Thread for running DepPaths: counts all dependency paths in corpus. 12 | */ 13 | public class RelationFinder extends Thread { 14 | 15 | String[] args; 16 | String types; 17 | JTextArea area; 18 | int numberOfDocs; 19 | ProgressMonitorI relationProgressMonitor = null; 20 | 21 | public RelationFinder(String docListFileName, String directory, String filter, 22 | String instances, String types, JTextArea area, int numberOfDocs, 23 | ProgressMonitorI relationProgressMonitor) { 24 | args = new String[4]; 25 | args[0] = "parseprops"; 26 | args[1] = docListFileName; 27 | args[2] = directory; 28 | args[3] = filter; 29 | this.types = types; 30 | this.area = area; 31 | this.numberOfDocs = numberOfDocs; 32 | this.relationProgressMonitor = relationProgressMonitor; 33 | } 34 | 35 | public void run() { 36 | try { 37 | // force monitor to display during long initialization 38 | try { 39 | Thread.sleep(1000); 40 | } catch (InterruptedException ignore) { 41 | } 42 | if (null != relationProgressMonitor) { 43 | relationProgressMonitor.setProgress(2); 44 | } 45 | DepPathMap depPathMap = DepPathMap.getInstance(); 46 | depPathMap.unpersist(); 47 | DepPaths.progressMonitor = relationProgressMonitor; 48 | System.out.println("$$$ types = " + types); 49 | DepPaths.main(args); 50 | String sortedTypes = types + ".sorted"; 51 | Corpus.sort(types, sortedTypes); 52 | System.out.println("$$$ types = " + 
types); 53 | depPathMap.loadPaths(true); 54 | if(area != null) { 55 | Corpus.displayTerms(types, 40, area, Corpus.relationFilter); 56 | } 57 | } catch (IOException e) { 58 | System.out.println("IOException in DepPaths " + e); 59 | e.printStackTrace(System.err); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 |

Principal Classes of ICE

3 | 4 |

Corpus analysis

5 | 6 | The corpus analysis performed by ICE can be divided into two types: 7 | analysis which is domain-independent and need only be done once, 8 | and analysis which is domain-dependent and may be done repeatedly 9 | as part of a bootstrapping process. 10 |

11 | The domain-independent processing includes 12 |

    13 |
  • part-of-speech tagging
  • 14 |
  • dependency parsing
  • 15 |
  • coreference analysis
  • 16 |
  • name tagging with respect to a set of generic name models 17 | (for people, places, and organizations)
  • 18 |
  • numeric and time expressions
  • 19 |
20 | All of this analysis is performed as part of preprocessing 21 | by the IcePreprocessor class and stored in the 22 | cache directory, which has one subdirectory for 23 | each corpus being preprocessed by ICE. 24 |

25 | Note that this preprocessing could be made more accurate 26 | by making use of domain-specific information, but we do 27 | not do so at this time. 28 |

29 | The domain-specific processing involves finding in 30 | the corpus all dependency paths which connect two 31 | entities (words which are members of an entity set). A 32 | relation is defined as a set of dependency paths, so 33 | this process collects the candidate paths to be used 34 | in relation bootstrapping. As the entity sets grow during IE 35 | customization, this set of candidate paths also grows and so needs 36 | to be recomputed. This analysis is performed by the 37 | RelationFinder class, which invokes DepPaths. 38 | To speed processing, DepPaths makes use of the 39 | information saved in the cache by preprocessing. 40 |

41 | One additional step of corpus analysis involves the computation 42 | of term context vectors, which record the dependency contexts of 43 | each term in the corpus. This information, which is used to guide 44 | the creation of entity sets, is computed by class EntitySetIndexer. 45 | 46 |

Dependency Paths

47 | 48 |

Representation

49 | 50 | An ICE relation (class IceRelation) is specified in terms of 51 | the types of its arguments (entity sets) and a set of lexicalized 52 | dependency paths (LDPs). An LDP specifies a particular sequence of 53 | words and dependency relations. For communicating with the user we 54 | want to accept and generate English phrases. Methods in class 55 | DepPath perform the generation of phrases; the 56 | correspondence between the internal representation, the phrase, 57 | and a complete sentence with an example of this path is 58 | captured in instances of class IcePath. 59 |

60 | We are currently experimenting in Jet with set-generalized 61 | LDPs, where the words are constrained to be members of a set rather 62 | than taking on single values. 63 | 64 |

Matching

65 | 66 | Exact match of two LDPs can be done by simple string match. 67 | To determine whether a document has an instance of an LDP, we 68 | can generate all the LDPs from a document and see if any one 69 | matches. 70 |

71 | For better recall we may want to allow approximate (soft) matching. 72 | Class PathMatcher provides edit-distance-based matching 73 | between two LDPs. 74 | 75 |

Exporting

76 | 77 | After some entity sets and relations have been defined using Ice, 78 | class JetEngineBuilder is used to write these out in 79 | a format which is accepted by Jet. It is represented in Jet 80 | using classes AnchoredPath and AnchoredPathSet. 81 | 82 |

Bootstrapping

83 | 84 | The bootstrapping of relations is managed by class Bootstrap. 85 | The basic process starts with a seed provided by the user and ranks 86 | the candidate paths with respect to this seed using an elaborate 87 | combination of scores. 88 |

89 | To reduce the manual input required when conducting repeated evaluations 90 | for the same relation, class RelationOracle captures the user's 91 | classifications on the initial run and generates automatic responses 92 | to the same queries on subsequent runs. 93 | */ 94 | package edu.nyu.jet.ice; 95 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/relation/PathRelationExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.ice.models.MatcherPath; 4 | import edu.nyu.jet.ice.models.PathMatcher; 5 | import edu.nyu.jet.ice.models.WordEmbedding; 6 | import opennlp.model.Event; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * Tag relations in a document by using positive and negative dependency path rules created with ICE 16 | * 17 | * @author yhe 18 | */ 19 | public class PathRelationExtractor { 20 | 21 | public static double minThreshold = 0.5; 22 | 23 | public void setNegDiscount(double nDiscount) { 24 | negDiscount = nDiscount; 25 | } 26 | 27 | public static double negDiscount = 0.8; 28 | 29 | private PathMatcher pathMatcher = new PathMatcher(); 30 | 31 | private static List ruleTable = new ArrayList(); 32 | 33 | private static List negTable = new ArrayList(); 34 | 35 | public void updateCost(double replace, double insert, double delete) { 36 | pathMatcher.updateCost(replace, insert, delete); 37 | } 38 | 39 | public static void loadModelForSoftMatch (String rulesFile) throws IOException { 40 | BufferedReader br = new BufferedReader(new FileReader(rulesFile)); 41 | String line = null; 42 | while ((line = br.readLine()) != null) { 43 | System.out.println("Loading rule " + line); 44 | String[] parts = line.split(" = "); 45 | MatcherPath path = new 
MatcherPath(parts[0]); 46 | if (parts[0].contains("EMPTY")) { 47 | continue; 48 | } 49 | if (!path.isEmpty()) { 50 | path.setRelationType(parts[1]); 51 | } 52 | ruleTable.add(path); 53 | } 54 | } 55 | 56 | public void loadNeg(String negRulesFile) throws IOException { 57 | BufferedReader br = new BufferedReader(new FileReader(negRulesFile)); 58 | String line = null; 59 | while ((line = br.readLine()) != null) { 60 | String[] parts = line.split(" = "); 61 | MatcherPath path = new MatcherPath(parts[0]); 62 | if (parts[0].contains("EMPTY")) { 63 | continue; 64 | } 65 | if (!path.isEmpty()) { 66 | path.setRelationType(parts[1]); 67 | } 68 | negTable.add(path); 69 | } 70 | } 71 | 72 | public void loadEmbeddings(String embeddingFile) throws IOException { 73 | WordEmbedding.loadWordEmbedding(embeddingFile); 74 | } 75 | 76 | /** 77 | * Predict the relation type of an Event. The context[] array of the Event 78 | * should have the format [dependency path, arg1 type, arg2 type] 79 | * @param e An OpenNLP context[]:label pair 80 | * @return 81 | */ 82 | public String predict(Event e) { 83 | String[] context = e.getContext(); 84 | String depPath = context[0]; 85 | String arg1Type = context[1]; 86 | String arg2Type = context[2]; 87 | String fullDepPath = arg1Type + "--" + depPath + "--" + arg2Type; 88 | MatcherPath matcherPath = new MatcherPath(fullDepPath); 89 | double minScore = 1; 90 | double minNegScore = 1; 91 | MatcherPath minRule = null; 92 | for (MatcherPath rule : ruleTable) { 93 | double score = pathMatcher.matchPaths(matcherPath, rule)/ 94 | rule.length(); 95 | if (arg1Type.equals("PERSON") && arg2Type.equals("DRUGS")) { 96 | // System.err.println("\tScore:"+ score); 97 | // System.err.println("\tRule:" + rule); 98 | // System.err.println("\tCurrent:" + matcherPath); 99 | //System.err.println("Gold:" + e.getOutcome() 100 | // + "\tPredicted:" + rule.getRelationType()); 101 | } 102 | if (score < minScore) { 103 | minScore = score; 104 | minRule = rule; 105 | } 106 | } 
107 | MatcherPath minNegRule = null; 108 | if (minScore < minThreshold) { 109 | 110 | for (MatcherPath rule : negTable) { 111 | if (!rule.getRelationType().equals(minRule.getRelationType())) { 112 | continue; 113 | } 114 | double score = pathMatcher.matchPaths(matcherPath, rule) / rule.length(); 115 | if (score < minNegScore) { 116 | minNegScore = score; 117 | minNegRule = rule; 118 | } 119 | } 120 | } 121 | else { 122 | return null; 123 | } 124 | 125 | 126 | if (minScore < minThreshold && minScore < minNegScore* negDiscount) { 127 | System.err.println("Score:"+ minScore); 128 | System.err.println("Rule:" + minRule); 129 | System.err.println("Current:" + matcherPath); 130 | System.err.println("Gold:" + e.getOutcome() 131 | + "\tPredicted:" + minRule.getRelationType()); 132 | 133 | return minRule.getRelationType(); 134 | } 135 | if (minScore > minNegScore* negDiscount) { 136 | System.err.println("[REJ] Score:"+ minScore); 137 | System.err.println("[REJ] Neg Score:"+ minNegScore* negDiscount); 138 | System.err.println("[REJ] Rule:" + minRule); 139 | System.err.println("[REJ] Neg Rule:" + minNegRule); 140 | System.err.println("[REJ] Current:" + matcherPath); 141 | System.err.println("[REJ] Gold:" + e.getOutcome() 142 | + "\tPredicted:" + minRule.getRelationType()); 143 | } 144 | return null; 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/terminology/Term.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.terminology; 2 | 3 | /** 4 | * Summarizes information related to a term/entity, also supports sorting 5 | * 6 | * @author yhe 7 | * @version 1.0 8 | */ 9 | public class Term implements Comparable{ 10 | private String text; 11 | private int positiveDocFreq; 12 | private int positiveFreq; 13 | private int negativeDocFreq; 14 | private int negativeFreq; 15 | private double score; 16 | private int[] rawFreq; 17 | 18 | 
public Term(String text, int positiveDocFreq, int positiveFreq, int negativeDocFreq, int negativeFreq) { 19 | this.text = text; 20 | this.positiveDocFreq = positiveDocFreq; 21 | this.positiveFreq = positiveFreq; 22 | this.negativeDocFreq = negativeDocFreq; 23 | this.negativeFreq = negativeFreq; 24 | } 25 | 26 | public String getText() { 27 | return text; 28 | } 29 | 30 | public void setText(String text) { 31 | this.text = text; 32 | } 33 | 34 | public int getPositiveDocFreq() { 35 | return positiveDocFreq; 36 | } 37 | 38 | 39 | public int getPositiveFreq() { 40 | return positiveFreq; 41 | } 42 | 43 | 44 | public int getNegativeDocFreq() { 45 | return negativeDocFreq; 46 | } 47 | 48 | 49 | public int getNegativeFreq() { 50 | return negativeFreq; 51 | } 52 | 53 | public double getScore() { 54 | return score; 55 | } 56 | 57 | public void setScore(double score) { 58 | this.score = score; 59 | } 60 | 61 | public int[] getRawFreq() { 62 | return rawFreq; 63 | } 64 | 65 | public void setRawFreq(int[] rawFreq) { 66 | this.rawFreq = rawFreq; 67 | } 68 | 69 | public int compareTo(Term term) { 70 | if (this.score - term.score < 0) return -1; 71 | if (this.score - term.score > 0) return 1; 72 | return 0; 73 | } 74 | 75 | @Override 76 | public String toString() { 77 | return String.format("%.2f\t%s", score, text); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/terminology/TermRanker.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.terminology; 2 | 3 | import edu.nyu.jet.ice.uicomps.Ice; 4 | import edu.nyu.jet.ice.utils.IceUtils; 5 | 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.io.PrintWriter; 9 | import java.util.*; 10 | 11 | /** 12 | * Ranker for extracted terms. 
13 | * 14 | * @author yhe 15 | * @version 1.0 16 | */ 17 | public class TermRanker { 18 | private List terms; 19 | 20 | /** 21 | * Rank multi-word terms using the following score:
22 | * 23 | * Score = POSITIVE_FREQ * log(POSITIVE_FREQ) ^ alpha / NEGATIVE_FREQ
24 | * 25 | * where POSITIVE_FREQ is the document frequency in the foreground corpus 26 | * and NEGATIVE_FREQ is the document frequency in the background corpus. 27 | * alpha can be set in iceprops with the Ice.TermRanker.alpha property 28 | * 29 | * @param foregroundCountFile Name of the word count file for the foreground corpus 30 | * @param backgroundCountFile Name of the word count file for the background corpus 31 | * @throws IOException 32 | */ 33 | public TermRanker(String foregroundCountFile, String backgroundCountFile) 34 | throws IOException { 35 | String[] positiveWords = IceUtils.readLines(foregroundCountFile); 36 | String[] negativeWords = IceUtils.readLines(backgroundCountFile); 37 | //Map foregroundWordFreq = new HashMap(); 38 | Map foregroundDocFreq = new HashMap(); 39 | //Map backgroundWordFreq = new HashMap(); 40 | Map backgroundDocFreq = new HashMap(); 41 | int i = 0; 42 | for (String w : positiveWords) { 43 | if (i < 3) { 44 | i++; 45 | continue; 46 | } 47 | String[] parts = w.split("\\t"); 48 | if (parts[0].equals("Contact/nn") || 49 | parts[0].equals("today/nn") || 50 | parts[0].equals("yesterday/nn")) { 51 | continue; 52 | } 53 | foregroundDocFreq.put(parts[0], parts.length - 1); 54 | } 55 | i = 0; 56 | for (String w : negativeWords) { 57 | if (i < 3) { 58 | i++; 59 | continue; 60 | } 61 | String[] parts = w.split("\\t"); 62 | backgroundDocFreq.put(parts[0], parts.length - 1); 63 | i++; 64 | } 65 | terms = new ArrayList(); 66 | double pow = 1.0; 67 | try { 68 | pow = Double.valueOf(Ice.iceProperties.getProperty("Ice.TermRanker.alpha")); 69 | System.err.println("Trying to use alpha: " + pow); 70 | } catch (Exception e) { 71 | //e.printStackTrace(); 72 | } 73 | for (String w : foregroundDocFreq.keySet()) { 74 | // int negativeWordCount = backgroundWordFreq.containsKey(w) ? 75 | // backgroundWordFreq.get(w) + 1 : 1; 76 | int negativeDocCount = backgroundDocFreq.containsKey(w) ? 
77 | backgroundDocFreq.get(w) + 1 : 1; 78 | Term term = new Term(w, 79 | foregroundDocFreq.get(w), 80 | 0, 81 | negativeDocCount, 82 | 0 83 | ); 84 | term.setScore((double) term.getPositiveDocFreq() * 85 | Math.pow(Math.log(term.getPositiveDocFreq()), pow) 86 | / 87 | term.getNegativeDocFreq()); 88 | terms.add(term); 89 | } 90 | Collections.sort(terms); 91 | Collections.reverse(terms); 92 | } 93 | 94 | /** 95 | * Write a ranked list of terms (top-ranked term first) to file 96 | * outputFileName. 97 | */ 98 | 99 | public void writeRankedList(String outputFileName) throws IOException { 100 | PrintWriter pw = new PrintWriter(new FileWriter(outputFileName)); 101 | for (Term term : terms) { 102 | pw.println(term); 103 | } 104 | pw.close(); 105 | } 106 | 107 | /** 108 | * Rank terms using term count files foregroundCountFile and 109 | * backgroundCountFile, writing result to outputFile 110 | * and returning a ranked list. 111 | */ 112 | 113 | public static List rankTerms(String foregroundCountFile, 114 | String backgroundCountFile, 115 | String outputFile) throws IOException { 116 | TermRanker ranker = new TermRanker(foregroundCountFile, backgroundCountFile); 117 | ranker.writeRankedList(outputFile); 118 | return ranker.terms; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/uicomps/Ice.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.uicomps;// -*- tab-width: 4 -*- 2 | //Title: JET-ICE 3 | //Version: 1.72 4 | //Copyright: Copyright (c) 2014 5 | //Author: Ralph Grishman 6 | //Description: A Java-based Information Extraction Tool -- Customization Environment 7 | 8 | import java.awt.*; 9 | import java.util.*; 10 | import javax.swing.*; 11 | import javax.swing.border.*; 12 | import java.awt.event.*; 13 | 14 | import edu.nyu.jet.ice.models.Corpus; 15 | import edu.nyu.jet.ice.models.IceEntitySet; 16 | import 
edu.nyu.jet.ice.models.IceRelation; 17 | import edu.nyu.jet.ice.models.DepPathMap; 18 | import edu.nyu.jet.ice.events.IceEvent; 19 | import edu.nyu.jet.ice.events.DepTreeMap; 20 | import edu.nyu.jet.concepts.ConceptHierarchy; 21 | 22 | import edu.nyu.jet.Logger; 23 | import edu.nyu.jet.LoggerFactory; 24 | 25 | /** 26 | * Top-level objects for ICE 27 | */ 28 | 29 | public class Ice { 30 | 31 | static final Logger logger = LoggerFactory.getLogger(Ice.class); 32 | 33 | public static SortedMap corpora = new TreeMap (); 34 | public static SortedMap entitySets = new TreeMap(); 35 | public static SortedMap relations = new TreeMap(); 36 | public static SortedMap events = new TreeMap(); 37 | public static Corpus selectedCorpus = null; 38 | public static String selectedCorpusName = null; 39 | 40 | public static Properties iceProperties = new Properties(); 41 | 42 | public static JFrame mainFrame; 43 | 44 | public static void selectCorpus (String corpus) { 45 | selectedCorpusName = corpus; 46 | selectedCorpus = corpora.get(selectedCorpusName); 47 | DepPathMap depPathMap = DepPathMap.getInstance(); 48 | depPathMap.loadPaths(false); 49 | DepTreeMap depTreeMap = DepTreeMap.getInstance(); 50 | depTreeMap.loadTrees(false); 51 | } 52 | 53 | public static ConceptHierarchy ontology = null;; 54 | 55 | public static void addEntitySet (IceEntitySet entitySet) { 56 | entitySets.put(entitySet.getType(), entitySet); 57 | } 58 | 59 | public static IceEntitySet getEntitySet (String type) { 60 | return entitySets.get(type); 61 | } 62 | 63 | public static void removeEntitySet (String type) { 64 | entitySets.remove(type); 65 | } 66 | 67 | public static void addRelation (IceRelation relation) { 68 | relations.put(relation.getName(), relation); 69 | } 70 | 71 | public static IceRelation getRelation (String type) { 72 | return relations.get(type); 73 | } 74 | 75 | public static void removeRelation (String type) { 76 | if (relations.get(type) == null) 77 | logger.warn("Relation to be deleted does 
not exist."); 78 | else relations.remove(type); 79 | } 80 | 81 | public static void addEvent (IceEvent event) { 82 | events.put(event.getName(), event); 83 | } 84 | 85 | public static IceEvent getEvent (String type) { 86 | return events.get(type); 87 | } 88 | 89 | public static void removeEvent (String type) { 90 | events.remove(type); 91 | } 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/uicomps/IceCellRenderer.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.uicomps; 2 | 3 | import edu.nyu.jet.ice.events.IceTree; 4 | import edu.nyu.jet.ice.models.IcePath; 5 | 6 | import javax.swing.*; 7 | import java.awt.*; 8 | 9 | /** 10 | * This cell renderer is intended to support displays of Lists 11 | * where the list elements are IcePaths or IceTrees. What is 12 | * displayed is the "repr" (English phrase) in either case and 13 | * the choice made by the user regarding that phrase. 
*/

public class IceCellRenderer extends JLabel implements ListCellRenderer {

    /** If true, the user's YES / NO choice is appended to the phrase displayed. */
    boolean showYesNo;

    /**
     *  Creates a renderer.
     *
     *  @param showYesNo  if true, append " / YES" or " / NO" to items
     *                    for which the user has made that choice
     */

    public IceCellRenderer (boolean showYesNo) {
        // a JLabel is not opaque by default, so the background
        // colour set below would otherwise not be painted
        setOpaque(true);
        this.showYesNo = showYesNo;
    }

    /**
     *  Configures this label to display <CODE>value</CODE>, which is
     *  expected to be an IceTree or an IcePath, and returns it.
     *  <p>
     *  The same label is returned for every cell, so the text is set on
     *  every call:  a null value is shown as an empty cell and a value of
     *  any other type is shown using its string form.  (Previously such
     *  cells kept the text of whichever cell was rendered before them.)
     */

    public Component getListCellRendererComponent(JList list,
                                                  Object value,
                                                  int index,
                                                  boolean isSelected,
                                                  boolean cellHasFocus) {

        String text = "";
        if (value == null) {
            System.out.println ("CellRenderer got null");
        } else if (value instanceof IceTree) {
            IceTree t = (IceTree) value;
            String repr = t.getRepr();
            if (repr == null) System.out.println ("CellRenderer getting trees with null repr");
            IceTree.IceTreeChoice choice = t.getChoice();
            text = withChoice(repr,
                              choice == IceTree.IceTreeChoice.YES,
                              choice == IceTree.IceTreeChoice.NO);
        } else if (value instanceof IcePath) {
            IcePath t = (IcePath) value;
            String repr = t.getRepr();
            if (repr == null) System.out.println ("CellRenderer getting paths with null repr");
            IcePath.IcePathChoice choice = t.getChoice();
            text = withChoice(repr,
                              choice == IcePath.IcePathChoice.YES,
                              choice == IcePath.IcePathChoice.NO);
        } else {
            System.out.println ("Cell renderer got " + value);
            text = String.valueOf(value);
        }
        setText(text);

        // highlight the selected cell
        if (isSelected) {
            setBackground(Color.BLUE);
            setForeground(Color.WHITE);
        } else {
            setBackground(Color.WHITE);
            setForeground(Color.BLACK);
        }

        // this.setToolTipText("hi" + t.getExample());
        return this;
    }

    /**
     *  Returns the text to display for an item with phrase
     *  <CODE>repr</CODE>, followed by the user's choice if choices
     *  are being shown.  A null phrase is treated as empty, so that
     *  the literal text "null" is never displayed.
     */

    private String withChoice (String repr, boolean yes, boolean no) {
        String text = (repr == null) ? "" : repr;
        if (showYesNo && yes)
            text += " / YES";
        else if (showYesNo && no)
            text += " / NO";
        return text;
    }
}

-------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/uicomps/ListFilter.java: 
/**
 *  Base class for GUI-controlled filters which decide, item by item,
 *  whether a list entry is displayed or skipped.
 */

public abstract class ListFilter {

    /**
     *  Builds the Swing Box holding the controls of this filter.
     */
    public abstract Box makeBox();

    /**
     *  Applies the filter to one list item.
     *
     *  @param item  the item to be tested
     *  @return the filter's decision for <CODE>item</CODE>
     */
    public abstract boolean filter(String item);

}
13 | */ 14 | public class RelationBuilderThread extends Thread { 15 | 16 | String[] args; 17 | // RelationBuilder builder; 18 | Bootstrap bootstrap; 19 | String arg1; 20 | String arg2; 21 | RelationBuilderFrame frame; 22 | SwingRelationsPanel swingRelationsPanel; 23 | 24 | public RelationBuilderThread( 25 | String seed, 26 | String relationInstanceFileName, 27 | String pathListFileName, 28 | // RelationBuilder builder, 29 | Bootstrap bootstrap, 30 | RelationBuilderFrame frame, 31 | SwingRelationsPanel swingRelationsPanel) { 32 | args = new String[3]; 33 | args[0] = seed; 34 | String[] parts = seed.trim().toLowerCase().split(" "); 35 | if (parts.length > 1) { 36 | arg1 = parts[0].toUpperCase(); 37 | arg2 = parts[parts.length - 1].toUpperCase(); 38 | } 39 | args[1] = relationInstanceFileName; 40 | args[2] = pathListFileName; 41 | // this.builder = builder; 42 | this.bootstrap = bootstrap; 43 | this.frame = frame; 44 | this.swingRelationsPanel = swingRelationsPanel; 45 | } 46 | 47 | public void run() { 48 | try { 49 | bootstrap.initialize(args[0], args[1]); 50 | frame.updateList(); 51 | frame.setLocationRelativeTo(null); 52 | frame.setVisible(true); 53 | frame.listPane.revalidate(); 54 | frame.listPane.repaint(); 55 | frame.rankedList.revalidate(); 56 | frame.rankedList.repaint(); 57 | } catch (Exception e) { 58 | System.err.println("Exception in Jet.RelationAL.Bootstrap: "); 59 | e.printStackTrace(); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/uicomps/RelationFilter.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.uicomps;// -*- tab-width: 4 -*- 2 | //Title: JET-ICE 3 | //Version: 1.72 4 | //Copyright: Copyright (c) 2014 5 | //Author: Ralph Grishman 6 | //Description: A Java-based Information Extraction Tool -- Customization Environment 7 | 8 | import edu.nyu.jet.ice.models.Corpus; 9 | 10 | import 
javax.swing.*; 11 | import java.awt.event.*; 12 | 13 | /** 14 | * a filter for selecting only sentential patterns (those with 15 | * an nsubj and dobj) or passing all patterns 16 | */ 17 | 18 | public class RelationFilter extends ListFilter { 19 | 20 | public boolean onlySententialPatterns; 21 | public JCheckBox sententialPatternCheckBox; 22 | JTextArea area = null; 23 | 24 | 25 | /** 26 | * return true if 'term' is a selected part of speech 27 | */ 28 | public boolean filter (String term) { 29 | if (onlySententialPatterns) 30 | return term.matches(".*nsubj-1:.*:dobj.*"); 31 | else 32 | return true; 33 | } 34 | 35 | public void setArea(JTextArea area) { 36 | this.area = area; 37 | } 38 | 39 | /** 40 | * draw a Box with the check box for selecting sentential patterns 41 | */ 42 | 43 | public Box makeBox () { 44 | Box box = Box.createHorizontalBox(); 45 | sententialPatternCheckBox = new JCheckBox("show only sentential patterns"); 46 | box.add(sententialPatternCheckBox); 47 | 48 | // listener ----- 49 | 50 | sententialPatternCheckBox.addActionListener(new ActionListener() { 51 | public void actionPerformed(ActionEvent ev) { 52 | onlySententialPatterns = sententialPatternCheckBox.isSelected(); 53 | try { 54 | Corpus.displayTerms(Ice.selectedCorpus.relationTypesFileName, 55 | 40, 56 | area, 57 | Corpus.relationFilter); 58 | } 59 | catch (Exception e) { 60 | e.printStackTrace(); 61 | } 62 | } 63 | }); 64 | 65 | return box; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/uicomps/TermFilter.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.uicomps;// -*- tab-width: 4 -*- 2 | //Title: JET-ICE 3 | //Version: 1.72 4 | //Copyright: Copyright (c) 2014 5 | //Author: Ralph Grishman 6 | //Description: A Java-based Information Extraction Tool -- Customization Environment 7 | 8 | import javax.swing.*; 9 | import java.awt.event.*; 10 | 11 
| /** 12 | * part-of-speech filter for high-frequency words; 13 | * distinguishes nouns, names, verbs, and other 14 | */ 15 | 16 | public class TermFilter extends ListFilter { 17 | 18 | boolean showNouns; 19 | boolean showNames; 20 | boolean showVerbs; 21 | boolean showOther; 22 | 23 | /** 24 | * return true if 'term' is a selected part of speech 25 | */ 26 | 27 | public boolean filter (String term) { 28 | return (showNouns && termIsType(term, "nn")) || 29 | (showNames && termIsType(term, "nnp")) || 30 | (showVerbs && termIsType(term, "vb")) || 31 | (showOther && termIsType(term, "o")); 32 | } 33 | 34 | private boolean termIsType(String term, String type) { 35 | String[] parts = term.split("/"); 36 | if (parts.length < 2) return false; 37 | return parts[1].equals(type); 38 | } 39 | 40 | /** 41 | * draw a Box including check boxes for the different parts of speech 42 | */ 43 | 44 | public Box makeBox () { 45 | Box box = Box.createHorizontalBox(); 46 | box.add(new JLabel("show")); 47 | JCheckBox nounButton = new JCheckBox("nouns"); 48 | nounButton.setSelected(showNouns); 49 | box.add(nounButton); 50 | JCheckBox nameButton = new JCheckBox("names"); 51 | nameButton.setSelected(showNames); 52 | box.add(nameButton); 53 | JCheckBox verbButton = new JCheckBox("verbs"); 54 | verbButton.setSelected(showVerbs); 55 | box.add(verbButton); 56 | JCheckBox otherButton = new JCheckBox("other"); 57 | otherButton.setSelected(showOther); 58 | box.add(otherButton); 59 | 60 | // -------- listeners 61 | nounButton.addItemListener (new ItemListener() { 62 | public void itemStateChanged (ItemEvent ev) { 63 | showNouns = ev.getStateChange() == ItemEvent.SELECTED; 64 | } 65 | }); 66 | nameButton.addItemListener (new ItemListener() { 67 | public void itemStateChanged (ItemEvent ev) { 68 | showNames = ev.getStateChange() == ItemEvent.SELECTED; 69 | } 70 | }); 71 | verbButton.addItemListener (new ItemListener() { 72 | public void itemStateChanged (ItemEvent ev) { 73 | showVerbs = 
ev.getStateChange() == ItemEvent.SELECTED; 74 | } 75 | }); 76 | otherButton.addItemListener (new ItemListener() { 77 | public void itemStateChanged (ItemEvent ev) { 78 | showOther = ev.getStateChange() == ItemEvent.SELECTED; 79 | } 80 | }); 81 | 82 | return box; 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/utils/AnnotationStartComparator.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.utils; 2 | 3 | import edu.nyu.jet.tipster.Annotation; 4 | 5 | import java.util.Comparator; 6 | 7 | /** 8 | * Compares 2 Jet annotations by their start offsets 9 | * 10 | * @author yhe 11 | * @version 1.0 12 | */ 13 | public class AnnotationStartComparator implements Comparator { 14 | public int compare(Annotation annotation, Annotation annotation2) { 15 | return annotation.start() - annotation2.start(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/utils/FileNameSchema.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.utils; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | //import java.nio.file.Files; 6 | //import java.nio.file.Paths; 7 | 8 | /** 9 | * Created with IntelliJ IDEA. 10 | * User: joelsieh 11 | * Date: 9/9/14 12 | * Time: 10:26 AM 13 | * To change this template use File | Settings | File Templates. 
/**
 * Central definition of the names of the files and directories which ICE
 * keeps for each corpus.  Everything lives below the cache root directory,
 * which is created when this class is loaded if it does not exist yet:
 * <pre>
 *     cache/&lt;corpusName&gt;/&lt;file&gt;
 * </pre>
 */
public class FileNameSchema {
    private static String CACHE_ROOT = "cache";

    static {
        File root = new File(CACHE_ROOT);
        boolean missing = !root.exists();
        if (missing) {
            try {
                root.mkdirs();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** name of file <CODE>leaf</CODE> within the directory of corpus <CODE>corpusName</CODE> */
    private static String corpusFile(String corpusName, String leaf) {
        return getCorpusInfoDirectory(corpusName) + File.separator + leaf;
    }

    /** the directory below which all corpus directories are kept */
    public static String getCacheRoot() {
        return CACHE_ROOT;
    }

    /** the directory holding all files of corpus <CODE>corpusName</CODE> */
    public static String getCorpusInfoDirectory(String corpusName) {
        return CACHE_ROOT + File.separatorChar + corpusName;
    }

    public static String getPreprocessCacheDir(String corpusName) {
        return corpusFile(corpusName, "preprocess");
    }

    public static String getPreprocessCacheMapFileName(String corpusName) {
        return corpusFile(corpusName, "preprocessCacheMap");
    }

    public static String getWordCountFileName(String corpusName) {
        return corpusFile(corpusName, "counts");
    }

    public static String getDocListFileName(String corpusName) {
        return corpusFile(corpusName, "docList");
    }

    public static String getTermsFileName(String corpusName) {
        return corpusFile(corpusName, "terms");
    }

    public static String getRelationsFileName(String corpusName) {
        return corpusFile(corpusName, "Relations");
    }

    public static String getRelationTypesFileName(String corpusName) {
        return corpusFile(corpusName, "RelationTypes");
    }

    public static String getRelationReprFileName(String corpusName) {
        return corpusFile(corpusName, "RelationRepr");
    }

    public static String getEventsFileName(String corpusName) {
        return corpusFile(corpusName, "Events");
    }

    public static String getEventTypesFileName(String corpusName) {
        return corpusFile(corpusName, "EventTypes");
    }

    public static String getEventReprFileName(String corpusName) {
        return corpusFile(corpusName, "EventRepr");
    }

    public static String getDependencyEventFileName(String corpusName) {
        return corpusFile(corpusName, "DepEvents");
    }

    /** one index file per entity type <CODE>inType</CODE> */
    public static String getEntitySetIndexFileName(String corpusName, String inType) {
        return corpusFile(corpusName, "EntitySetIndex_" + inType);
    }

    /** one ratio file per background corpus <CODE>bgCorpusName</CODE> */
    public static String getPatternRatioFileName(String corpusName, String bgCorpusName) {
        return corpusFile(corpusName, bgCorpusName + "-Pattern-Ratio");
    }

    public static String getSortedPatternRatioFileName(String corpusName, String bgCorpusName) {
        return getPatternRatioFileName(corpusName, bgCorpusName) + ".sorted";
    }

    public static String getDepPathsLogFileName(String corpusName) {
        return corpusFile(corpusName, "DepPathsLog");
    }

    public static String getDepPathsPriorLogFileName(String corpusName) {
        return corpusFile(corpusName, "DepPathsPriorLog");
    }
}
class ProcessFarm { 14 | ArrayList tasks = new ArrayList(); 15 | ArrayList processes = new ArrayList(); 16 | 17 | /** 18 | * Reset the tasks to be spawned 19 | */ 20 | synchronized public void reset() { 21 | tasks = new ArrayList(); 22 | processes = new ArrayList(); 23 | } 24 | 25 | /** 26 | * Submit the current list for execution 27 | */ 28 | synchronized public void submit() { 29 | try { 30 | for (String line : tasks) { 31 | System.err.println("Submit: " + line); 32 | CommandLine cmdLine = CommandLine.parse(line); 33 | 34 | DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler(); 35 | 36 | ExecuteWatchdog watchdog = new ExecuteWatchdog(ExecuteWatchdog.INFINITE_TIMEOUT); 37 | Executor executor = new DefaultExecutor(); 38 | executor.setExitValue(0); 39 | executor.setWatchdog(watchdog); 40 | executor.execute(cmdLine, resultHandler); 41 | processes.add(resultHandler); 42 | } 43 | } 44 | catch (Exception e) { 45 | e.printStackTrace(); 46 | } 47 | } 48 | 49 | /** 50 | * Hold the host thread until all spawned processes complete execution 51 | * 52 | * @return true if all tasks completed successfully, false otherwise 53 | */ 54 | synchronized public boolean waitFor() { 55 | boolean success = true; 56 | int i = 0; 57 | for (DefaultExecuteResultHandler p : processes) { 58 | try { 59 | p.waitFor(); 60 | int returnVal = p.getExitValue(); 61 | if (returnVal != 0) { 62 | System.err.println(tasks.get(i) + String.format(" (return code %d)", returnVal)); 63 | success = false; 64 | } 65 | } 66 | // catch (InterruptedException e) { 67 | catch (Exception e) { 68 | System.err.println(tasks.get(i) + " encountered interrupted exception:"); 69 | e.printStackTrace(); 70 | success = false; 71 | } 72 | i++; 73 | } 74 | return success; 75 | } 76 | 77 | /** 78 | * Add a shell command to list waiting to be executed. 
Use submit() to execute 79 | * all commands in the list 80 | * 81 | * @param s A shell command string to be executed 82 | */ 83 | synchronized public void addTask(String s) { 84 | tasks.add(s); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/utils/ProgressMonitorI.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.utils; 2 | 3 | /** 4 | * Created with IntelliJ IDEA. 5 | * User: joelsieh 6 | * Date: 7/9/14 7 | * Time: 4:48 PM 8 | * To change this template use File | Settings | File Templates. 9 | */ 10 | public interface ProgressMonitorI { 11 | 12 | boolean isCanceled(); 13 | 14 | void setProgress(int docCount); 15 | 16 | void setMaximum(int maximum); 17 | 18 | int getMaximum(); 19 | 20 | void setNote(String s); 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/utils/Ratio.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.utils; 2 | 3 | import java.util.*; 4 | import java.io.*; 5 | 6 | /** 7 | * given two document profiles, in the form of word frequencies, 8 | * dependency triple frequencies, etc., computes 9 | * f log f / f' 10 | * where f is the frequency in corpus 1, and f' the frequency 11 | * in corpus2 12 | */ 13 | 14 | public class Ratio { 15 | 16 | static Map count1 = new TreeMap(); 17 | static Map count2 = new TreeMap(); 18 | 19 | public static void main (String[] args) throws IOException { 20 | String countFile1 = args[0]; 21 | String countFile2 = args[1]; 22 | String ratioFile = args[2]; 23 | 24 | readCounts (countFile1, count1); 25 | readCounts (countFile2, count2); 26 | computeRatios(new PrintWriter (new FileWriter (ratioFile))); 27 | } 28 | 29 | public static void readCounts (String file, Map counts) throws IOException { 30 | counts.clear(); 31 | BufferedReader reader = 
new BufferedReader (new FileReader (file)); 32 | String line; 33 | while ((line = reader.readLine()) != null) { 34 | String[] field = line.trim().split("\t"); 35 | if (field.length == 2) 36 | counts.put(field[1], Integer.valueOf(field[0])); 37 | } 38 | } 39 | 40 | public static void computeRatios (PrintWriter writer) throws IOException { 41 | for (String w : count1.keySet()) { 42 | Integer f1 = count1.get(w); 43 | Integer f2 = count2.get(w); 44 | f1++; 45 | f2 = (f2 == null) ? 1 : f2 + 1; 46 | float ratio = (float) f1 / f2 * (float) Math.log((float) f1); 47 | writer.printf ("%8.1f\t%s\n", ratio, w); 48 | } 49 | writer.close(); 50 | } 51 | } 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/utils/SwingProgressMonitor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.utils; 2 | 3 | import javax.swing.*; 4 | import java.awt.*; 5 | 6 | /** 7 | * Created with IntelliJ IDEA. 8 | * User: joelsieh 9 | * Date: 7/9/14 10 | * Time: 4:49 PM 11 | * To change this template use File | Settings | File Templates. 12 | */ 13 | public class SwingProgressMonitor extends ProgressMonitor implements ProgressMonitorI { 14 | 15 | public SwingProgressMonitor(Component parentComponent, 16 | Object message, 17 | String note, 18 | int min, 19 | int max) { 20 | super(parentComponent, message, note, min, max); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/edu/nyu/jet/ice/views/Refreshable.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.views; 2 | 3 | /** 4 | * Created by yhe on 10/19/14. 
/**
 * A view which can be asked to refresh itself.
 */
public interface Refreshable {

    /**
     * Asks the view to refresh itself.
     */
    void refresh();
}
34 | * 35 | * Currently, the entity index functionality comes from EntitySetBuilder.makeSwingBox() 36 | * 37 | * @author yhe 38 | */ 39 | 40 | public class SwingEntitiesPanel extends JPanel implements Refreshable { 41 | public final SwingIceStatusPanel statusPanel = new SwingIceStatusPanel(); 42 | public final JTextArea textArea = new JTextArea(11, 35); 43 | 44 | /** 45 | * create entities panel and display top-ranked entities in response 46 | * to "Find Entities" button. 47 | */ 48 | 49 | public SwingEntitiesPanel() { 50 | super(); 51 | this.setLayout(new MigLayout()); 52 | this.setOpaque(true); 53 | this.removeAll(); 54 | JPanel termBox = new JPanel(new MigLayout()); 55 | TitledBorder border = new TitledBorder("Entities"); 56 | termBox.setBorder(border); 57 | termBox.setOpaque(true); 58 | termBox.setMinimumSize(new Dimension(480, 270)); 59 | JScrollPane scrollPane = new JScrollPane(textArea); 60 | // if (termFileName != null) 61 | // displayTerms(termFileName, 100, textArea, termFilter); 62 | termBox.add(scrollPane, "wrap"); 63 | textArea.setEditable(false); 64 | 65 | JButton findEntitiesButton = new JButton("Find Entities"); 66 | findEntitiesButton.addActionListener(new ActionListener() { 67 | public void actionPerformed(ActionEvent e) { 68 | findTerms(); 69 | Ice.selectedCorpus.termFileName = FileNameSchema.getTermsFileName(Ice.selectedCorpusName); 70 | java.util.List terms = 71 | getTerms(FileNameSchema.getTermsFileName(Ice.selectedCorpusName), 100); 72 | StringBuilder areaTextBuilder = new StringBuilder(); 73 | for (String t : terms) { 74 | areaTextBuilder.append(t).append("\n"); 75 | } 76 | textArea.setText(areaTextBuilder.toString()); 77 | } 78 | }); 79 | 80 | termBox.add(findEntitiesButton); 81 | 82 | EntityIndexerBox eib = new EntityIndexerBox(); 83 | Box indexBox = eib.makeSwingBox(); 84 | this.add(termBox, "cell 0 0"); 85 | this.add(statusPanel, "cell 1 0 1 2"); 86 | this.add(indexBox, "cell 0 1"); 87 | refresh(); 88 | } 89 | 90 | /** 91 | * returns a 
list of (at most limit) terms from 92 | * file termFile. 93 | */ 94 | 95 | public static java.util.List getTerms(String termFile, int limit) { 96 | java.util.List topTerms = new ArrayList(); 97 | try { 98 | BufferedReader reader = new BufferedReader(new FileReader(termFile)); 99 | int k = 0; 100 | while (true) { 101 | String term = reader.readLine(); 102 | if (term == null) break; 103 | if (term.length() < 4 || !termIsType(term, "nn")) continue; 104 | term = term.substring(0, term.length() - 3); 105 | topTerms.add(term); 106 | k++; 107 | if (k >= limit) break; 108 | } 109 | } catch (IOException e) { 110 | e.printStackTrace(); 111 | } 112 | return topTerms; 113 | } 114 | 115 | public void refresh() { 116 | statusPanel.refresh(); 117 | } 118 | 119 | /** 120 | * invokes TermRanker to rank terms by relative frequency, 121 | * writing ranked list to file. 122 | */ 123 | 124 | public void findTerms() { 125 | String termFileName = FileNameSchema.getTermsFileName(Ice.selectedCorpusName); 126 | try { 127 | File f = new File(FileNameSchema.getWordCountFileName(Ice.selectedCorpusName)); 128 | if (!f.exists() || !f.isFile()) { 129 | if (SwingPathsPanel.preprocessedTextsAvailable(Ice.selectedCorpusName)) { 130 | IcePreprocessor.countWords(false); 131 | } else { 132 | JOptionPane.showMessageDialog(Ice.mainFrame, "Source text not available, cannot rebuild term set"); 133 | return; 134 | } 135 | } 136 | TermRanker.rankTerms(FileNameSchema.getWordCountFileName(Ice.selectedCorpusName), 137 | Ice.corpora.get(Ice.selectedCorpus.backgroundCorpus).wordCountFileName, 138 | termFileName); 139 | } 140 | catch (IOException e) { 141 | e.printStackTrace(System.err); 142 | return; 143 | } 144 | } 145 | 146 | private static boolean termIsType(String term, String type) { 147 | String[] parts = term.split("/"); 148 | if (parts.length < 2) return false; 149 | return parts[1].equals(type); 150 | } 151 | } 152 | -------------------------------------------------------------------------------- 
#!/usr/bin/env python
"""Print every gold term together with its total count.

usage: weight_gold.py count_file gold_file

The first three lines of count_file are skipped.  Every other line holds
tab-separated fields: an item of the form term/tag followed by integer
counts, which are summed.  For each line of gold_file the script prints
the term, a tab, and the total count recorded for that term; a gold term
which does not occur in count_file raises KeyError.
"""
import sys

try:
    count_file = sys.argv[1]
    gold_file = sys.argv[2]
except IndexError:
    # write the usage message to stderr (and nothing to stdout)
    sys.stderr.write("weight_gold.py count_file gold_file\n")
    sys.exit(-1)

d = {}
with open(count_file) as counts:
    for line_number, l in enumerate(counts):
        # the first three lines do not hold counts
        if line_number < 3:
            continue
        # sum() works on both Python 2 and Python 3 (reduce is not a builtin on 3)
        count = sum(int(x) for x in l.strip().split('\t')[1:])
        key = l.split('\t')[0].split('/')[0]
        d[key] = count

with open(gold_file) as gold:
    for l in gold:
        term = l.strip()
        print("%s\t%d" % (term, d[term]))
!java.util.TreeMap {} 2 | --- !java.util.TreeMap {} 3 | --- !java.util.TreeMap {} -------------------------------------------------------------------------------- /src/props/iceprops: -------------------------------------------------------------------------------- 1 | Ice.TermRanker.alpha = 0 2 | # Ice.DepEmbeddings.fileName = deps.words 3 | Ice.Bootstrapper.diversify = false 4 | Ice.Bootstrapper.debug = false 5 | Ice.IcePreprocessor.parseprops = parseprops -------------------------------------------------------------------------------- /src/props/onomaprops: -------------------------------------------------------------------------------- 1 | # JET properties file for dependency counter 2 | # 3 | Jet.batch = t 4 | Jet.dataPath = data 5 | EnglishLex.fileName1 = Jet4.dict 6 | EnglishLex.fileName2 = titles.dict 7 | Gazetteer.fileName = loc.dict 8 | NameGender.fileName = gender.dict 9 | Tags.fileName = pos_hmm.txt 10 | NameTags.ME.fileName = ../acedata/AceOntoMeneModel 11 | WordClusters.fileName = brownClusters10-2014.txt 12 | Pattern.fileName1 = MEchunkPatterns.txt 13 | Ace.EDTtype.fileName = ../acedata/EDT type dict 05.txt 14 | Ace.NameSubtypeModel.fileName = ../acedata/ACEnameSubtypeModel 05.txt 15 | Pattern.quantifierFileName = QuantifierPatterns.txt 16 | Timex.refTime = 2015-10-01 17 | Timex.rule = time_rules.yaml 18 | ##### ICE USER CONFIG FILES ##### 19 | Onoma.fileName = ../acedata/ice_onoma.dict 20 | Ace.EDTtype.auxFileName = ../acedata/EDTypesFromUser.dict 21 | Ace.RelationModel.fileName = ../acedata/iceRelationModel 22 | ################################# 23 | # 24 | # processDocument = sentenceSplit, sentence:processSentence 25 | processSentence = tokenize, lexLookup, tagNamesFromOnoma, tagTimex 26 | -------------------------------------------------------------------------------- /src/props/parseprops: -------------------------------------------------------------------------------- 1 | # JET properties file for dependency counter 2 | # 3 | Jet.batch = t 4 | 
Jet.dataPath = data 5 | EnglishLex.fileName1 = Jet4.dict 6 | EnglishLex.fileName2 = titles.dict 7 | Gazetteer.fileName = loc.dict 8 | NameGender.fileName = gender.dict 9 | Tags.fileName = pos_hmm.txt 10 | Chunker.fileName = chunkModel.txt 11 | NameTags.ME.fileName = ../acedata/AceOntoMeneModel 12 | WordClusters.fileName = brownClusters10-2014.txt 13 | Pattern.fileName1 = MEchunkPatterns.txt 14 | Ace.EDTtype.fileName = ../acedata/EDT type dict 05.txt 15 | Ace.NameSubtypeModel.fileName = ../acedata/ACEnameSubtypeModel 05.txt 16 | DepParser.model.fileName = parseModel.gz 17 | DepParser.transformations = yes 18 | Ace.generic.fileName = ../acedata/generic dict 05.txt 19 | ##### ICE USER CONFIG FILES ##### 20 | Onoma.fileName = ../acedata/ice_onoma.dict 21 | Ace.EDTtype.auxFileName = ../acedata/EDTypesFromUser.dict 22 | Ace.RelationModel.fileName = ../acedata/iceRelationModel 23 | ################################# 24 | # 25 | processDocument = sentenceSplit, sentence:processSentence 26 | processSentence = tokenize, lexLookup, pruneTags, tagPOS, \ 27 | tagNames, chunk, \ 28 | pat(names), pat(othernames), ng:processNG, pat(fusePossessive), \ 29 | pat(vgroups), pat(particles), pat(np), pat(np), pat(conj), \ 30 | pat(vp), pat(rnv), pat(s), depParse, resolve 31 | processNG = pat(ng-chunks) 32 | -------------------------------------------------------------------------------- /src/props/props: -------------------------------------------------------------------------------- 1 | # JET properties file to run ACE with ICE-generated entity classes and relation patterns 2 | Jet.batch = t 3 | Jet.dataPath = data 4 | EnglishLex.fileName1 = Jet4.dict 5 | EnglishLex.fileName2 = titles.dict 6 | Gazetteer.fileName = loc.dict 7 | NameGender.fileName = gender.dict 8 | DepParser.model.fileName = parseModel.gz 9 | DepParser.transformations = t 10 | Time.fileName = time_rules.yaml 11 | Ace.EDTtype.fileName = ../acedata/EDT type dict 05.txt 12 | Ace.generic.fileName = ../acedata/generic dict 
05.txt 13 | Ace.NameSubtypeModel.fileName = ../acedata/ACEnameSubtypeModel 05.txt 14 | Ace.Value.fileName = ../acedata/values.dict 15 | Tags.fileName = pos_hmm.txt 16 | Pattern.fileName1 = MEchunkPatterns.txt 17 | Pattern.fileName2 = NPpatterns.txt 18 | Chunker.fileName = chunkModel.txt 19 | NameTags.ME.fileName = ../acedata/AceOntoMeneModel 20 | WordClusters.fileName = brownClusters10-2014.txt 21 | ##### ICE GENERATED FILES ##### 22 | Onoma.fileName = ../acedata/ice_onoma.dict 23 | Ace.EDTtype.auxFileName = ../acedata/EDTypesFromUser.dict 24 | Ace.RelationDepPaths.fileName = ../acedata/iceRelationModel 25 | ################################# 26 | processDocument = sentenceSplit, sentence:processSentence 27 | processSentence = tokenize, lexLookup, pruneTags, tagNames, tagNamesFromOnoma, chunk, \ 28 | pat(names), pat(othernames), ng:processNG, depParse, resolve 29 | processNG = pat(ng-chunks) 30 | -------------------------------------------------------------------------------- /src/retired/BatchMaeToApf.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.ice.utils.IceUtils; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * A utility class to convert MAE annotated files to APF files. 9 | * 10 | * This is not used in the current ICE GUI / CLI. 
11 | * 12 | * @author yhe 13 | */ 14 | public class BatchMaeToApf { 15 | 16 | public static void main(String[] args) throws IOException { 17 | if (args.length != 3) { 18 | System.err.println("Jet.RelationAL.BatchMaeToApf maeFileList txtFileList apfFileList"); 19 | System.exit(-1); 20 | } 21 | String[] maeFiles = IceUtils.readLines(args[0]); 22 | String[] txtFiles = IceUtils.readLines(args[1]); 23 | String[] apfFiles = IceUtils.readLines(args[2]); 24 | if (maeFiles.length != apfFiles.length || 25 | maeFiles.length != txtFiles.length) { 26 | System.err.println("Mae, txt, and apf file list should have the same length."); 27 | } 28 | for (int i = 0; i < maeFiles.length; i++) { 29 | System.err.println("Mae file:" + maeFiles[i]); 30 | MaeToApf.main(new String[]{maeFiles[i], txtFiles[i], apfFiles[i]}); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/retired/DepPathFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.aceJet.EventSyntacticPattern; 7 | import edu.nyu.jet.parser.SyntacticRelationSet; 8 | import edu.nyu.jet.tipster.Annotation; 9 | import edu.nyu.jet.tipster.Document; 10 | import opennlp.model.Event; 11 | 12 | import java.util.List; 13 | 14 | /** 15 | * A Feature extractor using dependency path features for supervised/simulated active learning relation extraction. 16 | * 17 | * This class is not used by ICE GUI/CLI. 
18 | */ 19 | public class DepPathFeatureExtractor implements RelationFeatureExtractor { 20 | public Event extractFeatures(AceEntityMention m1, 21 | AceEntityMention m2, 22 | AceRelationMention r, 23 | Annotation sentence, 24 | SyntacticRelationSet paths, 25 | List mentions, 26 | AceDocument aceDoc, 27 | Document doc) { 28 | String label = r == null ? "NONE" : r.relation.type; 29 | int h1 = m1.getJetHead().start(); 30 | int h2 = m2.getJetHead().start(); 31 | if (h1 >= h2) { 32 | int tmp = h1; 33 | h1 = h2; 34 | h2 = tmp; 35 | } 36 | String path = EventSyntacticPattern.buildSyntacticPath(h1, h2, paths); 37 | path = path == null ? "EMPTY" : path.replaceAll("\\s+", "_"); 38 | String type1 = m1.entity.type; 39 | String type2 = m2.entity.type; 40 | String concatTypes = type1 + ":::" + type2; 41 | String concatAll = type1 + ":::" + path + ":::" + type2; 42 | return new Event(label, new String[]{path}); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/retired/DepPathSameConstitsFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.aceJet.EventSyntacticPattern; 7 | import edu.nyu.jet.parser.SyntacticRelationSet; 8 | import edu.nyu.jet.tipster.Annotation; 9 | import edu.nyu.jet.tipster.Document; 10 | import opennlp.model.Event; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * A Feature extractor using dependency path and a flag for whether the two arguments belong to a same syntactic 17 | * constituent as features for supervised/simulated active learning relation extraction. 18 | * 19 | * This class is not used by ICE GUI/CLI. 
20 | * 21 | * @author yhe 22 | */ 23 | public class DepPathSameConstitsFeatureExtractor implements RelationFeatureExtractor { 24 | public Event extractFeatures(AceEntityMention m1, 25 | AceEntityMention m2, 26 | AceRelationMention r, 27 | Annotation sentence, 28 | SyntacticRelationSet paths, 29 | List mentions, 30 | AceDocument aceDoc, 31 | Document doc) { 32 | String label = r == null ? "NONE" : r.relation.type; 33 | int h1 = m1.getJetHead().start(); 34 | int h2 = m2.getJetHead().start(); 35 | int h1Start = m1.getJetHead().start(); 36 | int h2End = m2.getJetHead().end(); 37 | if (h1 >= h2) { 38 | int tmp = h1; 39 | h1 = h2; 40 | h2 = tmp; 41 | h1Start = m2.getJetHead().start(); 42 | h2End = m1.getJetHead().end(); 43 | } 44 | String path = EventSyntacticPattern.buildSyntacticPath(h1, h2, paths); 45 | path = path == null ? "EMPTY" : path.replaceAll("\\s+", "_"); 46 | String type1 = m1.entity.type; 47 | String type2 = m2.entity.type; 48 | String concatTypes = type1 + ":::" + type2; 49 | String concatAll = type1 + ":::" + path + ":::" + type2; 50 | List feats = SameConstitFeatureExtractor.extractSameConstits(h1Start, h2End, doc); 51 | int pathLength = path.split(":").length; 52 | //System.err.println("Path length = " + pathLength); 53 | List updatedFeats = new ArrayList(); 54 | for (String feat : feats) { 55 | updatedFeats.add("CONJ_SAME_LENGTH=" + feat + ":::" + pathLength); 56 | } 57 | updatedFeats.add("PATH_LENGTH=" + pathLength); 58 | updatedFeats.add("PATH_WITH_TYPE=" + concatAll); 59 | return new Event(label, updatedFeats.toArray(new String[updatedFeats.size()])); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/retired/DepPathTypeFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import 
edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.aceJet.EventSyntacticPattern; 7 | import edu.nyu.jet.parser.SyntacticRelationSet; 8 | import edu.nyu.jet.tipster.Annotation; 9 | import edu.nyu.jet.tipster.Document; 10 | import opennlp.model.Event; 11 | 12 | import java.util.List; 13 | 14 | /** 15 | * A Feature extractor using dependency path and entity types as features for supervised/simulated active learning 16 | * relation extraction. 17 | * 18 | * This class is not used by ICE GUI/CLI. 19 | * 20 | * @author yhe 21 | */ 22 | public class DepPathTypeFeatureExtractor implements RelationFeatureExtractor { 23 | public Event extractFeatures(AceEntityMention m1, 24 | AceEntityMention m2, 25 | AceRelationMention r, 26 | Annotation sentence, 27 | SyntacticRelationSet paths, 28 | List mentions, 29 | AceDocument aceDoc, 30 | Document doc) { 31 | String label = r == null || r.relation.type.toLowerCase().equals("null") ? "NONE" : r.relation.type; 32 | int h1 = m1.getJetHead().start(); 33 | int h2 = m2.getJetHead().start(); 34 | if (h1 >= h2) { 35 | int tmp = h1; 36 | h1 = h2; 37 | h2 = tmp; 38 | } 39 | String path = EventSyntacticPattern.buildSyntacticPath(h1, h2, paths); 40 | path = path == null ? "EMPTY" : path.replaceAll("\\s+", "_"); 41 | String type1 = m1.entity.type; 42 | String type2 = m2.entity.type; 43 | return new Event(label, new String[]{path, type1, type2}); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/retired/EventItem.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * EventItem is a wrapper around the OpenNLP Event class (feature+label for a training instance), to simplify training 7 | * of supervised relation extraction models 8 | * 9 | * Not used in ICE GUI/CLI. 
10 | * 11 | * @author yhe 12 | * @version 1.0 13 | */ 14 | public class EventItem implements Comparable { 15 | public static final String NOT_RELATION_LABEL = "NOT_RELATION"; 16 | public static final String UNKNOWN_LABEL = "UNDECIDED"; 17 | private String sentence; 18 | private String path; 19 | private String type1; 20 | private String type2; 21 | private boolean sameNP; 22 | private List wordsInBetween; 23 | private String outcome; 24 | private String predictedOutcome; 25 | private double score; 26 | 27 | public double getUncertainty() { 28 | return uncertainty; 29 | } 30 | 31 | private double uncertainty; 32 | 33 | public EventItem(String sentence, String path, String type1, String type2, boolean sameNP, List wordsInBetween) { 34 | this.sentence = sentence; 35 | this.path = path; 36 | this.type1 = type1; 37 | this.type2 = type2; 38 | this.sameNP = sameNP; 39 | this.wordsInBetween = wordsInBetween; 40 | this.outcome = UNKNOWN_LABEL; 41 | } 42 | 43 | public String[] context() { 44 | List contextList = new ArrayList(); 45 | contextList.add("PATH=" + path.trim().replaceAll("\\s+", "_")); 46 | contextList.add("sameNP=" + sameNP); 47 | for (String w : wordsInBetween) { 48 | contextList.add("wordInBetween=" + w.trim().replaceAll("\\s+", "_")); 49 | } 50 | return contextList.toArray(new String[contextList.size()]); 51 | } 52 | 53 | public static EventItem fromLine(String line) { 54 | String[] parts = line.trim().split("\\|\\|\\|"); 55 | System.err.println(line); 56 | System.err.println(parts.length); 57 | String sentence = parts[0]; 58 | String path = parts[1]; 59 | String types = parts[2]; 60 | String sameNP = parts[3]; 61 | String wordsInSentence = parts[4]; 62 | String[] wordsInBetweenArr = parts.length == 6 ? 
parts[5].trim().split(" ") : new String[]{}; 63 | String type1 = types.trim().split("\\+\\+\\+")[0]; 64 | String type2 = types.trim().split("\\+\\+\\+")[1]; 65 | boolean sameNPBool = Boolean.valueOf(sameNP); 66 | Set wordsInBetweenSet = new TreeSet(); 67 | for (String wordInBetween : wordsInBetweenArr) { 68 | wordsInBetweenSet.add(wordInBetween); 69 | } 70 | List wordsInBetween = new ArrayList(wordsInBetweenSet); 71 | return new EventItem(sentence.trim(), path.trim(), type1.trim(), type2.trim(), sameNPBool, wordsInBetween); 72 | } 73 | 74 | public boolean sameTypesAs(String type1, String type2) { 75 | return (this.type1.equals(type1) && this.type2.equals(type2)) || 76 | (this.type1.equals(type2) && this.type2.equals(type1)); 77 | } 78 | 79 | public boolean outcomeUNK() { 80 | return outcome.equals(UNKNOWN_LABEL); 81 | } 82 | 83 | public String getSentence() { 84 | return sentence; 85 | } 86 | 87 | public void setSentence(String sentence) { 88 | this.sentence = sentence; 89 | } 90 | 91 | public String getPath() { 92 | return path; 93 | } 94 | 95 | public void setPath(String path) { 96 | this.path = path; 97 | } 98 | 99 | public String getType1() { 100 | return type1; 101 | } 102 | 103 | public void setType1(String type1) { 104 | this.type1 = type1; 105 | } 106 | 107 | public String getType2() { 108 | return type2; 109 | } 110 | 111 | public void setType2(String type2) { 112 | this.type2 = type2; 113 | } 114 | 115 | public boolean isSameNP() { 116 | return sameNP; 117 | } 118 | 119 | public void setSameNP(boolean sameNP) { 120 | this.sameNP = sameNP; 121 | } 122 | 123 | public List getWordsInBetween() { 124 | return wordsInBetween; 125 | } 126 | 127 | public void setWordsInBetween(List wordsInBetween) { 128 | this.wordsInBetween = wordsInBetween; 129 | } 130 | 131 | public String getOutcome() { 132 | return outcome; 133 | } 134 | 135 | public void setOutcome(String outcome) { 136 | this.outcome = outcome; 137 | } 138 | 139 | @Override 140 | public String toString() { 
141 | return sentence.trim().replaceAll("\\s+", " "); 142 | } 143 | 144 | 145 | public int compareTo(EventItem eventItem) { 146 | if (this.uncertainty - eventItem.uncertainty < 0) { 147 | return -1; 148 | } 149 | else if (this.uncertainty - eventItem.uncertainty > 0) { 150 | return 1; 151 | } 152 | else { 153 | return 0; 154 | } 155 | } 156 | 157 | public String getPredictedOutcome() { 158 | return predictedOutcome; 159 | } 160 | 161 | public void setPredictedOutcome(String predictedOutcome) { 162 | this.predictedOutcome = predictedOutcome; 163 | } 164 | 165 | public void setScore(double score) { 166 | this.score = score; 167 | this.uncertainty = Math.abs(0.5 - score); 168 | } 169 | 170 | public double getScore() { 171 | return score; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/retired/RelationFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.parser.SyntacticRelationSet; 7 | import edu.nyu.jet.tipster.Annotation; 8 | import edu.nyu.jet.tipster.Document; 9 | import opennlp.model.Event; 10 | 11 | import java.util.List; 12 | 13 | /** 14 | * Interface for a feature extractor for relation classifiers. 15 | * 16 | * Used for supervised/simulated active learning; not used by ICE GUI/CLI. 
17 | */ 18 | public interface RelationFeatureExtractor { 19 | public Event extractFeatures(AceEntityMention m1, 20 | AceEntityMention m2, 21 | AceRelationMention r, 22 | Annotation sentence, 23 | SyntacticRelationSet paths, 24 | List mentions, 25 | AceDocument aceDoc, 26 | Document doc); 27 | } 28 | -------------------------------------------------------------------------------- /src/retired/RelationOracle.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation;// -*- tab-width: 4 -*- 2 | 3 | import edu.nyu.jet.ice.models.IcePath; 4 | import edu.nyu.jet.ice.models.IcePath.IcePathChoice; 5 | 6 | import java.util.*; 7 | import java.io.*; 8 | 9 | /** 10 | * Provides a mechanism for simulated active learning of relations, avoiding the 11 | * need to label the same dependency paths repeatedly by hand. 12 | *

13 | * If there is a local file relationOracle, use that file to label 14 | * candidate paths. If there is no entry for a particular candidate, ask the 15 | * user to label it and record that label for future use in file 16 | * newRelationOracle. 17 | */ 18 | 19 | public class RelationOracle { 20 | 21 | static String status = "UNKNOWN"; 22 | 23 | // each line has a repr and YES or NO 24 | static Set knownRelations = new HashSet(); 25 | 26 | public static boolean exists () { 27 | if (status.equals("YES")) 28 | return true; 29 | else if (status.equals("NO")) 30 | return false; 31 | else try { 32 | if (new File("relationOracle").exists()) { 33 | BufferedReader reader = new BufferedReader (new FileReader ("relationOracle")); 34 | String line; 35 | while ((line = reader.readLine()) != null) { 36 | knownRelations.add(line); 37 | } 38 | status = "YES"; 39 | return true; 40 | } else { 41 | status = "NO"; 42 | return false; 43 | } 44 | } catch (IOException e) { 45 | System.err.println("IOException in RelationOracle"); 46 | return false; 47 | } 48 | } 49 | 50 | /** 51 | * If a relation oracle table has been loaded, use that table to label the 52 | * candidate paths on foundPatterns. If a candidate path 53 | * is not in the table, ask the user for a label, apply that label 54 | * and record that label for future use. 55 | *

56 | * At the end, write a file newRelationOracle with 57 | * an updated table. 58 | */ 59 | 60 | public static void label (List foundPatterns) { 61 | try { 62 | BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); 63 | for (IcePath fp : foundPatterns) { 64 | String repr = fp.getRepr(); 65 | if (knownRelations.contains(repr + " YES")) { 66 | fp.setChoice(IcePathChoice.YES); 67 | } else if (knownRelations.contains(repr + " NO")) { 68 | fp.setChoice(IcePathChoice.NO); 69 | } else 70 | while (true) { 71 | System.out.print (repr + "?"); 72 | String response = reader.readLine(); 73 | if (response.equals("Y")) { 74 | fp.setChoice(IcePathChoice.YES); 75 | knownRelations.add(repr + " YES"); 76 | break; 77 | } else if (response.equals("N")) { 78 | fp.setChoice(IcePathChoice.NO); 79 | knownRelations.add(repr + " NO"); 80 | break; 81 | } else { 82 | System.out.println("Type Y or N"); 83 | } 84 | } 85 | } 86 | PrintWriter writer = new PrintWriter (new FileWriter ("newRelationOracle")); 87 | for (String repr : knownRelations) { 88 | writer.println(repr); 89 | } 90 | writer.close(); 91 | } catch (IOException e) { 92 | System.err.println("IOException in RelationOracle"); 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/retired/SameConstitFeatureExtractor.java: -------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.parser.SyntacticRelationSet; 7 | import edu.nyu.jet.tipster.Annotation; 8 | import edu.nyu.jet.tipster.Document; 9 | import opennlp.model.Event; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * A Feature extractor that checks if both arguments belong to the same syntactic constituent for supervised/simulated 16 | * 
active learning relation extraction.
 *
 * This class is not used by ICE GUI/CLI.
 *
 * @author yhe
 */
public class SameConstitFeatureExtractor implements RelationFeatureExtractor {
    /**
     * Produces an event whose features say whether an NP constituent encloses
     * both mention heads.
     *
     * The span runs from the start of m1's head to the end of m2's head; when
     * that start is not before that end, the span from m2's head start to m1's
     * head end is used instead.
     *
     * @param r   relation mention supplying the label, or null for "NONE"
     * @param doc document searched for "constit" annotations
     * @return the labelled event; its feature array is empty when no NP encloses the span
     */
    public Event extractFeatures(AceEntityMention m1,
                                 AceEntityMention m2,
                                 AceRelationMention r,
                                 Annotation sentence,
                                 SyntacticRelationSet paths,
                                 List mentions,
                                 AceDocument aceDoc,
                                 Document doc) {
        String label = r == null ? "NONE" : r.relation.type;
        int h1Start = m1.getJetHead().start();
        int h2End = m2.getJetHead().end();
        if (h1Start >= h2End) {
            h1Start = m2.getJetHead().start();
            h2End = m1.getJetHead().end();
        }
        List<String> sameConstits = extractSameConstits(h1Start, h2End, doc);
        return new Event(label,
                sameConstits.toArray(new String[sameConstits.size()]));
    }

    /**
     * Collects "SameConstit=NP" when some "constit" annotation with category NP
     * (compared case-insensitively) starts before {@code start} and ends after
     * {@code end}.
     *
     * @param start start offset of the span
     * @param end   end offset of the span
     * @param doc   document whose "constit" annotations are examined
     * @return a list holding the feature at most once; empty when nothing matches
     *         or the document has no "constit" annotations
     */
    public static List<String> extractSameConstits(int start, int end, Document doc) {
        List<String> result = new ArrayList<String>();
        List<Annotation> constits = doc.annotationsOfType("constit");
        if (constits != null) {
            for (Annotation constit : constits) {
                if (constit.start() < start && constit.end() > end) {
                    // A constituent without a string "cat" attribute used to end
                    // the whole extraction with an exception; it is now skipped.
                    Object catValue = constit.get("cat");
                    if (!(catValue instanceof String)) {
                        continue;
                    }
                    String cat = ((String) catValue).toUpperCase();
                    if (cat.equals("NP")) {
                        String feat = "SameConstit=" + cat;
                        if (!result.contains(feat)) {
                            result.add(feat);
                        }
                    }
                }
            }
        }
//        if (result.size() == 0) {
//            result.add("SameConstit=NONE");
//        }
        return result;
    }
}
--------------------------------------------------------------------------------
/src/retired/TokenFeatureExtractor.java:
--------------------------------------------------------------------------------
package edu.nyu.jet.ice.relation;

import edu.nyu.jet.aceJet.AceDocument;
import edu.nyu.jet.aceJet.AceEntityMention;
import edu.nyu.jet.aceJet.AceRelationMention;
import edu.nyu.jet.parser.SyntacticRelationSet;
import
edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import opennlp.model.Event;

import java.util.ArrayList;
import java.util.List;

/**
 * A Feature extractor using token sequence as features for supervised/simulated active learning
 * relation extraction.
 *
 * This class is not used by ICE GUI/CLI.
 *
 * @author yhe
 */
public class TokenFeatureExtractor implements RelationFeatureExtractor {
    /**
     * Produces an event whose single feature is the token sequence lying
     * between the two mention heads.
     *
     * The gap runs from the end of m1's head to the start of m2's head; when
     * that end is not before that start, the gap from the end of m2's head to
     * the start of m1's head is used instead.
     *
     * @param r   relation mention supplying the label, or null for "NONE"
     * @param doc document whose tokens are read
     * @return the labelled event
     */
    public Event extractFeatures(AceEntityMention m1,
                                 AceEntityMention m2,
                                 AceRelationMention r,
                                 Annotation sentence,
                                 SyntacticRelationSet paths,
                                 List mentions,
                                 AceDocument aceDoc,
                                 Document doc) {
        String label = r == null ? "NONE" : r.relation.type;
        int h1Start = m1.getJetHead().end();
        int h2End = m2.getJetHead().start();
        if (h1Start >= h2End) {
            h1Start = m2.getJetHead().end();
            h2End = m1.getJetHead().start();
        }
        List<String> sameTokens = extractSameTokens(h1Start, h2End, doc);
        return new Event(label,
                sameTokens.toArray(new String[sameTokens.size()]));
    }

    /**
     * Builds one feature from the "token" annotations that start after
     * {@code start} and end before {@code end}.
     *
     * Each token's text is lower-cased and trimmed, inner whitespace runs become
     * underscores, and the words are joined with ':' behind the prefix "Tokens=".
     *
     * @param start exclusive lower bound on a token's start offset
     * @param end   exclusive upper bound on a token's end offset
     * @param doc   document whose tokens are read
     * @return a one-element list: the joined feature, or "Tokens=NONE" when no
     *         token qualifies or the document has no "token" annotations
     */
    public static List<String> extractSameTokens(int start, int end, Document doc) {
        List<String> result = new ArrayList<String>();
        List<Annotation> tokens = doc.annotationsOfType("token");
        if (tokens != null) {
            StringBuilder joined = new StringBuilder();
            boolean sawToken = false;
            for (Annotation token : tokens) {
                if (token.start() > start && token.end() < end) {
                    String word = doc.text(token).toLowerCase().trim().replaceAll("\\s+", "_");
                    // Put the separator in front of every word but the first, so
                    // no trailing ':' has to be stripped afterwards.
                    if (sawToken) {
                        joined.append(':');
                    }
                    joined.append(word);
                    sawToken = true;
                }
            }
            if (sawToken) {
                result.add("Tokens=" + joined);
            }
        }
        if (result.size() == 0) {
            result.add("Tokens=NONE");
        }
        return result;
    }
}
--------------------------------------------------------------------------------
/src/retired/TokenTypeFeatureExtractor.java:
-------------------------------------------------------------------------------- 1 | package edu.nyu.jet.ice.relation; 2 | 3 | import edu.nyu.jet.aceJet.AceDocument; 4 | import edu.nyu.jet.aceJet.AceEntityMention; 5 | import edu.nyu.jet.aceJet.AceRelationMention; 6 | import edu.nyu.jet.aceJet.EventSyntacticPattern; 7 | import edu.nyu.jet.parser.SyntacticRelationSet; 8 | import edu.nyu.jet.tipster.Annotation; 9 | import edu.nyu.jet.tipster.Document; 10 | import opennlp.model.Event; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * A Feature extractor using token sequence and entity types as features for supervised/simulated active learning 17 | * relation extraction. 18 | * 19 | * This class is not used by ICE GUI/CLI. 20 | * 21 | * @author yhe 22 | */ 23 | public class TokenTypeFeatureExtractor implements RelationFeatureExtractor { 24 | public Event extractFeatures(AceEntityMention m1, 25 | AceEntityMention m2, 26 | AceRelationMention r, 27 | Annotation sentence, 28 | SyntacticRelationSet paths, 29 | List mentions, 30 | AceDocument aceDoc, 31 | Document doc) { 32 | String label = r == null ? "NONE" : r.relation.type; 33 | int h1 = m1.getJetHead().start(); 34 | int h2 = m2.getJetHead().start(); 35 | int h1Start = m1.getJetHead().end(); 36 | int h2End = m2.getJetHead().start(); 37 | if (h1 >= h2) { 38 | int tmp = h1; 39 | h1 = h2; 40 | h2 = tmp; 41 | h1Start = m2.getJetHead().end(); 42 | h2End = m1.getJetHead().start(); 43 | } 44 | String path = EventSyntacticPattern.buildSyntacticPath(h1, h2, paths); 45 | path = path == null ? 
"EMPTY" : path.replaceAll("\\s+", "_"); 46 | String type1 = m1.entity.type; 47 | String type2 = m2.entity.type; 48 | String concatTypes = type1 + ":::" + type2; 49 | //String concatAll = type1 + ":::" + path + ":::" + type2; 50 | List feats = TokenFeatureExtractor.extractSameTokens(h1Start, h2End, doc); 51 | //int pathLength = path.split(":").length; 52 | //System.err.println("Path length = " + pathLength); 53 | List updatedFeats = new ArrayList(); 54 | for (String feat : feats) { 55 | updatedFeats.add(feat); 56 | updatedFeats.add("CONJ=" + feat + ":::" + concatTypes); 57 | } 58 | if (updatedFeats.size() == 0) { 59 | updatedFeats.add("TOKEN=NONE"); 60 | updatedFeats.add("CONJ=TOKEN=NONE:::" + concatTypes); 61 | } 62 | //updatedFeats.add("PATH_LENGTH=" + pathLength); 63 | updatedFeats.add(type1); 64 | updatedFeats.add(type2); 65 | updatedFeats.add(concatTypes); 66 | return new Event(label, updatedFeats.toArray(new String[updatedFeats.size()])); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/scripts/icecli: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -Xmx4g -Dfile.encoding=UTF-8 -cp "$ICE_HOME/ice-all.jar" -DjetHome=$JET_HOME -DiceHome=$ICE_HOME edu.nyu.jet.ice.views.cli.IceCLI $@ 3 | -------------------------------------------------------------------------------- /src/scripts/icecli6: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ICE_HOME=. 3 | ICE_LIB_HOME=. 
# "$@" is quoted so arguments containing spaces reach the CLI as single arguments.
java -Xmx4g -Dfile.encoding=UTF-8 -cp "$ICE_HOME/ice-all.jar" edu.nyu.jet.ice.views.cli.IceCLI6 "$@"
--------------------------------------------------------------------------------
/src/scripts/runice.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Starts edu.nyu.jet.ice.controllers.Nice; ICE_HOME and JET_HOME must be set by the caller.
# The -D values are quoted so installation paths containing spaces stay intact.
java -Xmx4g -Dfile.encoding=UTF-8 -cp "$ICE_HOME/ice-all.jar" "-DjetHome=$JET_HOME" "-DiceHome=$ICE_HOME" edu.nyu.jet.ice.controllers.Nice
--------------------------------------------------------------------------------
/src/scripts/runtagger.sh:
--------------------------------------------------------------------------------
#!/bin/bash
ICE_HOME=.
ICE_LIB_HOME=.
# Passes at most the first three arguments, each as one word. Missing arguments
# are omitted rather than passed as empty strings, as with the unquoted form.
java -Xmx4g -Dfile.encoding=UTF-8 -cp "$ICE_HOME/ice-all.jar" AceJet.IceTagger "${@:1:3}"
--------------------------------------------------------------------------------
/src/test/resources/tinyCorpus2/doc1.txt:
--------------------------------------------------------------------------------
Here is today's news. Fred Smith lives in Chicago.
--------------------------------------------------------------------------------
/src/test/resources/tinyCorpus2/doc2.txt:
--------------------------------------------------------------------------------
Here is today's news. Harriet Smith lives in Seattle.
--------------------------------------------------------------------------------
/src/test/resources/tinyCorpus2/doc3.txt:
--------------------------------------------------------------------------------
Here is today's news. Francoise Smith lives in Paris.
--------------------------------------------------------------------------------
/src/test/resources/tinyCorpus2/doc4.txt:
--------------------------------------------------------------------------------
Here is today's news. Karl Smith lives in Berlin.
2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus3/doc1.txt: -------------------------------------------------------------------------------- 1 | Here is today's news. Fred Smith lives in Chicago. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus3/doc2.txt: -------------------------------------------------------------------------------- 1 | Here is today's news. Harriet Smith lives in Seattle. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus3/doc3.txt: -------------------------------------------------------------------------------- 1 | Here is today's news. Francoise Smith lives in Paris. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus3/doc4.txt: -------------------------------------------------------------------------------- 1 | Here is today's news. Karl Smith lives in Berlin. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus4/doc1.txt: -------------------------------------------------------------------------------- 1 | Here is today's news. Fred Smith died yesterday in Chicago, Illinois. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus4/doc2.txt: -------------------------------------------------------------------------------- 1 | Harriet Smith died yesterday in Seattle, Washington. 2 | -------------------------------------------------------------------------------- /src/test/resources/tinyCorpus4/doc3.txt: -------------------------------------------------------------------------------- 1 | Francoise Smith died yesterday in Paris, France. 
--------------------------------------------------------------------------------
/src/test/resources/tinyCorpus4/doc4.txt:
--------------------------------------------------------------------------------
Karl Smith died yesterday in Berlin, Germany.
--------------------------------------------------------------------------------
/src/test/scripts/checkCount:
--------------------------------------------------------------------------------
#!/bin/tcsh
# usage: checkCount directory expectedCount
# Reports an error, and exits with status 1, when the number of entries
# listed by ls for the directory differs from expectedCount.
set var=`ls "$1" | wc -l`
set var=$var[1]
if ($var != $2) then
    echo "error: size of $1 is $var , should be $2"
    exit 1
endif
--------------------------------------------------------------------------------
/src/test/scripts/checkLength:
--------------------------------------------------------------------------------
#!/bin/tcsh
# usage: checkLength file expectedLineCount
# Reports an error, and exits with status 1, when the line count that wc -l
# gives for the file differs from expectedLineCount.
set var=`wc -l "$1"`
set var=$var[1]
if ($var != $2) then
    echo "error: length of $1 is $var , should be $2"
    exit 1
endif
--------------------------------------------------------------------------------
/src/test/scripts/validateCLI:
--------------------------------------------------------------------------------
#!/bin/tcsh
#
#  validation script for icecli
#
#  The test corpora are read from $ICE_TEST_RESOURCES when that variable is
#  set; otherwise the original location is used.
#
if ($?ICE_TEST_RESOURCES) then
    set resources = "$ICE_TEST_RESOURCES"
else
    set resources = /misc/proteus107/grishman/ice/ice/src/test/resources
endif
pwd
#  -f keeps the cleanup quiet when a previous run left nothing behind
\rm -f ice.yml
\rm -rf cache
#
#  create a few small corpora (2 and 3 are identical)
#
icecli addCorpus tinyCorpus2 --inputDir $resources/tinyCorpus2 --filter txt
icecli addCorpus tinyCorpus3 --inputDir $resources/tinyCorpus3 --filter txt
icecli addCorpus tinyCorpus4 --inputDir $resources/tinyCorpus4 --filter txt
#
#  test mergeCorporaInto
#
icecli mergeCorporaInto mergedCorpus1 --targetDir mergedDocs1 --filter txt --fromCorpora tinyCorpus2,tinyCorpus3
icecli mergeCorporaInto mergedCorpus2 --targetDir mergedDocs2 --filter txt --fromCorpora tinyCorpus2,tinyCorpus4
#
#  test addCorpus with multiple processes
#
icecli addCorpus tinyCorpusPar --inputDir $resources/tinyCorpus2 --filter txt --processes 2
#
checkLength cache/tinyCorpus2/docList 4
checkLength cache/tinyCorpusPar/docList 4
checkCount cache/tinyCorpus2/preprocess 25
checkCount cache/tinyCorpusPar/preprocess 25
checkLength cache/mergedCorpus1/docList 8
checkLength cache/mergedCorpus2/docList 8
checkLength cache/mergedCorpus1/counts 13 # 3 header + 4 gpe + 4 person + 2 nn
checkLength cache/mergedCorpus2/counts 18 # 3 header + 8 gpe + 4 person + 3 nn
checkLength cache/mergedCorpus1/Relations 4 # one 'lives' relation in each doc
checkCount mergedDocs1 8
checkCount mergedDocs2 8
checkCount cache/mergedCorpus1/preprocess 49
--------------------------------------------------------------------------------