├── ranking ├── model.bin ├── models │ ├── base_ranker.py │ └── mr_ranker.py ├── all_rules.txt ├── gaussian_binner.py ├── main.py └── features │ └── feature_extractor.py ├── .gitignore ├── DiscourseSimplification ├── README.md ├── src │ ├── main │ │ ├── java │ │ │ └── org │ │ │ │ └── lambda3 │ │ │ │ └── text │ │ │ │ └── simplification │ │ │ │ └── discourse │ │ │ │ ├── processing │ │ │ │ ├── ProcessingType.java │ │ │ │ ├── SentencePreprocessor.java │ │ │ │ └── ExtendedDiscourseSimplifier.java │ │ │ │ ├── utils │ │ │ │ ├── IDGenerator.java │ │ │ │ ├── ner │ │ │ │ │ ├── NERStringParseException.java │ │ │ │ │ ├── NERExtractionUtils.java │ │ │ │ │ ├── tner │ │ │ │ │ │ ├── TNERString.java │ │ │ │ │ │ └── TNERToken.java │ │ │ │ │ ├── NERToken.java │ │ │ │ │ ├── NERTokenGroup.java │ │ │ │ │ ├── NERString.java │ │ │ │ │ └── NERStringParser.java │ │ │ │ ├── parseTree │ │ │ │ │ ├── ParseTreeException.java │ │ │ │ │ ├── ParseTreeParser.java │ │ │ │ │ └── ParseTreeVisualizer.java │ │ │ │ ├── ConfigUtils.java │ │ │ │ ├── IndexRange.java │ │ │ │ ├── pos │ │ │ │ │ ├── POSToken.java │ │ │ │ │ └── POSTagger.java │ │ │ │ ├── sentences │ │ │ │ │ └── SentencesUtils.java │ │ │ │ └── words │ │ │ │ │ └── WordsUtils.java │ │ │ │ ├── runner │ │ │ │ └── discourse_tree │ │ │ │ │ ├── extraction │ │ │ │ │ ├── rules │ │ │ │ │ │ ├── ListNP │ │ │ │ │ │ │ ├── PreListNPExtractor.java │ │ │ │ │ │ │ ├── PostListNPExtractor.java │ │ │ │ │ │ │ └── ListNPExtractor.java │ │ │ │ │ │ ├── ReferenceInitialAdverbialExtractor.java │ │ │ │ │ │ ├── ReferenceInitialConjunctionExtractor.java │ │ │ │ │ │ ├── ReferenceMedialAdverbialExtractor.java │ │ │ │ │ │ ├── ReferenceFinalAdverbialExtractor.java │ │ │ │ │ │ ├── LeadNPExtractor.java │ │ │ │ │ │ ├── AdjectivalAdverbialInitialExtractor.java │ │ │ │ │ │ ├── SubordinationPreExtractor.java │ │ │ │ │ │ ├── PurposePreExtractor.java │ │ │ │ │ │ ├── PurposePostExtractor.java │ │ │ │ │ │ ├── SubordinationPostExtractor2.java │ │ │ │ │ │ ├── SubordinationPostExtractor.java │ 
│ │ │ │ │ ├── SubordinationPrePurposeExtractor.java │ │ │ │ │ │ ├── SubordinationPostPurposeExtractor.java │ │ │ │ │ │ ├── AdjectivalAdverbialMiddleFinalExtractor.java │ │ │ │ │ │ ├── SharedNPPostCoordinationExtractor.java │ │ │ │ │ │ ├── SubordinationPostAttributionExtractor2.java │ │ │ │ │ │ ├── SubordinationPostAttributionExtractor.java │ │ │ │ │ │ ├── NonRestrictiveRelativeClauseWhereExtractor.java │ │ │ │ │ │ ├── RestrictiveParticipialExtractor.java │ │ │ │ │ │ ├── SharedNPPreParticipalExtractor.java │ │ │ │ │ │ ├── SharedNPPostParticipalExtractor.java │ │ │ │ │ │ └── NonRestrictiveRelativeClausePrepWhichWhoExtractor.java │ │ │ │ │ └── utils │ │ │ │ │ │ └── TregexUtils.java │ │ │ │ │ ├── model │ │ │ │ │ ├── SentenceLeaf.java │ │ │ │ │ ├── Invalidation.java │ │ │ │ │ └── Leaf.java │ │ │ │ │ └── Relation.java │ │ │ │ ├── model │ │ │ │ ├── TimeInformation.java │ │ │ │ ├── serializer │ │ │ │ │ ├── TreeSerializer.java │ │ │ │ │ └── TreeDeserializer.java │ │ │ │ ├── LinkedContext.java │ │ │ │ ├── OutSentence.java │ │ │ │ ├── Content.java │ │ │ │ └── Element.java │ │ │ │ └── App.java │ │ └── resources │ │ │ ├── logback.xml │ │ │ ├── attribution_verbs.conf │ │ │ └── cue_phrases.conf │ └── test │ │ └── java │ │ └── org │ │ └── lambda3 │ │ └── text │ │ └── simplification │ │ └── discourse │ │ ├── processing │ │ ├── SentencePreprocessorTest.java │ │ └── DiscourseSimplifierTest.java │ │ └── utils │ │ └── words │ │ └── WordUtilsTest.java └── DiscourseSimplification.iml ├── generate_candidates.py └── README.md /ranking/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mounicam/controllable_simplification/HEAD/ranking/model.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | DiscourseSimplification/input.txt 3 | DiscourseSimplification/output* 4 | 
DiscourseSimplification/target 5 | sample_data 6 | -------------------------------------------------------------------------------- /DiscourseSimplification/README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/Lambda-3/DiscourseSimplification.svg?branch=master)](https://travis-ci.org/Lambda-3/DiscourseSimplification) 2 | 3 | # Discourse Simplification 4 | 5 | A project for simplifying sentences wrt. discourse/rhetorical structures. 6 | 7 | This is the core component of the [Graphene](https://github.com/Lambda-3/Graphene) project. 8 | 9 | ## Setup 10 | 11 | mvn clean install -DskipTests 12 | 13 | ### Run the program 14 | Create a new text file with the input 15 | 16 | vim input.txt 17 | 18 | Run program 19 | 20 | mvn clean compile exec:java 21 | 22 | Inspect output 23 | 24 | cat output_default.txt 25 | cat output_flat.txt 26 | 27 | ## Use as library 28 | Check `App.java`. 29 | Or its usage in the [Graphene](https://github.com/Lambda-3/Graphene) project. 
class BaseRanker:
    """Feed-forward scorer that maps a feature vector to a single score.

    The network is created by :meth:`set_model`; raw features are expanded
    by ``self.binner`` (a ``GaussianBinner``) before being scored in
    :meth:`predict`.  ``epochs`` and ``lr`` are only stored here; nothing in
    this class reads them (presumably subclasses use them for training).
    """

    def __init__(self, epochs, lr):
        self.epochs = epochs
        self.model = None  # built later by set_model()
        self.lr = lr

        self.binner = GaussianBinner()

    def set_model(self, d_in, dropout=0.2):
        """Build the network: three Tanh hidden layers of width 100, one output.

        Args:
            d_in: number of input features *after* the binner's expansion.
            dropout: dropout probability applied after every hidden layer.
        """
        h, d_out = 100, 1
        self.model = torch.nn.Sequential(
            torch.nn.Linear(d_in, h),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(h, h),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(h, h),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(h, d_out)
        )

    def set_device(self, device):
        """Move the model's parameters to ``device`` (requires set_model first)."""
        self.model.to(device)

    def predict(self, test_x, test_segs):
        """Score every row of ``test_x``.

        Args:
            test_x: 2-D array-like of raw features, one row per candidate.
            test_segs: passed through untouched so callers can keep scores
                and segments together.

        Returns:
            ``(scores, test_segs)`` where ``scores`` is a list holding the
            single network output for each row.

        NOTE(review): this method does not switch the model to eval mode, so
        dropout stays active unless the caller has called
        ``self.model.eval()`` -- confirm that callers do.
        """
        test_x = self.binner.transform(np.array(test_x))

        # Put the inputs on whatever device the model lives on; otherwise
        # scoring fails after set_device() has moved the model off the CPU.
        device = next(self.model.parameters()).device
        inputs = torch.as_tensor(test_x, dtype=torch.float32, device=device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            scores = self.model(inputs).cpu().numpy()

        return [score[0] for score in scores], test_segs
def main(args):
    """Run DisSim on ``args.input`` and write its candidates to ``args.output``.

    The input file is copied to ``DiscourseSimplification/input.txt``, the
    Maven project in ``DiscourseSimplification`` is compiled and executed,
    and ``dissim.generate_candidates`` is called with the input path and
    ``DiscourseSimplification/output_dt.txt``.  Each entry it yields is
    written as one tab-joined line.

    Args:
        args: namespace with ``input`` and ``output`` file paths.

    Raises:
        subprocess.CalledProcessError: if the Maven run exits with a
            non-zero status (the previous ``os.system`` call ignored this
            and carried on with whatever output file was lying around).
    """
    # Local imports keep this block self-contained with respect to the
    # file's existing import list.
    import shutil
    import subprocess

    # Runs DisSim to generate candidates.
    # Copy with shutil rather than a shell "cp" so paths containing spaces
    # or shell metacharacters are handled safely.
    shutil.copyfile(args.input, "DiscourseSimplification/input.txt")
    # cwd= scopes the directory change to the child process, so this
    # process's working directory never changes.
    subprocess.run(["mvn", "clean", "compile", "exec:java"],
                   cwd="DiscourseSimplification", check=True)
    dissim_candidates = dissim.generate_candidates(args.input, "DiscourseSimplification/output_dt.txt")

    # TODO: add neural splitter candidates.

    # The context manager closes the file even if a write fails.
    with open(args.output, "w") as fpout:
        for candidates in dissim_candidates:
            fpout.write("\t".join(candidates) + "\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate DisSim candidates that have undergone splitting and deletion.')
    # Both paths are mandatory: main() cannot do anything useful without
    # them, so fail with a usage message instead of a TypeError later on.
    parser.add_argument('--input', required=True,
                        help="Input sentences with one sentence in each line.")
    # NOTE(review): the field names between the "|||" separators in the help
    # text below look like they were lost -- confirm the intended format.
    parser.add_argument('--output', required=True,
                        help="Candidates for each input sentence seperated by tabs. \n"
                             "The format for each candidate is "
                             "||||||")
    args = parser.parse_args()
    main(args)
public class IDGenerator {

    /**
     * Creates a new random identifier.
     *
     * @return the text form of a freshly generated random {@link UUID} with
     *         every hyphen removed
     */
    public static String generateUUID() {
        final String hyphenated = UUID.randomUUID().toString();
        // Literal replacement; no regular expression is needed for a single hyphen.
        return hyphenated.replace("-", "");
    }
}
/**
 * Checked exception carrying a caller-supplied detail message.
 */
public class NERStringParseException extends Exception {

    /**
     * @param msg detail message, later available through {@link #getMessage()}
     */
    public NERStringParseException(final String msg) {
        super(msg);
    }
}
/**
 * Checked exception whose message quotes the text that could not be parsed.
 */
public class ParseTreeException extends Exception {

    /**
     * @param text the text that failed to parse; it is embedded, in double
     *             quotes, in the exception message
     */
    public ParseTreeException(String text) {
        super(describe(text));
    }

    /** Builds the exception message for the given text. */
    private static String describe(String text) {
        return "Failed to parse text: \"" + text + "\"";
    }
}
VP))"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/model/TimeInformation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : TimeInformation 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
/**
 * Holder for a single string value.
 */
public class TimeInformation {
    private String value;

    /**
     * No-argument constructor kept for deserialization; leaves the value {@code null}.
     */
    public TimeInformation() {
        this(null);
    }

    /**
     * @param value the string to store
     */
    public TimeInformation(String value) {
        this.value = value;
    }

    /**
     * @return the stored string, or {@code null} if none was set
     */
    public String getValue() {
        return this.value;
    }
}
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules.ListNP; 24 | 25 | /** 26 | * 27 | */ 28 | public class PostListNPExtractor extends ListNPExtractor { 29 | 30 | public PostListNPExtractor() { 31 | super("ROOT <<: (S < (NP $.. (VP << (NP=np))))"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ConfigUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ConfigUtils 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils; 24 | 25 | import com.typesafe.config.Config; 26 | import com.typesafe.config.ConfigRenderOptions; 27 | 28 | /** 29 | * 30 | */ 31 | public class ConfigUtils { 32 | public static String prettyPrint(Config config) { 33 | return config == null 34 | ? 
null 35 | : config.root().render(ConfigRenderOptions.concise().setFormatted(true)); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/model/SentenceLeaf.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SentenceLeaf 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.model; 24 | 25 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 26 | 27 | /** 28 | * 29 | */ 30 | public class SentenceLeaf extends Leaf { 31 | 32 | public SentenceLeaf(String sentence, int sentenceIdx) throws ParseTreeException { 33 | super("SENTENCE", sentence); 34 | this.setRecursiveUnsetSentenceIdx(sentenceIdx); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ranking/all_rules.txt: -------------------------------------------------------------------------------- 1 | PP_Shortening 2 | ParticipialMiddleExtractor 3 | ReferenceInitialAdverbialExtractor 4 | PostListNPExtractor 5 | RestrictiveParticipialExtractor 6 | NonRestrictiveRelativeClausePrepWhichWhoExtractor 7 | SubordinationPostExtractor 8 | PrepositionalAttachedtoVPExtractor 9 | SubordinationPostAttributionExtractor2 10 | SubordinationPostPurposeExtractor 11 | RestrictiveRelativeClauseWhoWhichExtractor 12 | AdjectivalAdverbialInitialExtractor 13 | SubordinationPrePurposeExtractor 14 | PurposePreExtractor 15 | NonRestrictiveRelativeClauseWhoseExtractor 16 | RestrictiveRelativeClauseWithoutRelativePronounExtractor 17 | NonRestrictiveAppositionExtractor 18 | PrepositionalMiddleFinalExtractor 19 | NonRestrictiveRelativeClauseWhoWhichExtractor 20 | NonRestrictiveRelativeClauseWhereExtractor 21 | SBAR_Shortening 22 | SharedNPPostParticipalExtractor 23 | SubordinationPreExtractor 24 | PrepositionalInitialExtractor 25 | XPOverXP 26 | PreListNPExtractor 27 | RestrictiveAppositionExtractor 28 | PreAttributionExtractor 29 | TimeExpressions 30 | CoordinationExtractor 31 | ReferenceFinalAdverbialExtractor 32 | SharedNPPreParticipalExtractor 33 | PurposePostExtractor 34 | ProjectionPrinciple 35 | NonRestrictiveRelativeClauseWhomExtractor 36 | PreposedAdjuncts 37 | 
AdjectivalAdverbialMiddleFinalExtractor 38 | RestrictiveRelativeClauseWhoseExtractor 39 | SubordinationPostAttributionExtractor 40 | SharedNPPostCoordinationExtractor 41 | ReferenceInitialConjunctionExtractor 42 | QuotedAttributionPostExtractor 43 | LeadNPExtractor 44 | QuotedAttributionPreExtractor 45 | ReferenceMedialAdverbialExtractor 46 | Transformer 47 | DisSim 48 | HT 49 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/IndexRange.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : IndexRange 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
/**
 * Immutable pair of integer indices: a "from" index and a "to" index.
 */
public class IndexRange {
    private final int fromIdx;
    private final int toIdx;

    /**
     * @param fromIdx first index of the pair
     * @param toIdx   second index of the pair
     */
    public IndexRange(int fromIdx, int toIdx) {
        this.fromIdx = fromIdx;
        this.toIdx = toIdx;
    }

    /** @return the "from" index given at construction */
    public int getFromIdx() {
        return this.fromIdx;
    }

    /** @return the "to" index given at construction */
    public int getToIdx() {
        return this.toIdx;
    }

    /** @return the pair rendered as {@code (from | to)} */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("(").append(fromIdx).append(" | ").append(toIdx).append(")");
        return text.toString();
    }
}
/**
 * Static helper that converts the token groups of a {@link NERString} into
 * {@link IndexRange} objects.
 */
public class NERExtractionUtils {

    /**
     * Builds one {@link IndexRange} per group returned by
     * {@code nerString.getGroups()}, using the group's from/to token indices.
     *
     * NOTE(review): the raw {@code List} types here look like generic
     * parameters that were lost in extraction (presumably
     * {@code List<IndexRange>}) -- confirm against the real file.
     *
     * @param nerString the NER string whose groups are converted
     * @return a new list with the ranges in the order the groups are iterated
     */
    public static List getNERIndexRanges(NERString nerString) {
        List res = new ArrayList<>();

        for (NERTokenGroup group : nerString.getGroups()) {
            res.add(new IndexRange(group.getFromTokenIndex(), group.getToTokenIndex()));
        }

        return res;
    }
}
/**
 * A {@link NERString} that additionally keeps the parse {@link Tree} it was
 * built with.
 */
public class TNERString extends NERString {
    // Parse tree handed to the constructor; exposed unchanged by getParseTree().
    private final Tree parseTree;

    /**
     * NOTE(review): {@code List tokens} is a raw type here; the element type
     * was presumably lost in extraction. The cast below requires every
     * element to be a {@link TNERToken}.
     *
     * @param tokens    tokens of this string; a copy of the list is passed to the superclass
     * @param parseTree parse tree to keep alongside the tokens
     */
    public TNERString(List tokens, Tree parseTree) {
        super(new ArrayList<>(tokens));
        this.parseTree = parseTree;
        // Give every token a back-reference to this string.
        this.tokens.forEach(t -> ((TNERToken) t).setNerString(this));
    }

    /**
     * @return the parse tree given at construction
     */
    public Tree getParseTree() {
        return parseTree;
    }
}
def gaussian(diff, sig):
    """Return the unnormalised Gaussian kernel ``exp(-diff**2 / (2 * sig**2))``."""
    return np.exp(-np.power(diff, 2.) / (2 * sig * sig))


class GaussianBinner:
    """Expand numeric features into soft (Gaussian) bin memberships.

    ``fit`` learns, for each of the leading columns, a set of bin centres
    and a kernel width; ``transform`` replaces each of those columns with
    its normalised Gaussian response to every centre.

    Args:
        bins: passed to ``np.histogram`` as ``bins`` for every feature.
        w: kernel sigma expressed as a fraction of the first bin's width.
    """

    def __init__(self, bins=10, w=0.2):
        self.bin_values, self.sigmas = [], []
        self.bins = bins
        self.width = w
        self.eps = 0.000001  # stand-in divisor for rows whose responses sum to 0

    def fit(self, x, features_to_be_binned=7):
        """Learn bin centres and sigmas from the first columns of ``x``.

        Args:
            x: 2-D numpy array, one row per sample.
            features_to_be_binned: number of leading columns to bin.
        """
        # Start from a clean slate: appending to the results of an earlier
        # fit() would leave transform() reading the stale first entries.
        self.bin_values, self.sigmas = [], []

        for index in range(features_to_be_binned):
            edges = np.histogram(x[:, index], bins=self.bins)[1]
            bin_width = abs(edges[1] - edges[0])
            half_width = bin_width / 2.0

            # Midpoint of every histogram bin ...
            centres = [(lo + hi) / 2.0 for lo, hi in zip(edges[:-1], edges[1:])]
            # ... plus one extra centre half a bin beyond each outer edge.
            centres = [edges[0] - half_width] + centres + [edges[-1] + half_width]

            self.bin_values.append(centres)
            self.sigmas.append(bin_width * self.width)

    def transform(self, x, features_to_be_binned=7):
        """Return ``x`` with its leading columns replaced by bin memberships.

        Args:
            x: 2-D numpy array, one row per sample.
            features_to_be_binned: number of leading columns to expand; fit()
                must have been called with at least this many.

        Returns:
            2-D array made of the untouched trailing columns followed by, for
            each binned column, one value per centre. Each such group is
            divided by its row sum (or by ``self.eps`` when that sum is 0).
        """
        expanded_features = [x[:, features_to_be_binned:]]
        for index in range(features_to_be_binned):
            centres = np.array(self.bin_values[index])

            # Broadcasting (n, 1) against (k,) gives every sample/centre
            # difference without building a tiled copy of the column.
            projected = gaussian(x[:, index][:, np.newaxis] - centres, self.sigmas[index])

            row_sums = np.sum(projected, axis=1)
            row_sums[row_sums == 0] = self.eps  # avoid division by zero
            expanded_features.append(projected / row_sums[:, np.newaxis])

        return np.concatenate(expanded_features, axis=1)
redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.model; 24 | 25 | import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; 26 | 27 | import java.util.ArrayList; 28 | import java.util.Collections; 29 | import java.util.List; 30 | 31 | /** 32 | * 33 | */ 34 | public class Invalidation extends DiscourseTree { 35 | 36 | public Invalidation() { 37 | super(""); 38 | } 39 | 40 | // VISUALIZATION /////////////////////////////////////////////////////////////////////////////////////////////////// 41 | 42 | @Override 43 | public List getPTPCaption() { 44 | return Collections.singletonList("INVALIDATED"); 45 | } 46 | 47 | @Override 48 | public List getPTPEdges() { 49 | return new ArrayList<>(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/model/serializer/TreeSerializer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : TreeSerializer 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is 
free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model.serializer; 24 | 25 | import com.fasterxml.jackson.core.JsonGenerator; 26 | import com.fasterxml.jackson.databind.SerializerProvider; 27 | import com.fasterxml.jackson.databind.ser.std.StdSerializer; 28 | import edu.stanford.nlp.trees.Tree; 29 | 30 | import java.io.IOException; 31 | 32 | /** 33 | * 34 | */ 35 | public class TreeSerializer extends StdSerializer { 36 | 37 | public TreeSerializer() { 38 | this(null); 39 | } 40 | 41 | protected TreeSerializer(Class t) { 42 | super(t); 43 | } 44 | 45 | @Override 46 | public void serialize(Tree value, JsonGenerator gen, SerializerProvider provider) throws IOException { 47 | gen.writeString(value.pennString().trim().replaceAll("\\s+", " ").replaceAll("[\\n\\t]", "")); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Controllable Text Simplification with Explicit Paraphrasing 2 | 3 | This repository contains the code and resources from the following [paper](https://arxiv.org/pdf/2010.11004.pdf). Our approach simplifies the given complex sentence in three steps: 4 | 5 | 1. 
Generate candidates for an input sentence using [DisSim](https://www.aclweb.org/anthology/P19-1333.pdf) and a neural sentence splitter. DisSim is a rule-based approach proposed by Niklaus et al. (2019) that uses 35 syntactic rules to split a sentence. 6 | 7 | 1. Rank the candidates that have undergone splitting and deletion based on the quality of simplification. 8 | 9 | 1. Pass the best-ranked candidate to the paraphrase generation Transformer model. 10 | 11 | 12 | ## Candidate Generation: 13 | 14 | First, you need to install the DiscourseSimplification code. We use the same code from [this](https://github.com/Lambda-3/DiscourseSimplification) repo. 15 | 16 | ``` 17 | cd DiscourseSimplification 18 | mvn clean install -DskipTests 19 | ``` 20 | 21 | To generate the candidates, you can use the following command: 22 | 23 | ```python3 generate_candidates.py --input <input_file> --output <output_file> ``` 24 | 25 | ## Candidate Ranking: 26 | 27 | To rank the candidates generated in the previous step, you can use the following command: 28 | 29 | ``` 30 | python3 ranking/main.py --input <input_file> --candidates <candidates_file> --output <output_file> 31 | ``` 32 | 33 | ## Paraphrase Generation: 34 | 35 | Coming Soon. 
36 | 37 | ## Citation 38 | Please cite if you use the above resources for your research 39 | ``` 40 | @InProceedings{NAACL-2021-Maddela, 41 | author = "Maddela, Mounica and Alva-Manchego, Fernando and Xu, Wei", 42 | title = "Controllable Text Simplification with Explicit Paraphrasing", 43 | booktitle = "Proceedings of the North American Association for Computational Linguistics (NAACL)", 44 | year = "2021", 45 | } 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERToken.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : NERToken 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.ner; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | 27 | /** 28 | * 29 | */ 30 | public class NERToken { 31 | protected final int index; 32 | protected final String text; 33 | protected final String category; 34 | 35 | public NERToken(int index, String text, String category) { 36 | this.index = index; 37 | this.text = text; 38 | this.category = category; 39 | } 40 | 41 | public int getIndex() { 42 | return index; 43 | } 44 | 45 | public String getText() { 46 | return text; 47 | } 48 | 49 | public Word getWord() { 50 | return new Word(text); 51 | } 52 | 53 | public String getCategory() { 54 | return category; 55 | } 56 | 57 | @Override 58 | public String toString() { 59 | return "(" + index + ": " + category + ", '" + text + "')"; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/model/LinkedContext.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : LinkedContext 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model; 24 | 25 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 26 | 27 | public class LinkedContext { 28 | private String targetID; 29 | private Relation relation; 30 | 31 | // for deserialization 32 | public LinkedContext() { 33 | } 34 | 35 | public LinkedContext(String targetID, Relation relation) { 36 | this.targetID = targetID; 37 | this.relation = relation; 38 | } 39 | 40 | public String getTargetID() { 41 | return targetID; 42 | } 43 | 44 | public Element getTargetElement(SimplificationContent content) { 45 | return content.getElement(targetID); 46 | } 47 | 48 | public Relation getRelation() { 49 | return relation; 50 | } 51 | 52 | @Override 53 | public boolean equals(Object o) { 54 | return ((o instanceof LinkedContext) 55 | && (((LinkedContext) o).targetID.equals(targetID)) 56 | && (((LinkedContext) o).relation.equals(relation))); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /ranking/main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | from models import mr_ranker 4 | from features.feature_extractor import FeatureExtractor 5 | 6 | 7 | def rerank(segs, segs_feats, model): 8 | score_map = {} 9 | predicted_scores, segs = model.predict(segs_feats, segs) 10 | for ind, seg in enumerate(segs): 11 | score_map[seg] = predicted_scores[ind] 12 | return sorted(score_map.keys(), key=score_map.__getitem__, reverse=True) 13 | 14 | 15 | def main(args): 16 | 17 | feature_extractor = FeatureExtractor() 18 | test_feats, test_cands, test_src = feature_extractor.get_features(args.input, args.candidates) 19 | 20 | model = 
def rerank(segs, segs_feats, model):
    """Return the candidates ordered from highest to lowest predicted score.

    Args:
        segs: Candidate simplifications for one source sentence.
        segs_feats: Features for those candidates, passed through to the model.
        model: Object whose ``predict(segs_feats, segs)`` returns a pair
            ``(scores, segs)`` with one score per returned candidate.

    Returns:
        List of distinct candidates, best first.  A candidate that occurs
        more than once keeps the score of its last occurrence.
    """
    predicted_scores, segs = model.predict(segs_feats, segs)
    score_map = {seg: predicted_scores[ind] for ind, seg in enumerate(segs)}
    return sorted(score_map, key=score_map.__getitem__, reverse=True)


def main(args):
    """Rank the candidates of every input sentence and write the best one per line.

    Args:
        args: Parsed command-line namespace with ``input``, ``candidates``,
            ``model`` and ``output`` attributes.
    """
    feature_extractor = FeatureExtractor()
    test_feats, test_cands, test_src = feature_extractor.get_features(args.input, args.candidates)

    # NOTE(review): torch.load unpickles arbitrary Python objects, so --model
    # must only ever point at a trusted file.
    model = torch.load(args.model)
    model.model.eval()

    top_simplifications = []
    for i, (segs_feats, segs, src) in enumerate(zip(test_feats, test_cands, test_src)):

        # Coarse progress indicator.
        if i % 1000 == 0:
            print(i)

        if len(segs) == 0:
            # No candidate available: fall back to the unchanged source sentence.
            top_simplifications.append([src])
        else:
            top_simplifications.append(rerank(segs, segs_feats, model))

    if args.output is not None:
        # Context manager closes the file even if a write fails part-way.
        with open(args.output, 'w') as fp:
            fp.writelines(segs[0] + "\n" for segs in top_simplifications)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', dest='model', default='ranking/model.bin', type=str)
    parser.add_argument("--output", dest="output", help="Best ranked candidate.", type=str)
    parser.add_argument('--input', help="Input sentences with one sentence in each line.")
    parser.add_argument('--candidates', help="Candidates for each input sentence seperated by tabs. \n"
                                             "The format for each candidate is "
                                             "||||||")
    args = parser.parse_args()
    main(args)
12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model.serializer; 24 | 25 | import com.fasterxml.jackson.core.JsonParser; 26 | import com.fasterxml.jackson.core.JsonProcessingException; 27 | import com.fasterxml.jackson.databind.DeserializationContext; 28 | import com.fasterxml.jackson.databind.deser.std.StdDeserializer; 29 | import edu.stanford.nlp.trees.PennTreeReader; 30 | import edu.stanford.nlp.trees.Tree; 31 | 32 | import java.io.IOException; 33 | import java.io.StringReader; 34 | 35 | /** 36 | * 37 | */ 38 | public class TreeDeserializer extends StdDeserializer { 39 | 40 | public TreeDeserializer() { 41 | this(null); 42 | } 43 | 44 | protected TreeDeserializer(Class vc) { 45 | super(vc); 46 | } 47 | 48 | @Override 49 | public Tree deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException { 50 | String pennString = p.getValueAsString(); 51 | return new PennTreeReader(new StringReader(pennString)).readTree(); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/test/java/org/lambda3/text/simplification/discourse/processing/SentencePreprocessorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SentencePreprocessorTest 4 | * 5 | * Copyright © 2018 Lambda³ 6 | * 7 | * GNU General Public 
License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.processing; 24 | 25 | import com.typesafe.config.Config; 26 | import com.typesafe.config.ConfigFactory; 27 | import org.junit.jupiter.api.Assertions; 28 | import org.junit.jupiter.api.Test; 29 | 30 | public class SentencePreprocessorTest { 31 | private final Config config = ConfigFactory.load().getConfig("discourse-simplification"); 32 | private final SentencePreprocessor preprocessor = new SentencePreprocessor(config); 33 | 34 | @Test 35 | void preprocessSentence() { 36 | preprocessor.setRemoveBrackets(true); 37 | 38 | String sentence = "This is a test (in brackets) and [the last (one)]."; 39 | String sentence2 = "This is -LRB- a second test -RRB-."; 40 | 41 | String psentence = preprocessor.preprocessSentence(sentence); 42 | Assertions.assertEquals("This is a test and .", psentence); 43 | 44 | String psentence2 = preprocessor.preprocessSentence(sentence2); 45 | Assertions.assertEquals("This is .", psentence2); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/test/java/org/lambda3/text/simplification/discourse/utils/words/WordUtilsTest.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : WordUtilsTest 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.words; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import org.junit.jupiter.api.Assertions; 27 | import org.junit.jupiter.api.Test; 28 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 29 | 30 | import java.util.Arrays; 31 | import java.util.List; 32 | 33 | /** 34 | * 35 | */ 36 | class WordUtilsTest { 37 | 38 | @Test 39 | void wordsToProperSentence() throws Exception { 40 | List words = Arrays.asList( 41 | new Word("."), 42 | new Word("."), 43 | new Word("hello"), 44 | new Word(","), 45 | new Word(","), 46 | new Word("this"), 47 | new Word("is"), 48 | new Word("a"), 49 | new Word("test"), 50 | new Word("."), 51 | new Word(".") 52 | ); 53 | 54 | String sentence = WordsUtils.wordsToProperSentenceString(words); 55 | Assertions.assertEquals("Hello , this is a test .", sentence); 56 | } 57 | } 
-------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/pos/POSTagger.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : POSTagger 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.pos; 24 | 25 | import edu.stanford.nlp.tagger.maxent.MaxentTagger; 26 | 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | 30 | /** 31 | * 32 | */ 33 | public class POSTagger { 34 | private static final MaxentTagger TAGGER = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger"); 35 | 36 | public static List parse(String text) { 37 | List tokens = new ArrayList<>(); 38 | 39 | String posString = TAGGER.tagString(text); 40 | 41 | String[] posTokens = posString.split(" "); 42 | 43 | int idx = 0; 44 | for (String posToken : posTokens) { 45 | int sep_idx = posToken.lastIndexOf("_"); 46 | 47 | // create text 48 | String txt = posToken.substring(0, sep_idx); 49 | String pos = posToken.substring(sep_idx + 1); 50 | POSToken token = new POSToken(idx, txt, pos); 51 | tokens.add(token); 52 | 53 | ++idx; 54 | } 55 | 56 | return tokens; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | %msg%n%n 32 | 33 | 34 | 35 | 36 | ${LOG_DIR}/out.log 37 | false 38 | 39 | 40 | %msg%n%n 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERTokenGroup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : NERTokenGroup 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free 
/**
 * A run of NER tokens handled as one unit; the category of the group is
 * taken from its first token.
 *
 * NOTE(review): the element type parameters of the lists appear to be missing
 * from this listing (the code reads {@code index}, {@code getCategory()} and
 * {@code getText()} from the elements, which suggests NERToken) - confirm
 * against the original file.
 */
class NERTokenGroup {
    private final List tokens;

    /**
     * @param tokens tokens of the group; the list is stored as-is, not copied
     */
    public NERTokenGroup(List tokens) {
        this.tokens = tokens;
    }

    /** @return the {@code index} of the first token; fails on an empty group */
    public int getFromTokenIndex() {
        return tokens.get(0).index;
    }

    /** @return the {@code index} of the last token; fails on an empty group */
    public int getToTokenIndex() {
        return tokens.get(tokens.size() - 1).index;
    }

    /** @return the very list passed to the constructor */
    public List getTokens() {
        return tokens;
    }

    // Category of the first token stands for the whole group.
    private String getCategory() {
        return tokens.get(0).getCategory();
    }

    /** @return {@code true} unless the group's category equals {@code NERString.NO_CATEGORY} */
    public boolean isNamedEntity() {
        return !getCategory().equals(NERString.NO_CATEGORY);
    }

    /** @return a new list with one freshly created {@link Word} per token text */
    public List getWords() {
        return tokens.stream().map(t -> new Word(t.getText())).collect(Collectors.toList());
    }

    /** @return the tokens, one per tab-indented line, wrapped in square brackets */
    @Override
    public String toString() {
        return "[\n" + tokens.stream().map(t -> "\t" + t.toString()).collect(Collectors.joining("\n")) + "\n]";
    }
}
/DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/tner/TNERToken.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : TNERToken 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.ner.tner; 24 | 25 | import edu.stanford.nlp.trees.Tree; 26 | import org.lambda3.text.simplification.discourse.utils.ner.NERToken; 27 | 28 | /** 29 | * 30 | */ 31 | public class TNERToken extends NERToken { 32 | 33 | private final Tree leafNode; 34 | private TNERString nerString; 35 | private Tree posNode; 36 | 37 | public TNERToken(int index, String token, String category, Tree leafNode) { 38 | super(index, token, category); 39 | this.nerString = null; 40 | this.leafNode = leafNode; 41 | this.posNode = null; // wait until nerString is set 42 | } 43 | 44 | public void setNerString(TNERString nerString) { 45 | this.nerString = nerString; 46 | this.posNode = leafNode.parent(getParseTree()); 47 | } 48 | 49 | private Tree getParseTree() { 50 | return nerString.getParseTree(); 51 | } 52 | 53 | public Tree getLeafNode() { 54 | return leafNode; 55 | } 56 | 57 | public Tree getPosNode() { 58 | return posNode; 59 | } 60 | 61 | private String getPOSTag() { 62 | return posNode.value(); 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "(" + index + ": " + category + ", '" + text + "', " + getPOSTag() + ")"; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ParseTreeParser 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of 
the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.parseTree; 24 | 25 | import edu.stanford.nlp.ling.CoreLabel; 26 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 27 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 28 | import edu.stanford.nlp.process.PTBTokenizer; 29 | import edu.stanford.nlp.process.TokenizerFactory; 30 | import edu.stanford.nlp.trees.Tree; 31 | 32 | import java.io.StringReader; 33 | import java.util.List; 34 | 35 | /** 36 | * 37 | */ 38 | public class ParseTreeParser { 39 | 40 | private static final TokenizerFactory TOKENIZER_FACTORY = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 41 | private static final LexicalizedParser LEX_PARSER = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); 42 | 43 | static { 44 | LEX_PARSER.setOptionFlags("-outputFormat", "penn,typedDependenciesCollapsed", "-retainTmpSubcategories"); 45 | } 46 | 47 | public static Tree parse(String text) throws ParseTreeException { 48 | List rawWords = TOKENIZER_FACTORY.getTokenizer(new StringReader(text)).tokenize(); 49 | Tree bestParse = LEX_PARSER.parseTree(rawWords); 50 | if (bestParse == null) { 51 | throw new ParseTreeException(text); 52 | } 53 | 54 | return bestParse; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- 
/DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/sentences/SentencesUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SentencesUtils 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.sentences; 24 | 25 | import edu.stanford.nlp.ling.HasWord; 26 | import edu.stanford.nlp.ling.SentenceUtils; 27 | import edu.stanford.nlp.process.DocumentPreprocessor; 28 | 29 | import java.io.*; 30 | import java.util.ArrayList; 31 | import java.util.List; 32 | 33 | /** 34 | * 35 | */ 36 | public class SentencesUtils { 37 | 38 | private static List splitIntoSentences(Reader reader) { 39 | List res = new ArrayList<>(); 40 | 41 | DocumentPreprocessor dp = new DocumentPreprocessor(reader); 42 | for (List sentence : dp) { 43 | res.add(SentenceUtils.listToString(sentence)); 44 | } 45 | 46 | return res; 47 | } 48 | 49 | public static List splitIntoSentences(String text) { 50 | return splitIntoSentences(new StringReader(text)); 51 | } 52 | 53 | public static List splitIntoSentencesFromFile(File file, boolean byLines) throws IOException { 54 | if (byLines) { 55 | List res = new ArrayList<>(); 56 | 57 | try (BufferedReader br = new BufferedReader(new FileReader(file))) { 58 | String line; 59 | while ((line = br.readLine()) != null) { 60 | res.add(line); 61 | } 62 | } 63 | 64 | return res; 65 | } else { 66 | return splitIntoSentences(new BufferedReader(new FileReader(file))); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/parseTree/ParseTreeVisualizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ParseTreeVisualizer 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as 
/**
 * Renders a Stanford parse {@link Tree} through {@link PrettyTreePrinter}.
 *
 * NOTE(review): generic type parameters appear to be missing from this
 * listing (e.g. on the {@code children} list and the returned lists) -
 * confirm against the original file.
 */
public class ParseTreeVisualizer {

    /**
     * Wraps the tree in printer nodes and pretty-prints it.
     *
     * @param parseTree root of the tree to print; also used as the anchor for node numbering
     * @return the string produced by {@code PrettyTreePrinter.prettyPrint(node, false)}
     */
    public static String prettyPrint(Tree parseTree) {
        MyNode node = new MyNode(parseTree, parseTree);
        return PrettyTreePrinter.prettyPrint(node, false);
    }

    // Adapter exposing one parse-tree node (and, recursively, its subtree) to the printer.
    private static class MyNode implements PrettyTreePrinter.Node {
        private final List children;
        private final String caption;
        private final int nr;

        /**
         * @param parseNode node to wrap; its children are wrapped recursively
         * @param anchor    tree relative to which {@code nodeNumber} is computed
         */
        public MyNode(Tree parseNode, Tree anchor) {
            this.caption = parseNode.value();
            this.children = new ArrayList<>();
            for (Tree childNode : parseNode.getChildrenAsList()) {
                this.children.add(new MyNode(childNode, anchor));
            }
            this.nr = parseNode.nodeNumber(anchor);
        }

        /** @return two caption lines: the node value and {@code #} followed by the node number */
        @Override
        public List getPTPCaption() {
            return Arrays.asList(caption, "#" + nr);
        }

        /** @return one {@code DefaultEdge} with an empty label per child, in child order */
        @Override
        public List getPTPEdges() {
            return children.stream().map(c -> new PrettyTreePrinter.DefaultEdge("", c, true)).collect(Collectors.toList());
        }

    }
}
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse; 24 | 25 | import org.lambda3.text.simplification.discourse.processing.DiscourseSimplifier; 26 | import org.lambda3.text.simplification.discourse.processing.ProcessingType; 27 | import org.lambda3.text.simplification.discourse.model.SimplificationContent; 28 | import org.slf4j.LoggerFactory; 29 | 30 | import java.io.BufferedWriter; 31 | import java.io.File; 32 | import java.io.FileWriter; 33 | import java.io.IOException; 34 | import java.util.Arrays; 35 | import java.util.List; 36 | import java.util.stream.Collectors; 37 | 38 | public class App { 39 | private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(App.class); 40 | private static final DiscourseSimplifier DISCOURSE_SIMPLIFIER = new DiscourseSimplifier(); 41 | 42 | private static void saveLines(File file, List lines) { 43 | try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { 44 | bw.write(lines.stream().collect(Collectors.joining("\n"))); 45 | 46 | // no need to close it. 
47 | //bw.close() 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | public static void main(String[] args) throws IOException { 54 | SimplificationContent content = DISCOURSE_SIMPLIFIER.doDiscourseSimplification(new File("input.txt"), ProcessingType.SEPARATE, true); 55 | content.serializeToJSON(new File("output.json")); 56 | saveLines(new File("output_default.txt"), Arrays.asList(content.defaultFormat(false))); 57 | saveLines(new File("output_flat.txt"), Arrays.asList(content.flatFormat(false))); 58 | LOGGER.info("done"); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/processing/SentencePreprocessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SentencePreprocessor 4 | * 5 | * Copyright © 2018 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.processing; 24 | 25 | import com.typesafe.config.Config; 26 | import edu.stanford.nlp.simple.Sentence; 27 | 28 | public class SentencePreprocessor { 29 | private static String ROUND_BRACKET_PATTERN = "\\([^\\(\\)]*?\\)"; 30 | private static String SQUARE_BRACKET_PATTERN = "\\[[^\\[\\]]*?\\]"; 31 | private static String CURLY_BRACKET_PATTERN = "\\{[^\\{\\}]*?\\}"; 32 | 33 | private static String ROUND_BRACKET_PATTERN2 = "-LRB-((?!-LRB-|-RRB-).)*?-RRB-"; 34 | private static String SQUARE_BRACKET_PATTERN2 = "-LSB-((?!-LSB-|-RSB-).)*?-RSB-"; 35 | private static String CURLY_BRACKET_PATTERN2 = "-LCB-((?!-LCB-|-RCB-).)*?-RCB-"; 36 | 37 | 38 | private static String WHITESPACE_PATTERN = "\\s+"; 39 | 40 | public boolean removeBrackets; 41 | 42 | public SentencePreprocessor(Config config) { 43 | this.removeBrackets = config.getBoolean("remove-brackets"); 44 | } 45 | 46 | public void setRemoveBrackets(boolean removeBrackets) { 47 | this.removeBrackets = removeBrackets; 48 | } 49 | 50 | public String preprocessSentence(String sentence) { 51 | String res = sentence; 52 | 53 | if (removeBrackets) { 54 | res = sentence.replaceAll(ROUND_BRACKET_PATTERN, "") 55 | .replaceAll(SQUARE_BRACKET_PATTERN, "") 56 | .replaceAll(CURLY_BRACKET_PATTERN, "") 57 | .replaceAll(ROUND_BRACKET_PATTERN2, "") 58 | .replaceAll(SQUARE_BRACKET_PATTERN2, "") 59 | .replaceAll(CURLY_BRACKET_PATTERN2, ""); 60 | } 61 | 62 | res = res.replaceAll(WHITESPACE_PATTERN, " "); 63 | return res; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/model/OutSentence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * 
DiscourseSimplification : OutSentence 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model; 24 | 25 | import java.util.HashMap; 26 | import java.util.LinkedHashMap; 27 | import java.util.List; 28 | import java.util.stream.Collectors; 29 | 30 | /** 31 | * 32 | */ 33 | public class OutSentence { 34 | private int sentenceIdx; 35 | private String originalSentence; 36 | private HashMap elementMap; // all extractions extracted from this sentence 37 | 38 | // for deserialization 39 | public OutSentence() { 40 | } 41 | 42 | public OutSentence(int sentenceIdx,String originalSentence) { 43 | this.sentenceIdx = sentenceIdx; 44 | this.originalSentence = originalSentence; 45 | this.elementMap = new LinkedHashMap<>(); 46 | } 47 | 48 | public void addElement(Element element) { 49 | if (sentenceIdx != element.getSentenceIdx()) { 50 | throw new AssertionError("Element should not be added to this sentence"); 51 | } 52 | elementMap.putIfAbsent(element.getId(), element); 53 | } 54 | 55 | public int getSentenceIdx() { 56 | return sentenceIdx; 57 | } 58 | 59 | public String getOriginalSentence() { 60 | return originalSentence; 61 | } 62 | 63 
# NOTE(review): the separator literal was empty in this copy of the source
# (``split("")`` raises ValueError). "<SEP>" is presumed to be the marker that
# joins the split sentences inside one candidate -- confirm against the
# candidate-file format before relying on the first and fifth features.
DEFAULT_SENTENCE_SEPARATOR = "<SEP>"


def jaccard_similarity(list1, list2):
    """Return an overlap score between two token lists.

    The intersection is counted over distinct tokens, while the union is derived
    from the raw list lengths, so repeated tokens enlarge the denominator. This is
    kept as is so that feature values stay unchanged. Two empty lists score 0.0
    instead of raising ZeroDivisionError.
    """
    intersection = len(set(list1).intersection(list2))
    union = len(list1) + len(list2) - intersection
    if union == 0:
        return 0.0
    return float(intersection) / union


def compression_ratio(comp, simp):
    """Return the token count of *simp* divided by the token count of *comp*.

    Raises ZeroDivisionError if *comp* contains no whitespace-separated token.
    """
    return len(simp.split()) * 1.0 / len(comp.split())


class FeatureExtractor:
    """Builds feature vectors for (candidate, rules) pairs against a source sentence."""

    # Marker between the sentences of one candidate; matched case-insensitively.
    sentence_separator = DEFAULT_SENTENCE_SEPARATOR

    def __init__(self, rules_path="ranking/all_rules.txt"):
        """Load the rule vocabulary: one rule per line, mapped to its line index."""
        with open(rules_path) as rules_file:
            self.rule_vocab = {
                line.strip(): ind for ind, line in enumerate(rules_file)
            }

    def get_fv(self, cand, src):
        """Return the feature vector for one candidate.

        *cand* is a ``(sentence, rules)`` pair where ``rules`` is a
        whitespace-separated string of rule names; *src* is the source sentence.

        Layout: [number of separator-delimited segments, source token count,
        jaccard overlap of the lowercased tokens, candidate/source token ratio,
        tokens per segment, number of rules, one 0/1 slot per vocabulary rule].

        Raises KeyError for a rule missing from the vocabulary and
        ZeroDivisionError if *src* has no tokens.
        """
        cand_sent = cand[0]
        cand_tokens = cand_sent.split()
        src_tokens = src.split()
        separator = self.sentence_separator.lower()
        num_segments = len(cand_sent.lower().split(separator))
        rules = cand[1].split()

        fv = [
            num_segments,
            len(src_tokens) * 1.0,
            jaccard_similarity(cand_sent.lower().split(), src.lower().split()),
            len(cand_tokens) * 1.0 / len(src_tokens),
            len(cand_tokens) * 1.0 / num_segments,
            len(rules),
        ]

        rules_vec = [0] * len(self.rule_vocab)
        for rule in rules:
            rules_vec[self.rule_vocab[rule]] = 1
        fv.extend(rules_vec)
        return fv

    def filter_candidates(self, tuples):
        """Split ``(feature_vector, sentence)`` pairs into ``(sentences, vectors)``."""
        cands = [cand_sent for _, cand_sent in tuples]
        feats = [fv for fv, _ in tuples]
        return cands, feats

    def get_features(self, input_file, cands_file):
        """Extract features for every source line and its candidates.

        Line *k* of *cands_file* holds the tab-separated candidates for line *k* of
        *input_file*; each candidate is ``sentence|||rules``. Candidates without a
        rules part, and repeated candidate sentences, are skipped.

        Returns ``(all_features, all_cands, all_src)``, each with one entry per
        source line.
        """
        print("Extracting features")
        all_src, all_features, all_cands = [], [], []

        # Context manager so both files are closed even if extraction fails.
        with open(input_file) as src_lines, open(cands_file) as cand_lines:
            for src, candidates in zip(src_lines, cand_lines):
                src = src.strip()
                cands = [
                    tuple(cand.split("|||")[:2])
                    for cand in candidates.strip().split("\t")
                ]

                tuples = []
                cand_sents = set()
                for cand in cands:
                    if len(cand) > 1 and cand[0] not in cand_sents:
                        # Record the sentence; without this the duplicate check
                        # above can never trigger.
                        cand_sents.add(cand[0])
                        tuples.append((self.get_fv(cand, src), cand[0]))
                cands, feature_vectors = self.filter_candidates(tuples)

                all_src.append(src)
                all_cands.append(cands)
                all_features.append(feature_vectors)

        # Guard the summary: indexing [0][0] fails on empty input or when the
        # first source line has no usable candidate.
        if all_features and all_features[0]:
            print("Data size: ", len(all_features), len(all_features[0]),
                  len(all_features[0][0]))
        else:
            print("Data size: ", len(all_features))
        print("Done extracting features")
        return all_features, all_cands, all_src
class MRRanker(BaseRanker):
    """Pairwise ranker trained with ``torch.nn.MarginRankingLoss``.

    NOTE(review): ``set_model``, ``model``, ``binner``, ``epochs`` and ``lr`` are
    used here but not defined in this class; presumably BaseRanker provides them.
    """

    def __init__(self, epochs, lr, device):
        # Set before the base initializer runs, matching the original order.
        self.device = device
        super().__init__(epochs, lr)

    def train(self, all_features, all_labels):
        """Fit the model on all candidate pairs whose labels differ by more than 0.1.

        *all_features* and *all_labels* are parallel: one list of feature vectors
        and one list of numeric labels per source sentence. After training the
        model is switched to eval mode and moved to the CPU.
        """
        train_x_1, train_x_2, train_y = self._get_pairwise_features(
            all_features, all_labels)

        self.set_model(train_x_1.size(1))
        self.model.to(self.device)
        # train() switches the module AND all of its submodules to training mode;
        # assigning ``model.training = True`` only flips the top-level flag.
        self.model.train()

        loss_fn = torch.nn.MarginRankingLoss(margin=1.0)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        print("Started training.")

        batch_size = 4096 * 16
        num_pairs = train_x_1.size(0)
        for epoch in range(self.epochs):
            print("Final data size", num_pairs, train_x_1.size(1))
            # Fresh shuffle of the pair order every epoch.
            permutation = torch.randperm(num_pairs)
            for start in range(0, num_pairs, batch_size):
                indices = permutation[start:start + batch_size]
                y_pred_1 = self.model(train_x_1[indices])
                y_pred_2 = self.model(train_x_2[indices])
                loss = loss_fn(y_pred_1, y_pred_2, train_y[indices])

                print("Epoch ", epoch, "Loss", loss.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        print("Done training.")
        self.model.eval()
        self.model.to("cpu")

    def _get_pairwise_features(self, all_features, all_labels):
        """Build the pairwise training tensors.

        For every ordered pair (i, j) of candidates of the same source whose labels
        differ by more than 0.1, emits the two feature vectors and the target
        ``sign(label_i - label_j)`` (+1 means the first candidate should score
        higher). The binner is fitted on the first-element features only and then
        applied to both sides.

        Returns ``(train_x_1, train_x_2, train_y)`` as float tensors on
        ``self.device``; ``train_y`` has a trailing dimension of size 1.
        """
        train_labels = []
        train_features_1, train_features_2 = [], []

        for feats, ls in zip(all_features, all_labels):
            for i, sf1 in enumerate(feats):
                for j, sf2 in enumerate(feats):
                    diff = ls[i] - ls[j]
                    if abs(diff) > 0.1:
                        train_features_1.append(sf1)
                        train_features_2.append(sf2)
                        train_labels.append(float(np.sign(diff)))

        self.binner.fit(np.array(train_features_1))
        train_features_1 = self.binner.transform(np.array(train_features_1))
        train_features_2 = self.binner.transform(np.array(train_features_2))
        assert len(train_labels) == len(train_features_1) == len(train_features_2)
        print("Pairwise data size: ", len(train_features_2))

        # Plain tensors: the Variable wrapper has been a no-op since PyTorch 0.4,
        # and tensors do not require grad by default.
        train_x_1 = torch.FloatTensor(train_features_1).to(self.device)
        train_x_2 = torch.FloatTensor(train_features_2).to(self.device)
        train_y = torch.FloatTensor(train_labels).to(self.device)
        train_y = torch.unsqueeze(train_y, 1)
        return train_x_1, train_x_2, train_y
-------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : Content 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model; 24 | 25 | import com.fasterxml.jackson.annotation.JsonAutoDetect; 26 | import com.fasterxml.jackson.annotation.PropertyAccessor; 27 | import com.fasterxml.jackson.core.JsonProcessingException; 28 | import com.fasterxml.jackson.databind.ObjectMapper; 29 | import com.fasterxml.jackson.databind.module.SimpleModule; 30 | import edu.stanford.nlp.trees.Tree; 31 | import org.lambda3.text.simplification.discourse.model.serializer.TreeDeserializer; 32 | import org.lambda3.text.simplification.discourse.model.serializer.TreeSerializer; 33 | 34 | import java.io.File; 35 | import java.io.IOException; 36 | 37 | public abstract class Content { 38 | private static final ObjectMapper MAPPER = new ObjectMapper(); 39 | private static final SimpleModule MODULE = new SimpleModule(); 40 | 41 | static { 42 | MAPPER.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.NONE); 43 | 
MAPPER.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); 44 | 45 | // register custom de-/serializers 46 | MODULE.addSerializer(Tree.class, new TreeSerializer()); 47 | MODULE.addDeserializer(Tree.class, new TreeDeserializer()); 48 | 49 | MAPPER.registerModule(MODULE); 50 | } 51 | 52 | public static T deserializeFromJSON(String json, Class clazz) throws IOException { 53 | return MAPPER.readValue(json, clazz); 54 | } 55 | 56 | public static T deserializeFromJSON(File file, Class clazz) throws IOException { 57 | return MAPPER.readValue(file, clazz); 58 | } 59 | 60 | public String prettyPrintJSON() throws JsonProcessingException { 61 | return MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(this); 62 | } 63 | 64 | public String serializeToJSON() throws JsonProcessingException { 65 | return MAPPER.writeValueAsString(this); 66 | } 67 | 68 | public void serializeToJSON(File file) throws IOException { 69 | MAPPER.writeValue(file, this); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERString.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : NERString 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.ner; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | import java.util.stream.Collectors; 30 | 31 | /** 32 | * 33 | */ 34 | public class NERString { 35 | public static final String NO_CATEGORY = "O"; 36 | 37 | protected final List tokens; 38 | private List groups; 39 | 40 | public NERString(List tokens) { 41 | this.tokens = tokens; 42 | this.createGroups(); 43 | } 44 | 45 | private void createGroups() { 46 | this.groups = new ArrayList<>(); 47 | 48 | String lastCategory = null; 49 | List currGroupTokens = new ArrayList<>(); 50 | for (NERToken nerToken : this.tokens) { 51 | 52 | if ((lastCategory != null) && (!nerToken.getCategory().equals(lastCategory))) { 53 | // add 54 | this.groups.add(new NERTokenGroup(currGroupTokens)); 55 | currGroupTokens = new ArrayList<>(); 56 | } 57 | 58 | currGroupTokens.add(nerToken); 59 | lastCategory = nerToken.getCategory(); 60 | } 61 | 62 | // add 63 | this.groups.add(new NERTokenGroup(currGroupTokens)); 64 | } 65 | 66 | public List getTokens() { 67 | return tokens; 68 | } 69 | 70 | public List getGroups() { 71 | return groups; 72 | } 73 | 74 | private List getWords(int fromIndex, int toIndex) { 75 | return tokens.subList(fromIndex, toIndex).stream().map(t -> new Word(t.getText())).collect(Collectors.toList()); 76 | } 77 | 78 | public List getWords() { 79 | return getWords(0, tokens.size()); 80 | } 81 | 82 | @Override 83 | public String toString() { 84 | return tokens.stream().map(NERToken::toString).collect(Collectors.joining("\n")); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- 
/DiscourseSimplification/src/test/java/org/lambda3/text/simplification/discourse/processing/DiscourseSimplifierTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : DiscourseSimplifierTest 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.processing; 24 | 25 | import org.junit.jupiter.api.Assertions; 26 | import org.junit.jupiter.api.Test; 27 | import org.lambda3.text.simplification.discourse.model.OutSentence; 28 | import org.lambda3.text.simplification.discourse.model.SimplificationContent; 29 | import org.slf4j.LoggerFactory; 30 | 31 | import java.io.File; 32 | import java.io.IOException; 33 | 34 | /** 35 | * 36 | */ 37 | class DiscourseSimplifierTest { 38 | private org.slf4j.Logger log = LoggerFactory.getLogger(this.getClass()); 39 | private DiscourseSimplifier simplifier = new DiscourseSimplifier(); 40 | 41 | @Test 42 | void processSingleSentence() { 43 | String text = "Peter went to Paris because he likes the city."; 44 | SimplificationContent c = simplifier.doDiscourseSimplification(text, ProcessingType.WHOLE); 45 | 46 | Assertions.assertEquals(1, c.getSentences().size()); 47 | OutSentence sent = c.getSentences().get(0); 48 | 49 | Assertions.assertEquals(2, sent.getElements().size()); 50 | } 51 | 52 | @Test 53 | void serializationTest() throws IOException { 54 | String text = "After graduating from Columbia University in 1983, Barack Obama worked as a community organizer in Chicago."; 55 | SimplificationContent c = simplifier.doDiscourseSimplification(text, ProcessingType.WHOLE); 56 | 57 | final String filename = "tmp-w8weg3q493ewqieh.json"; 58 | 59 | log.info("SAVE TO FILE..."); 60 | c.serializeToJSON(new File(filename)); 61 | 62 | log.info("LOAD FROM FILE..."); 63 | SimplificationContent loaded = SimplificationContent.deserializeFromJSON(new File(filename), SimplificationContent.class); 64 | 65 | log.info(loaded.prettyPrintJSON()); 66 | log.info("---------------------------------"); 67 | log.info(loaded.defaultFormat(false)); 68 | 69 | log.info("DELETE FILE..."); 70 | File file = new File(filename); 71 | file.delete(); 72 | } 73 | 74 | } 
-------------------------------------------------------------------------------- /DiscourseSimplification/src/main/resources/attribution_verbs.conf: -------------------------------------------------------------------------------- 1 | attribution_verbs = [ 2 | comment, 3 | have faith in, 4 | consider, 5 | demand, 6 | apprise, 7 | report, 8 | evince, 9 | identify, 10 | enlighten, 11 | utter, 12 | ruminate, 13 | give away, 14 | discern, 15 | hold, 16 | acknowledge, 17 | explain, 18 | hypothesize, 19 | forbid, 20 | shout, 21 | theorise, 22 | betray, 23 | turn down, 24 | traverse , 25 | pipe up, 26 | cogitate, 27 | confide, 28 | hope, 29 | dispute, 30 | notify, 31 | conjecture, 32 | televise, 33 | signify , 34 | read, 35 | propose, 36 | void, 37 | express, 38 | perceive, 39 | mention, 40 | meditate, 41 | insist, 42 | presume, 43 | judge, 44 | compute, 45 | speculate, 46 | discuss, 47 | counter, 48 | reveal, 49 | contradict, 50 | conceive, 51 | proclaim, 52 | hypothesise, 53 | ascertain, 54 | signal, 55 | mean, 56 | respond, 57 | prohibit, 58 | signify, 59 | weight, 60 | urge, 61 | repudiate, 62 | pronounce, 63 | deduce, 64 | asseverate, 65 | design, 66 | expect, 67 | critique, 68 | adjudge, 69 | enounce, 70 | wonder, 71 | educate, 72 | detect, 73 | deliberate, 74 | confess, 75 | rehearse, 76 | publish, 77 | verbalize, 78 | veto, 79 | state, 80 | suspect, 81 | disprove, 82 | blur, 83 | manifest, 84 | disclose, 85 | reiterate, 86 | avow, 87 | slur, 88 | disagree, 89 | communicate, 90 | enunciate, 91 | disallow, 92 | disclaim, 93 | contemplate, 94 | reason, 95 | brood, 96 | imagine, 97 | distinguish, 98 | estimate, 99 | narrate, 100 | surmise , 101 | remark, 102 | theorize, 103 | clarify, 104 | study, 105 | disavow, 106 | keep back, 107 | recollect, 108 | display, 109 | admit, 110 | credit, 111 | belie, 112 | entertain, 113 | verbalise, 114 | dismiss, 115 | argue, 116 | think, 117 | recite, 118 | invalidate, 119 | abjure, 120 | speak up, 121 | feel, 122 | relate, 123 | 
renounce, 124 | articulate, 125 | assess, 126 | instruct, 127 | guess , 128 | esteem, 129 | trust, 130 | teach, 131 | speak, 132 | ventilate, 133 | guess, 134 | edify, 135 | acquaint, 136 | connote, 137 | vocalize, 138 | question, 139 | mediate, 140 | submit, 141 | mark, 142 | indicate, 143 | iterate , 144 | whisper, 145 | familiarize, 146 | tell, 147 | garble, 148 | offer, 149 | share, 150 | expose, 151 | regard, 152 | refuse, 153 | muse, 154 | clue, 155 | assert, 156 | observe, 157 | differentiate, 158 | argue against, 159 | recount, 160 | believe, 161 | count, 162 | reflect on, 163 | affirm, 164 | recall, 165 | anticipate, 166 | spill, 167 | controvert, 168 | air, 169 | warn, 170 | record, 171 | suppose, 172 | espouse, 173 | voice, 174 | declare, 175 | announce, 176 | exhibit, 177 | claim, 178 | gather, 179 | recognize, 180 | describe, 181 | influence, 182 | predicate, 183 | denote, 184 | say, 185 | deem, 186 | embrace, 187 | contest, 188 | sense, 189 | phrase, 190 | allege, 191 | publicise, 192 | surmise, 193 | ponder, 194 | discriminate, 195 | refute, 196 | agree, 197 | divulge, 198 | couch, 199 | note, 200 | discredit, 201 | reject, 202 | answer, 203 | oppose, 204 | advise, 205 | infer, 206 | bear in mind, 207 | repeat, 208 | intend, 209 | allow, 210 | mispronounce, 211 | reckon, 212 | familiarise, 213 | vocalise, 214 | make known, 215 | reflect, 216 | concede, 217 | purpose, 218 | recognise, 219 | recount , 220 | disown, 221 | broadcast, 222 | deny, 223 | let slip, 224 | renounce , 225 | remember, 226 | rationalize, 227 | assume, 228 | bid, 229 | register, 230 | make out, 231 | withhold, 232 | inform, 233 | command, 234 | unburden , 235 | publicize, 236 | recant, 237 | order, 238 | talk, 239 | know, 240 | promote, 241 | advertise, 242 | swear, 243 | emphasize, 244 | underline, 245 | testify, 246 | cite, 247 | message, 248 | ask 249 | ] -------------------------------------------------------------------------------- 
/DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/model/Leaf.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : Leaf 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.model; 24 | 25 | import edu.stanford.nlp.trees.Tree; 26 | import org.lambda3.text.simplification.discourse.utils.PrettyTreePrinter; 27 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 28 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 29 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; 30 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 31 | 32 | import java.util.ArrayList; 33 | import java.util.Collections; 34 | import java.util.List; 35 | 36 | /** 37 | * 38 | */ 39 | public class Leaf extends DiscourseTree { 40 | private Tree parseTree; 41 | private boolean allowSplit; // true, if extraction-rules will be applied on the text 42 | private boolean toSimpleContext; 43 | 44 | public Leaf() { 45 | super("UNKNOWN"); 46 | } 47 | 48 | public Leaf(String extractionRule, Tree parseTree) { 49 | super(extractionRule); 50 | this.parseTree = parseTree; 51 | this.allowSplit = true; 52 | this.toSimpleContext = false; 53 | } 54 | 55 | // not efficient -> prefer to use constructor with tree 56 | public Leaf(String extractionRule, String text) throws ParseTreeException { 57 | this(extractionRule, ParseTreeParser.parse(text)); 58 | } 59 | 60 | public void dontAllowSplit() { 61 | this.allowSplit = false; 62 | } 63 | 64 | public Tree getParseTree() { 65 | return parseTree; 66 | } 67 | 68 | public void setParseTree(Tree parseTree) { 69 | this.parseTree = parseTree; 70 | } 71 | 72 | public String getText() { 73 | return WordsUtils.wordsToString(ParseTreeExtractionUtils.getContainingWords(parseTree)); 74 | } 75 | 76 | public void setToSimpleContext(boolean toSimpleContext) { 77 | this.toSimpleContext = toSimpleContext; 78 | } 79 | 80 | public boolean isAllowSplit() { 81 | return 
allowSplit; 82 | } 83 | 84 | public boolean isToSimpleContext() { 85 | return toSimpleContext; 86 | } 87 | 88 | // VISUALIZATION /////////////////////////////////////////////////////////////////////////////////////////////////// 89 | 90 | @Override 91 | public List getPTPCaption() { 92 | return Collections.singletonList("|||" + getText()); 93 | } 94 | 95 | @Override 96 | public List getPTPEdges() { 97 | return new ArrayList<>(); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/ReferenceInitialAdverbialExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ReferenceExtractor1 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class ReferenceInitialAdverbialExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV <, (ADVP|PP=adv))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | if (matcher.findAt(leaf.getParseTree())) { 53 | List cuePhraseWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("adv")); 54 | 55 | // the right constituent 56 | List words = new ArrayList<>(); 57 | words.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("adv"), false)); 58 | words.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("adv"), false)); 59 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), 
WordsUtils.wordsToProperSentenceString(words)); 60 | 61 | // relation 62 | Optional relation = classifer.classifyAdverbial(cuePhraseWords); 63 | 64 | // only if present 65 | if (relation.isPresent()) { 66 | Extraction res = new Extraction( 67 | getClass().getSimpleName(), 68 | true, 69 | cuePhraseWords, 70 | relation.get(), 71 | true, 72 | Arrays.asList(rightConstituent) 73 | ); 74 | 75 | return Optional.of(res); 76 | } 77 | } 78 | 79 | return Optional.empty(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/ReferenceInitialConjunctionExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ReferenceExtractor0 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class ReferenceInitialConjunctionExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV <, (CC=cc))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | if (matcher.findAt(leaf.getParseTree())) { 53 | List cuePhraseWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("cc")); 54 | 55 | // the right constituent 56 | List words = new ArrayList<>(); 57 | words.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("cc"), false)); 58 | words.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("cc"), false)); 59 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), 
WordsUtils.wordsToProperSentenceString(words)); 60 | 61 | // relation 62 | Optional relation = classifer.classifyCoordinating(cuePhraseWords); 63 | 64 | // only if present 65 | if (relation.isPresent()) { 66 | Extraction res = new Extraction( 67 | getClass().getSimpleName(), 68 | true, 69 | cuePhraseWords, 70 | relation.get(), 71 | true, 72 | Arrays.asList(rightConstituent) 73 | ); 74 | 75 | return Optional.of(res); 76 | } 77 | } 78 | 79 | return Optional.empty(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/ReferenceMedialAdverbialExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ReferenceExtractor2 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class ReferenceMedialAdverbialExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (ADVP|PP=adv , /,/=begin . /,/=end $,, NP $.. 
VP))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | if (matcher.findAt(leaf.getParseTree())) { 53 | List cuePhraseWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("adv")); 54 | 55 | // the right constituent 56 | List words = new ArrayList<>(); 57 | words.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("begin"), false)); 58 | words.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("end"), false)); 59 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(words)); 60 | 61 | // relation 62 | Optional relation = classifer.classifyAdverbial(cuePhraseWords); 63 | 64 | // only if present 65 | if (relation.isPresent()) { 66 | Extraction res = new Extraction( 67 | getClass().getSimpleName(), 68 | true, 69 | cuePhraseWords, 70 | relation.get(), 71 | true, 72 | Arrays.asList(rightConstituent) 73 | ); 74 | 75 | return Optional.of(res); 76 | } 77 | } 78 | 79 | return Optional.empty(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/ReferenceFinalAdverbialExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ReferenceExtractor3 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 
12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class ReferenceFinalAdverbialExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S=s < (VP <+(VP) (ADVP|PP=adv))) : (=s [<<- =adv | <<- (/\\./ , =adv)])"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | if (matcher.findAt(leaf.getParseTree())) { 53 | List cuePhraseWords 
= ParseTreeExtractionUtils.getContainingWords(matcher.getNode("adv")); 54 | 55 | // the right constituent 56 | List words = new ArrayList<>(); 57 | words.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("adv"), false)); 58 | words.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("adv"), false)); 59 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(words)); 60 | 61 | // relation 62 | Optional relation = classifer.classifyAdverbial(cuePhraseWords); 63 | 64 | // only if present 65 | if (relation.isPresent()) { 66 | Extraction res = new Extraction( 67 | getClass().getSimpleName(), 68 | true, 69 | cuePhraseWords, 70 | relation.get(), 71 | true, 72 | Arrays.asList(rightConstituent) 73 | ); 74 | 75 | return Optional.of(res); 76 | } 77 | } 78 | 79 | return Optional.empty(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/ner/NERStringParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : NERStringParser 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 
17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.ner; 24 | 25 | import edu.stanford.nlp.ie.AbstractSequenceClassifier; 26 | import edu.stanford.nlp.ie.crf.CRFClassifier; 27 | import edu.stanford.nlp.trees.Tree; 28 | import org.lambda3.text.simplification.discourse.utils.ner.tner.TNERString; 29 | import org.lambda3.text.simplification.discourse.utils.ner.tner.TNERToken; 30 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 31 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 32 | 33 | import java.util.ArrayList; 34 | import java.util.List; 35 | 36 | /** 37 | * 38 | */ 39 | public class NERStringParser { 40 | 41 | private static final AbstractSequenceClassifier NER_CLASSIFIER = CRFClassifier.getClassifierNoExceptions("edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); 42 | 43 | public static NERString parse(String text) { 44 | List tokens = new ArrayList<>(); 45 | 46 | String nerString = NER_CLASSIFIER.classifyToString(text); 47 | String[] nerTokens = nerString.split(" "); 48 | 49 | int idx = 0; 50 | for (String nerToken : nerTokens) { 51 | int sep_idx = nerToken.lastIndexOf("/"); 52 | 53 | // create text 54 | String txt = nerToken.substring(0, sep_idx); 55 | String category = nerToken.substring(sep_idx + 1); 56 | NERToken token = new NERToken(idx, txt, category); 57 | tokens.add(token); 58 | 59 | ++idx; 60 | } 61 | 62 | return new NERString(tokens); 63 | } 64 | 65 | public static TNERString parse(Tree parseTree) throws NERStringParseException { 66 | List tokens = new ArrayList<>(); 67 | 68 | List parseTreeLeafNumbers = ParseTreeExtractionUtils.getLeafNumbers(parseTree, parseTree); 69 | String nerString = 
NER_CLASSIFIER.classifyToString(WordsUtils.wordsToString(parseTree.yieldWords())); 70 | String[] nerTokens = nerString.split(" "); 71 | 72 | if (parseTreeLeafNumbers.size() != nerTokens.length) { 73 | throw new NERStringParseException("Could not map NER string to parseTree"); 74 | } 75 | 76 | int idx = 0; 77 | for (String nerToken : nerTokens) { 78 | int sep_idx = nerToken.lastIndexOf("/"); 79 | 80 | // create token 81 | String text = nerToken.substring(0, sep_idx); 82 | String category = nerToken.substring(sep_idx + 1); 83 | TNERToken token = new TNERToken(idx, text, category, parseTree.getNodeNumber(parseTreeLeafNumbers.get(idx))); 84 | tokens.add(token); 85 | 86 | ++idx; 87 | } 88 | 89 | return new TNERString(tokens, parseTree); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/processing/ExtendedDiscourseSimplifier.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : ExtendedDiscourseSimplifier 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.processing; 24 | 25 | import org.lambda3.text.simplification.discourse.model.SimplificationContent; 26 | import org.lambda3.text.simplification.discourse.utils.sentences.SentencesUtils; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | import java.io.File; 31 | import java.io.FileNotFoundException; 32 | import java.io.IOException; 33 | import java.util.ArrayList; 34 | import java.util.Collections; 35 | import java.util.List; 36 | import java.util.stream.Collectors; 37 | 38 | /** 39 | * 40 | */ 41 | public class ExtendedDiscourseSimplifier extends DiscourseSimplifier { 42 | private final Logger logger = LoggerFactory.getLogger(getClass()); 43 | 44 | public static List filterSentences(List sentences, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) { 45 | 46 | // select sentences to doDiscourseSimplification 47 | List res = new ArrayList<>(); 48 | res.addAll(sentences); 49 | 50 | // shuffle 51 | if (shuffleSentences) { 52 | Collections.shuffle(res); 53 | } 54 | 55 | // remove too long sentences 56 | if (maxSentenceLength != null) { 57 | res = res.stream().filter(s -> s.length() <= maxSentenceLength).collect(Collectors.toList()); 58 | } 59 | 60 | // limit number of sentences 61 | if (maxSentences != null) { 62 | if (res.size() > maxSentences) { 63 | res = res.subList(0, maxSentences); 64 | } 65 | } 66 | 67 | return res; 68 | } 69 | 70 | public SimplificationContent process(File file, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) throws IOException { 71 | return process(file, type, shuffleSentences, maxSentenceLength, maxSentences, false); 72 | } 73 | 74 | public SimplificationContent process(File file, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences, boolean separateLines) throws 
IOException { 75 | return process(SentencesUtils.splitIntoSentencesFromFile(file, separateLines), type, shuffleSentences, maxSentenceLength, maxSentences); 76 | } 77 | 78 | public SimplificationContent process(String text, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) { 79 | return process(SentencesUtils.splitIntoSentences(text), type, shuffleSentences, maxSentenceLength, maxSentences); 80 | } 81 | 82 | public SimplificationContent process(List sentences, ProcessingType type, boolean shuffleSentences, Integer maxSentenceLength, Integer maxSentences) { 83 | return doDiscourseSimplification(filterSentences(sentences, shuffleSentences, maxSentenceLength, maxSentences), type); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/LeadNPExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class LeadNPExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV <, (NP=np $+ (/,/ $+ NP & $++ VP=vp)))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | // rephrase 55 | List leftConstituentWords = rephraseEnablement(matcher.getNode("np"), matcher.getNode("vp")); 56 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 57 | leftConstituent.dontAllowSplit(); 58 | leftConstituent.setToSimpleContext(true); 59 | 60 | // the right, superordinate constituent 61 | List rightConstituentWords = new ArrayList<>(); 62 | 
rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("np"), false)); 63 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("np"), false)); 64 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 65 | 66 | // relation 67 | Relation relation = Relation.UNKNOWN_SUBORDINATION; //TODO 68 | 69 | Extraction res = new Extraction( 70 | getClass().getSimpleName(), 71 | false, 72 | null, 73 | relation, 74 | false, 75 | Arrays.asList(leftConstituent, rightConstituent) 76 | ); 77 | 78 | return Optional.of(res); 79 | } 80 | 81 | return Optional.empty(); 82 | } 83 | } 84 | 85 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/AdjectivalAdverbialInitialExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class AdjectivalAdverbialInitialExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV <, (ADJP|ADVP=ad $+ (/,/ $++ VP=vp)))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | // rephrase 55 | List leftConstituentWords = rephraseEnablement(matcher.getNode("ad"), matcher.getNode("vp")); 56 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 57 | leftConstituent.dontAllowSplit(); 58 | leftConstituent.setToSimpleContext(true); 59 | 60 | // the right, superordinate constituent 61 | List rightConstituentWords = new ArrayList<>(); 62 | 
rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("ad"), false)); 63 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("ad"), false)); 64 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 65 | 66 | // relation 67 | Relation relation = Relation.UNKNOWN_SUBORDINATION; //TODO 68 | 69 | Extraction res = new Extraction( 70 | getClass().getSimpleName(), 71 | false, 72 | null, 73 | relation, 74 | false, 75 | Arrays.asList(leftConstituent, rightConstituent) 76 | ); 77 | 78 | return Optional.of(res); 79 | } 80 | 81 | return Optional.empty(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPreExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPreExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPreExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (SBAR=sbar < (S=s < (NP $.. VP)) $.. (NP $.. 
VP)))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, subordinate constituent 54 | List leftConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 55 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 56 | 57 | // the right, superordinate constituent 58 | List rightConstituentWords = new ArrayList<>(); 59 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 60 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 61 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 62 | 63 | // relation 64 | List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); 65 | Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.UNKNOWN_SUBORDINATION); 66 | 67 | Extraction res = new Extraction( 68 | getClass().getSimpleName(), 69 | false, 70 | cuePhraseWords, 71 | relation, 72 | false, 73 | Arrays.asList(leftConstituent, rightConstituent) 74 | ); 75 | 76 | return Optional.of(res); 77 | } 78 | 79 | return Optional.empty(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/PurposePreExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : EnablementPreExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is 
free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class PurposePreExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = 
TregexPattern.compile("ROOT <<: (S|SINV < (S=s <<, (VP <<, /(T|t)o/) $.. (NP $.. VP=vp)))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | // the left, subordinate constituent 55 | // List leftConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 56 | 57 | // rephrase 58 | List leftConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); 59 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 60 | leftConstituent.dontAllowSplit(); 61 | leftConstituent.setToSimpleContext(true); 62 | 63 | // the right, superordinate constituent 64 | List rightConstituentWords = new ArrayList<>(); 65 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 66 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 67 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 68 | 69 | // relation 70 | Relation relation = Relation.PURPOSE; 71 | 72 | Extraction res = new Extraction( 73 | getClass().getSimpleName(), 74 | false, 75 | null, 76 | relation, 77 | false, 78 | Arrays.asList(leftConstituent, rightConstituent) 79 | ); 80 | 81 | return Optional.of(res); 82 | } 83 | 84 | return Optional.empty(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/PurposePostExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : EnablementPostExtractor 4 | * 5 | * 
Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class PurposePostExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional 
extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP=vp <+(VP) (NP|PP $.. (S=s <<, (VP <<, /(T|t)o/))))))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | // the left, superordinate constituent 55 | List leftConstituentWords = new ArrayList<>(); 56 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 57 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 58 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 59 | 60 | // the right, subordinate constituent 61 | // List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 62 | 63 | // rephrase 64 | List rightConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); 65 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 66 | rightConstituent.dontAllowSplit(); 67 | rightConstituent.setToSimpleContext(true); 68 | 69 | // relation 70 | Relation relation = Relation.PURPOSE; 71 | 72 | Extraction res = new Extraction( 73 | getClass().getSimpleName(), 74 | false, 75 | null, 76 | relation, 77 | true, 78 | Arrays.asList(leftConstituent, rightConstituent) 79 | ); 80 | 81 | return Optional.of(res); 82 | } 83 | 84 | return Optional.empty(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPostExtractor2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 
==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostISAExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import 
java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPostExtractor2 extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP <+(VP) (SBAR=sbar <<, /that/ < (S=s)))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, !subordinate! constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 56 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 57 | 58 | // rephrase 59 | leftConstituentWords = rephraseIntraSententialAttribution(leftConstituentWords); 60 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 61 | leftConstituent.dontAllowSplit(); 62 | leftConstituent.setToSimpleContext(true); 63 | 64 | // the right, !superordinate! 
constituent 65 | List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 66 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 67 | 68 | // relation 69 | Relation relation = Relation.UNKNOWN_SUBORDINATION; 70 | 71 | Extraction res = new Extraction( 72 | getClass().getSimpleName(), 73 | false, 74 | null, 75 | relation, 76 | false, 77 | Arrays.asList(leftConstituent, rightConstituent) 78 | ); 79 | 80 | return Optional.of(res); 81 | } 82 | 83 | return Optional.empty(); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPostExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPostExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP <+(VP) (SBAR=sbar < (S=s < (NP $.. 
VP))))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, superordinate constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 56 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 57 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 58 | 59 | // the right, subordinate constituent 60 | List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 61 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 62 | 63 | // relation 64 | List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); 65 | Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.UNKNOWN_SUBORDINATION); 66 | 67 | //TODO not always doDiscourseExtraction? 
68 | Extraction res = new Extraction( 69 | getClass().getSimpleName(), 70 | false, 71 | cuePhraseWords, 72 | relation, 73 | true, 74 | Arrays.asList(leftConstituent, rightConstituent) 75 | ); 76 | 77 | return Optional.of(res); 78 | } 79 | 80 | return Optional.empty(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPrePurposeExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPreEnablementExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPrePurposeExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (SBAR=sbar < (S=s <<, (VP <<, /(T|t)o/)) $.. (NP $.. 
VP=vp)))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, subordinate constituent 54 | List leftConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 55 | 56 | // rephrase 57 | leftConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); 58 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 59 | leftConstituent.dontAllowSplit(); 60 | leftConstituent.setToSimpleContext(true); 61 | 62 | // the right, superordinate constituent 63 | List rightConstituentWords = new ArrayList<>(); 64 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 65 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 66 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 67 | 68 | // relation 69 | Relation relation = Relation.PURPOSE; 70 | 71 | Extraction res = new Extraction( 72 | getClass().getSimpleName(), 73 | false, 74 | null, 75 | relation, 76 | false, 77 | Arrays.asList(leftConstituent, rightConstituent) 78 | ); 79 | 80 | return Optional.of(res); 81 | } 82 | 83 | return Optional.empty(); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /DiscourseSimplification/DiscourseSimplification.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- 
/DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/utils/words/WordsUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : WordsUtils 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.utils.words; 24 | 25 | import edu.stanford.nlp.ling.CoreLabel; 26 | import edu.stanford.nlp.ling.SentenceUtils; 27 | import edu.stanford.nlp.ling.Word; 28 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 29 | import edu.stanford.nlp.process.PTBTokenizer; 30 | import edu.stanford.nlp.simple.Sentence; 31 | 32 | import java.io.StringReader; 33 | import java.util.ArrayList; 34 | import java.util.Arrays; 35 | import java.util.Iterator; 36 | import java.util.List; 37 | 38 | /** 39 | * 40 | */ 41 | public class WordsUtils { 42 | 43 | public static Word lemmatize(Word word) { 44 | Sentence sentence = new Sentence(word.value()); 45 | return new Word(sentence.lemma(0)); 46 | } 47 | 48 | public static List splitIntoWords(String sentence) { 49 | PTBTokenizer ptbt = new PTBTokenizer<>(new StringReader(sentence), new CoreLabelTokenFactory(), ""); 50 | List words = new ArrayList<>(); 51 | 52 | while (ptbt.hasNext()) { 53 | CoreLabel label = ptbt.next(); 54 | words.add(new Word(label)); 55 | } 56 | 57 | return words; 58 | } 59 | 60 | public static String wordsToString(List words) { 61 | return SentenceUtils.listToString(words); 62 | } 63 | 64 | public static String wordsToProperSentenceString(List words) { 65 | return wordsToString(wordsToProperSentence(words)); 66 | } 67 | 68 | private static Word capitalizeWord(Word word) { 69 | String s = word.value(); 70 | if (s.length() > 0) { 71 | s = s.substring(0, 1).toUpperCase() + s.substring(1); 72 | } 73 | 74 | return new Word(s); 75 | } 76 | 77 | public static Word lowercaseWord(Word word) { 78 | return new Word(word.value().toLowerCase()); 79 | } 80 | 81 | private static List wordsToProperSentence(List words) { 82 | List res = new ArrayList<>(); 83 | res.addAll(words); 84 | 85 | // trim '.' 
and ',' at beginning and the end and remove multiple, consecutive occurrences 86 | for (String c : Arrays.asList(".", ",")) { 87 | Word prev = null; 88 | Iterator it = res.iterator(); 89 | while (it.hasNext()) { 90 | Word word = it.next(); 91 | if (word.value().equals(c)) { 92 | if (prev == null || prev.value().equals(word.value())) { 93 | it.remove(); 94 | } 95 | } 96 | prev = word; 97 | } 98 | if ((!res.isEmpty()) && (res.get(res.size() - 1).value().equals(c))) { 99 | res.remove(res.size() - 1); 100 | } 101 | } 102 | 103 | // add a '.' at the end 104 | res.add(new Word(".")); 105 | 106 | // capitalize first word 107 | if (!res.isEmpty()) { 108 | res.set(0, capitalizeWord(res.get(0))); 109 | } 110 | 111 | return res; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPostPurposeExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostEnablementExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPostPurposeExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. 
(VP=vp <+(VP) (SBAR=sbar < (S=s <<, (VP <<, /(T|t)o/))))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, superordinate constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 56 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 57 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 58 | 59 | // the right, subordinate constituent 60 | // List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 61 | 62 | // rephrase 63 | List rightConstituentWords = rephraseEnablement(matcher.getNode("s"), matcher.getNode("vp")); 64 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 65 | rightConstituent.dontAllowSplit(); 66 | rightConstituent.setToSimpleContext(true); 67 | 68 | // relation 69 | Relation relation = Relation.PURPOSE; 70 | 71 | Extraction res = new Extraction( 72 | getClass().getSimpleName(), 73 | false, 74 | null, 75 | relation, 76 | true, 77 | Arrays.asList(leftConstituent, rightConstituent) 78 | ); 79 | 80 | return Optional.of(res); 81 | } 82 | 83 | return Optional.empty(); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/AdjectivalAdverbialMiddleFinalExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright 
© 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class AdjectivalAdverbialMiddleFinalExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public 
Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < VP=vp & << (/,/=comma1 $+ (ADJP|ADVP=ad ?$+ /,/=comma2)))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | // rephrase 55 | List leftConstituentWords = rephraseEnablement(matcher.getNode("ad"), matcher.getNode("vp")); 56 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 57 | leftConstituent.dontAllowSplit(); 58 | leftConstituent.setToSimpleContext(true); 59 | 60 | // the right, superordinate constituent 61 | List rightConstituentWords = new ArrayList<>(); 62 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("comma1"), false)); 63 | if (matcher.getNode("comma2") != null) { 64 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("comma2"), false)); 65 | } else { 66 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("ad"), false)); 67 | } 68 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 69 | 70 | // relation 71 | Relation relation = Relation.UNKNOWN_SUBORDINATION; //TODO 72 | 73 | Extraction res = new Extraction( 74 | getClass().getSimpleName(), 75 | false, 76 | null, 77 | relation, 78 | false, 79 | Arrays.asList(leftConstituent, rightConstituent) 80 | ); 81 | 82 | return Optional.of(res); 83 | } 84 | 85 | return Optional.empty(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/utils/TregexUtils.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : TregexUtils 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.utils; 24 | 25 | import edu.stanford.nlp.trees.Tree; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | 29 | import java.util.ArrayList; 30 | import java.util.HashMap; 31 | import java.util.List; 32 | 33 | /** 34 | * 35 | */ 36 | public class TregexUtils { 37 | 38 | public static List sortedFindAt(Tree parseTree, TregexPattern p, List groupsToOrder) { 39 | List res = new ArrayList<>(); 40 | 41 | TregexMatcher matcher = p.matcher(parseTree); 42 | while (matcher.findAt(parseTree)) { 43 | HashMap groups = new HashMap<>(); 44 | for (String name : matcher.getNodeNames()) { 45 | groups.put(name, matcher.getNode(name)); 46 | } 47 | res.add(new MyMatch(groups)); 48 | } 49 | 50 | // sort groups 51 | res.sort(new MyMatch.Comparator(parseTree, groupsToOrder)); 52 | 53 | return res; 
54 | } 55 | 56 | public static List sortedFind(Tree parseTree, TregexPattern p, List groupsToOrder) { 57 | List res = new ArrayList<>(); 58 | 59 | TregexMatcher matcher = p.matcher(parseTree); 60 | while (matcher.find()) { 61 | HashMap groups = new HashMap<>(); 62 | for (String name : matcher.getNodeNames()) { 63 | groups.put(name, matcher.getNode(name)); 64 | } 65 | res.add(new MyMatch(groups)); 66 | } 67 | 68 | // sort groups 69 | res.sort(new MyMatch.Comparator(parseTree, groupsToOrder)); 70 | 71 | return res; 72 | } 73 | 74 | public static class MyMatch { 75 | private final HashMap groups; 76 | 77 | public MyMatch(HashMap groups) { 78 | this.groups = groups; 79 | } 80 | 81 | public Tree getNode(String name) { 82 | if (groups.containsKey(name)) { 83 | return groups.get(name); 84 | } else { 85 | throw new IllegalArgumentException("No discourse_tree for name: '" + name + "'"); 86 | } 87 | } 88 | 89 | public static class Comparator implements java.util.Comparator { 90 | private final Tree anchorTree; 91 | private final List names; 92 | 93 | public Comparator(Tree anchorTree, List names) { 94 | this.anchorTree = anchorTree; 95 | this.names = names; 96 | } 97 | 98 | @Override 99 | public int compare(MyMatch myMatch, MyMatch otherMatch) { 100 | int myMatchValue = 0; 101 | int otherMatchValue = 0; 102 | for (String name : names) { 103 | myMatchValue += myMatch.getNode(name).nodeNumber(anchorTree); 104 | otherMatchValue += otherMatch.getNode(name).nodeNumber(anchorTree); 105 | } 106 | 107 | return myMatchValue - otherMatchValue; 108 | } 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SharedNPPostCoordinationExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * 
DiscourseSimplification : SharedNPPostCoordinationExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.Tree; 27 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 28 | import edu.stanford.nlp.trees.tregex.TregexPattern; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 32 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 34 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 35 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 36 | 37 | import java.util.ArrayList; 38 | import java.util.Arrays; 39 | import java.util.List; 40 | import java.util.Optional; 41 | 42 | /** 43 | * 
44 | */
/**
 * Extraction rule for sentences in which several sibling verb phrases follow
 * a common noun phrase. Every sibling verb phrase becomes its own constituent,
 * each surrounded by the words before the first and after the last sibling.
 */
public class SharedNPPostCoordinationExtractor extends ExtractionRule {

    /**
     * Applies the rule to the parse tree of {@code leaf}.
     *
     * @param leaf leaf whose parse tree is matched
     * @return the extraction built from the first pattern match, or an empty
     *         Optional when the pattern does not match
     * @throws ParseTreeException declared by the overridden method; may be raised
     *                            by the helpers invoked here
     */
    @Override
    public Optional<Extraction> extract(Leaf leaf) throws ParseTreeException {

        TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP <+(VP) (VP > VP=vp $.. VP))))");
        TregexMatcher matcher = p.matcher(leaf.getParseTree());

        // Only the first match is ever used, so a guard replaces the former one-shot loop.
        if (!matcher.findAt(leaf.getParseTree())) {
            return Optional.empty();
        }

        List<Tree> siblings = getSiblings(matcher.getNode("vp"), Arrays.asList("VP"));

        // constituents: shared prefix + one sibling + shared suffix
        List<Word> precedingWords = ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), siblings.get(0), false);
        List<Word> followingWords = ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), siblings.get(siblings.size() - 1), false);

        List<Leaf> constituents = new ArrayList<>();
        for (Tree sibling : siblings) {
            List<Word> words = new ArrayList<>();
            words.addAll(precedingWords);
            words.addAll(ParseTreeExtractionUtils.getContainingWords(sibling));
            words.addAll(followingWords);

            Leaf constituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(words));
            constituents.add(constituent);
        }

        // A cue phrase is only looked up for exactly two constituents: the words
        // between the first and the last sibling. Otherwise it stays null and the
        // relation keeps its coordination default.
        List<Word> cuePhraseWords = null;
        Relation relation = Relation.UNKNOWN_COORDINATION;
        if (constituents.size() == 2) {
            cuePhraseWords = ParseTreeExtractionUtils.getWordsInBetween(leaf.getParseTree(), siblings.get(0), siblings.get(siblings.size() - 1), false, false);
            relation = classifer.classifyCoordinating(cuePhraseWords).orElse(Relation.UNKNOWN_COORDINATION);
        }

        Extraction res = new Extraction(
                getClass().getSimpleName(),
                false,
                cuePhraseWords,
                relation,
                true,
                constituents
        );

        return Optional.of(res);
    }

}
94 | --------------------------------------------------------------------------------
/DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPostAttributionExtractor2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostAttributionExtractor2 4 | * 5 | * Copyright © 2018 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPostAttributionExtractor2 extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP=vp <+(VP) (SBAR=sbar <<, /that/ < (S=s)))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, !subordinate! 
constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 56 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 57 | 58 | // rephrase 59 | leftConstituentWords = rephraseIntraSententialAttribution(leftConstituentWords); 60 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 61 | leftConstituent.dontAllowSplit(); 62 | leftConstituent.setToSimpleContext(true); 63 | 64 | // the right, !superordinate! constituent 65 | List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s")); 66 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 67 | 68 | // relation 69 | Optional headVerb = getHeadVerb(matcher.getNode("vp")); 70 | 71 | // only extract if verb matches 72 | if (headVerb.isPresent() && classifer.checkAttribution(headVerb.get())) { 73 | Relation relation = Relation.ATTRIBUTION; 74 | 75 | Extraction res = new Extraction( 76 | getClass().getSimpleName(), 77 | false, 78 | null, 79 | relation, 80 | false, 81 | Arrays.asList(leftConstituent, rightConstituent) 82 | ); 83 | 84 | return Optional.of(res); 85 | } 86 | } 87 | 88 | return Optional.empty(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SubordinationPostAttributionExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostAttributionExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * 
GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SubordinationPostAttributionExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) 
throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP $.. (VP=vp <+(VP) (SBAR=sbar))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | System.out.println("Matched ............................... !!!!!!!!!"); 54 | 55 | // the left, !subordinate! constituent 56 | List leftConstituentWords = new ArrayList<>(); 57 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 58 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 59 | 60 | // rephrase 61 | leftConstituentWords = rephraseIntraSententialAttribution(leftConstituentWords); 62 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 63 | leftConstituent.dontAllowSplit(); 64 | leftConstituent.setToSimpleContext(true); 65 | 66 | // the right, !superordinate! 
constituent 67 | List rightConstituentWords = ParseTreeExtractionUtils.getContainingWords(matcher.getNode("sbar")); 68 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 69 | 70 | // relation 71 | Optional headVerb = getHeadVerb(matcher.getNode("vp")); 72 | 73 | // only extract if verb matches 74 | if (headVerb.isPresent() && classifer.checkAttribution(headVerb.get())) { 75 | Relation relation = Relation.ATTRIBUTION; 76 | 77 | Extraction res = new Extraction( 78 | getClass().getSimpleName(), 79 | false, 80 | null, 81 | relation, 82 | false, 83 | Arrays.asList(leftConstituent, rightConstituent) 84 | ); 85 | 86 | return Optional.of(res); 87 | } 88 | } 89 | 90 | return Optional.empty(); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/model/Element.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : Element 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.model; 24 | 25 | import com.fasterxml.jackson.annotation.JsonIgnoreProperties; 26 | import com.fasterxml.jackson.annotation.JsonProperty; 27 | import edu.stanford.nlp.trees.Tree; 28 | import org.lambda3.text.simplification.discourse.utils.IDGenerator; 29 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 30 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 31 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeParser; 32 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 33 | 34 | import java.util.ArrayList; 35 | import java.util.List; 36 | 37 | /** 38 | * 39 | */ 40 | @JsonIgnoreProperties(ignoreUnknown = true) 41 | public class Element { 42 | private String id; 43 | private Tree parseTree; 44 | private int sentenceIdx; 45 | private int contextLayer; 46 | private List simpleContexts; 47 | private List linkedContexts; 48 | 49 | // for deserialization 50 | public Element() { 51 | } 52 | 53 | public Element(Tree parseTree, int sentenceIdx, int contextLayer) { 54 | this.id = IDGenerator.generateUUID(); 55 | this.parseTree = parseTree; 56 | this.sentenceIdx = sentenceIdx; 57 | this.contextLayer = contextLayer; 58 | this.simpleContexts = new ArrayList<>(); 59 | this.linkedContexts = new ArrayList<>(); 60 | } 61 | 62 | // not efficient -> prefer to use constructor with tree 63 | public Element(String text, int sentenceIdx, int contextLayer) throws ParseTreeException { 64 | this(ParseTreeParser.parse(text), sentenceIdx, contextLayer); 65 | } 66 | 67 | public void addLinkedContext(LinkedContext context) { 68 | if (!linkedContexts.contains(context)) { 69 | linkedContexts.add(context); 70 | } 71 | } 72 | 73 | public void addSimpleContext(SimpleContext context) { 74 | if (!simpleContexts.contains(context)) { 75 
| simpleContexts.add(context); 76 | } 77 | } 78 | 79 | public String getId() { 80 | return id; 81 | } 82 | 83 | public Tree getParseTree() { 84 | return parseTree; 85 | } 86 | 87 | public void setParseTree(Tree parseTree) { 88 | this.parseTree = parseTree; 89 | } 90 | 91 | @JsonProperty("text") 92 | public String getText() { 93 | return WordsUtils.wordsToString(ParseTreeExtractionUtils.getContainingWords(parseTree)); 94 | } 95 | 96 | public int getSentenceIdx() { 97 | return sentenceIdx; 98 | } 99 | 100 | public int getContextLayer() { 101 | return contextLayer; 102 | } 103 | 104 | public List getSimpleContexts() { 105 | return simpleContexts; 106 | } 107 | 108 | public List getLinkedContexts() { 109 | return linkedContexts; 110 | } 111 | 112 | @Override 113 | public String toString() { 114 | StringBuilder strb = new StringBuilder(); 115 | strb.append(id + " " + contextLayer + " " + getText() + "\n"); 116 | getSimpleContexts().forEach(c -> strb.append("\tS:" + c.getRelation() + " " + c.getText() + "\n")); 117 | getLinkedContexts().forEach(c -> strb.append("\tL:" + c.getRelation() + " " + c.getTargetID() + "\n")); 118 | return strb.toString(); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/NonRestrictiveRelativeClauseWhereExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 
12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class NonRestrictiveRelativeClauseWhereExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV << (/.*/=head < (NP|PP $+ (/,/=comma $+ (SBAR=sbar <, (WHADVP $+ S=s & <<: WRB) & ?$+ /,/=comma2)))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while 
(matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, superordinate constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("comma"), false)); 56 | if (matcher.getNode("comma2") != null) { 57 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("comma2"), false)); 58 | } else { 59 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 60 | } 61 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 62 | 63 | // the right, subordinate constituent 64 | List rightConstituentWords = new ArrayList<>(); 65 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s"))); 66 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 67 | 68 | // relation 69 | //List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); 70 | //Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.UNKNOWN_SUBORDINATION); 71 | 72 | Relation relation = Relation.SPATIAL; 73 | 74 | //TODO not always doDiscourseExtraction? 
75 | Extraction res = new Extraction( 76 | getClass().getSimpleName(), 77 | false, 78 | null,//cuePhraseWords, 79 | relation, 80 | true, 81 | Arrays.asList(leftConstituent, rightConstituent) 82 | ); 83 | 84 | return Optional.of(res); 85 | } 86 | 87 | return Optional.empty(); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/RestrictiveParticipialExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class RestrictiveParticipialExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | //TregexPattern p = TregexPattern.compile("ROOT <<: (S=s << (NP=np <, (NP $+ (VP=vp <, VBN|VBG=vbgn)))) "); 49 | TregexPattern p = TregexPattern.compile("ROOT <<: (S=s < VP=mainverb &<< (NP|PP=head <, (NP=np $+ (VP=vp [<, (ADVP|PP $+ VBG|VBN=vbgn) | <, VBG|VBN=vbgn] )) & [> (PP !> S)| > (VP > S)]))"); 50 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 51 | 52 | while (matcher.findAt(leaf.getParseTree())) { 53 | 54 | 55 | // the left, superordinate constituent 56 | List leftConstituentWords = new ArrayList<>(); 57 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("vp"), false)); 58 | 
leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("vp"), false)); 59 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 60 | 61 | // the right, subordinate constituent 62 | List rightConstituentWords = new ArrayList<>(); 63 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(matcher.getNode("np"))); 64 | rightConstituentWords.addAll(rephraseAppositionNonRes(matcher.getNode("mainverb"), matcher.getNode("np"), matcher.getNode("vbgn"))); 65 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(matcher.getNode("head"), matcher.getNode("vbgn"), false)); 66 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 67 | 68 | 69 | List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("vbgn"), matcher.getNode("s"), false); 70 | Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.IDENTIFYING_DEFINITION); 71 | 72 | //TODO not always doDiscourseExtraction? 
73 | Extraction res = new Extraction( 74 | getClass().getSimpleName(), 75 | false, 76 | cuePhraseWords, 77 | relation, 78 | true, 79 | Arrays.asList(leftConstituent, rightConstituent) 80 | ); 81 | 82 | return Optional.of(res); 83 | } 84 | 85 | return Optional.empty(); 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SharedNPPreParticipalExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SharedNPPreParticipalExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */

package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules;

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation;
import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule;
import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction;
import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf;
import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException;
import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils;
import org.lambda3.text.simplification.discourse.utils.words.WordsUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

/**
 * Extraction rule for a participial clause (an S consisting of a VP that
 * starts with VBG/VBN, optionally wrapped in PP/ADVP) that precedes the
 * subject NP and main VP of the sentence. The clause is rephrased with the
 * shared NP into a constituent of its own.
 */
public class SharedNPPreParticipalExtractor extends ExtractionRule {

    /**
     * Applies the rule to the parse tree of {@code leaf}.
     *
     * @param leaf leaf whose parse tree is matched
     * @return the extraction built from the first pattern match, or an empty
     *         Optional when the pattern does not match
     * @throws ParseTreeException declared by the overridden method; may be raised
     *                            by the helpers invoked here
     */
    @Override
    public Optional<Extraction> extract(Leaf leaf) throws ParseTreeException {

        String participalNode = "(__=node [== S=s | == (PP|ADVP <+(PP|ADVP) S=s)]) : (=s <: (VP <<, VBG|VBN=vbgn))";
        TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < " + participalNode + ") : (=node $.. (NP=np $.. VP=vp))");
        TregexMatcher matcher = p.matcher(leaf.getParseTree());

        // Only the first match is ever used, so a guard replaces the former one-shot loop.
        if (!matcher.findAt(leaf.getParseTree())) {
            return Optional.empty();
        }

        // cue phrase: the words of the matched node that precede the participial S
        List<Word> cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("node"), matcher.getNode("s"), false);

        // the left, subordinate constituent
        List<Word> leftConstituentWords = new ArrayList<>();
        leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("node"), false));
        leftConstituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(matcher.getNode("np")));
        leftConstituentWords.addAll(getRephrasedParticipalS(matcher.getNode("np"), matcher.getNode("vp"), matcher.getNode("s"), matcher.getNode("vbgn")));
        leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("vp"), false));
        Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords));

        // the right, superordinate constituent: the sentence without the matched node
        List<Word> rightConstituentWords = new ArrayList<>();
        rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("node"), false));
        rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("node"), false));
        Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords));

        // relation: this is a subordination (classified with classifySubordinating),
        // so an unclassified cue phrase falls back to the subordination default
        // rather than the coordination one.
        Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.UNKNOWN_SUBORDINATION);

        Extraction res = new Extraction(
                getClass().getSimpleName(),
                false,
                cuePhraseWords,
                relation,
                false,
                Arrays.asList(leftConstituent, rightConstituent)
        );

        return Optional.of(res);
    }

}
-------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/Relation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : Relation 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
/**
 * Rhetorical relations that can be assigned to the nodes of a discourse tree.
 *
 * <p>Every relation carries a flag telling whether it is a coordination, a "regular" and an
 * "inverse" relation (both default to the relation itself) and, optionally, a coordinate or
 * subordinate counterpart. The non-default values are wired up in the static initializer below,
 * because enum constants cannot reference constants declared later from their constructors.
 */
public enum Relation {

    UNKNOWN,

    // Coordinations
    UNKNOWN_COORDINATION, // the default for coordination
    CONTRAST,
    CAUSE_C,
    RESULT_C,
    LIST,
    DISJUNCTION,
    TEMPORAL_AFTER_C,
    TEMPORAL_BEFORE_C,

    // Subordinations
    UNKNOWN_SUBORDINATION, // the default for subordination
    ATTRIBUTION,
    BACKGROUND,
    CAUSE,
    RESULT,
    CONDITION,
    ELABORATION,
    PURPOSE,
    TEMPORAL_AFTER,
    TEMPORAL_BEFORE,

    // for sentence simplification
    NOUN_BASED,
    SPATIAL,
    TEMPORAL,
    TEMPORAL_TIME, // indicating a particular instance on a time scale (e.g. "Next Sunday 2 pm").
    TEMPORAL_DURATION, // the amount of time between the two end-points of a time interval (e.g. "2 weeks").
    TEMPORAL_DATE, // particular date (e.g. "On 7 April 2013").
    TEMPORAL_SET, // periodic temporal sets representing times that occur with some frequency ("Every Tuesday").
    IDENTIFYING_DEFINITION,
    DESCRIBING_DEFINITION;

    static {
        // relations that are coordinations
        UNKNOWN_COORDINATION.coordination = true;
        CONTRAST.coordination = true;
        CAUSE_C.coordination = true;
        RESULT_C.coordination = true;
        LIST.coordination = true;
        DISJUNCTION.coordination = true;
        TEMPORAL_AFTER_C.coordination = true;
        TEMPORAL_BEFORE_C.coordination = true;

        // subordination -> coordinate counterpart
        CAUSE.coordinateVersion = CAUSE_C;
        RESULT.coordinateVersion = RESULT_C;
        TEMPORAL_AFTER.coordinateVersion = TEMPORAL_AFTER_C;
        TEMPORAL_BEFORE.coordinateVersion = TEMPORAL_BEFORE_C;

        // coordination -> subordinate counterpart
        CAUSE_C.subordinateVersion = CAUSE;
        RESULT_C.subordinateVersion = RESULT;
        TEMPORAL_AFTER_C.subordinateVersion = TEMPORAL_AFTER;
        TEMPORAL_BEFORE_C.subordinateVersion = TEMPORAL_BEFORE;

        // inverse pairs (all other relations are their own inverse)
        CAUSE_C.inverse = RESULT_C;
        RESULT_C.inverse = CAUSE_C;
        TEMPORAL_AFTER_C.inverse = TEMPORAL_BEFORE_C;
        TEMPORAL_BEFORE_C.inverse = TEMPORAL_AFTER_C;
        CAUSE.inverse = RESULT;
        RESULT.inverse = CAUSE;
        TEMPORAL_AFTER.inverse = TEMPORAL_BEFORE;
        TEMPORAL_BEFORE.inverse = TEMPORAL_AFTER;
    }

    private boolean coordination;
    private Relation regular; // class of context span (in subordination) or right span (coordination)
    private Relation inverse; // class of core span (in subordination) or left span (coordination)
    private Relation coordinateVersion; // optional
    private Relation subordinateVersion; // optional

    Relation() {
        this.coordination = false;
        this.regular = this;
        this.inverse = this; // default; overridden for the cause/result and temporal pairs above
        this.coordinateVersion = null;
        this.subordinateVersion = null;
    }

    /**
     * @return {@code true} if this relation was marked as a coordination in the static initializer
     */
    public boolean isCoordination() {
        return coordination;
    }

    /**
     * @return the regular relation (always the relation itself in the current configuration)
     */
    public Relation getRegularRelation() {
        return regular;
    }

    /**
     * Misspelled predecessor of {@link #getRegularRelation()}, kept so existing callers still compile.
     *
     * @return the same value as {@link #getRegularRelation()}
     * @deprecated use {@link #getRegularRelation()}
     */
    @Deprecated
    public Relation getRegulatRelation() {
        return getRegularRelation();
    }

    /**
     * @return the inverse relation; the relation itself unless an inverse pair was configured
     */
    public Relation getInverseRelation() {
        return inverse;
    }

    /**
     * @return the coordinate counterpart of this relation, or empty if none is configured
     */
    public Optional<Relation> getCoordinateVersion() {
        return Optional.ofNullable(coordinateVersion);
    }

    /**
     * @return the subordinate counterpart of this relation, or empty if none is configured
     */
    public Optional<Relation> getSubordinateVersion() {
        return Optional.ofNullable(subordinateVersion);
    }
}
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules.ListNP; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.utils.ListNPSplitter; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.List; 38 | import java.util.Optional; 39 | 40 | /** 41 | * 42 | */ 43 | public abstract class ListNPExtractor extends ExtractionRule { 44 | private final String pattern; 45 | 46 | public ListNPExtractor(String pattern) { 47 | this.pattern = pattern; 48 | } 49 | 50 | @Override 51 | public Optional extract(Leaf leaf) throws ParseTreeException { 52 | 53 | TregexPattern p = TregexPattern.compile(pattern); 54 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 55 | 56 | while (matcher.findAt(leaf.getParseTree())) { 57 | 58 | Optional r = ListNPSplitter.splitList(leaf.getParseTree(), matcher.getNode("np")); 59 | if (r.isPresent()) { 60 | 61 | // constituents 62 | List precedingWords = ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("np"), false); 63 | List followingWords = ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("np"), false); 64 | 65 | List 
constituents = new ArrayList<>(); 66 | 67 | if (r.get().getIntroductionWords().isPresent()) { 68 | List words = new ArrayList(); 69 | words.addAll(precedingWords); 70 | words.addAll(r.get().getIntroductionWords().get()); 71 | words.addAll(followingWords); 72 | 73 | Leaf constituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(words)); 74 | constituent.dontAllowSplit(); 75 | constituents.add(constituent); 76 | } 77 | 78 | for (List element : r.get().getElementsWords()) { 79 | List words = new ArrayList(); 80 | words.addAll(precedingWords); 81 | words.addAll(element); 82 | words.addAll(followingWords); 83 | 84 | Leaf constituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(words)); 85 | constituent.dontAllowSplit(); 86 | constituents.add(constituent); 87 | } 88 | 89 | 90 | Extraction res = new Extraction( 91 | getClass().getSimpleName(), 92 | false, 93 | null, 94 | r.get().getRelation(), 95 | true, 96 | constituents 97 | ); 98 | 99 | return Optional.of(res); 100 | } 101 | } 102 | 103 | return Optional.empty(); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/SharedNPPostParticipalExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SharedNPPostParticipalExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 
12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class SharedNPPostParticipalExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | 49 | String participalNode = "(__=node [== S=s | == (PP|ADVP <+(PP|ADVP) S=s)]) : (=s <: (VP <<, VBG|VBN=vbgn))"; 50 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV < (NP=np $.. (VP=vp <+(VP) (NP|PP $.. 
" + participalNode + "))))"); 51 | 52 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 53 | 54 | while (matcher.findAt(leaf.getParseTree())) { 55 | List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("node"), matcher.getNode("s"), false); 56 | 57 | 58 | // the left, superordinate constituent 59 | List leftConstituentWords = new ArrayList<>(); 60 | // leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("node"), false)); 61 | 62 | 63 | // the left, superordinate constituent 64 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 65 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 66 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 67 | 68 | // the right, subordinate constituent 69 | List rightConstituentWords = new ArrayList<>(); 70 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("vp"), false)); 71 | rightConstituentWords.addAll(getRephrasedParticipalS(matcher.getNode("np"), matcher.getNode("vp"), matcher.getNode("s"), matcher.getNode("vbgn"))); 72 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("s"), false)); 73 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 74 | 75 | // relation 76 | Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.UNKNOWN_COORDINATION); 77 | 78 | Extraction res = new Extraction( 79 | getClass().getSimpleName(), 80 | false, 81 | null, 82 | relation, 83 | true, 84 | Arrays.asList(leftConstituent, rightConstituent) 85 | ); 86 | 87 | return Optional.of(res); 88 | } 89 | 90 | return Optional.empty(); 91 
| } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/java/org/lambda3/text/simplification/discourse/runner/discourse_tree/extraction/rules/NonRestrictiveRelativeClausePrepWhichWhoExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ==========================License-Start============================= 3 | * DiscourseSimplification : SubordinationPostExtractor 4 | * 5 | * Copyright © 2017 Lambda³ 6 | * 7 | * GNU General Public License 3 8 | * This program is free software: you can redistribute it and/or modify 9 | * it under the terms of the GNU General Public License as published by 10 | * the Free Software Foundation, either version 3 of the License, or 11 | * (at your option) any later version. 12 | * 13 | * This program is distributed in the hope that it will be useful, 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | * GNU General Public License for more details. 17 | * 18 | * You should have received a copy of the GNU General Public License 19 | * along with this program. If not, see http://www.gnu.org/licenses/. 
20 | * ==========================License-End============================== 21 | */ 22 | 23 | package org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.rules; 24 | 25 | import edu.stanford.nlp.ling.Word; 26 | import edu.stanford.nlp.trees.tregex.TregexMatcher; 27 | import edu.stanford.nlp.trees.tregex.TregexPattern; 28 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.Relation; 29 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.Extraction; 30 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.extraction.ExtractionRule; 31 | import org.lambda3.text.simplification.discourse.runner.discourse_tree.model.Leaf; 32 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeException; 33 | import org.lambda3.text.simplification.discourse.utils.parseTree.ParseTreeExtractionUtils; 34 | import org.lambda3.text.simplification.discourse.utils.words.WordsUtils; 35 | 36 | import java.util.ArrayList; 37 | import java.util.Arrays; 38 | import java.util.List; 39 | import java.util.Optional; 40 | 41 | /** 42 | * 43 | */ 44 | public class NonRestrictiveRelativeClausePrepWhichWhoExtractor extends ExtractionRule { 45 | 46 | @Override 47 | public Optional extract(Leaf leaf) throws ParseTreeException { 48 | TregexPattern p = TregexPattern.compile("ROOT <<: (S|SINV << (NP=head <, NP=np & < (/,/=comma $+ (SBAR=sbar <, (WHPP $+ S=s & <, IN=prep & <- WHNP) & ?$+ /,/=comma2))))"); 49 | TregexMatcher matcher = p.matcher(leaf.getParseTree()); 50 | 51 | while (matcher.findAt(leaf.getParseTree())) { 52 | 53 | // the left, superordinate constituent 54 | List leftConstituentWords = new ArrayList<>(); 55 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getPrecedingWords(leaf.getParseTree(), matcher.getNode("comma"), false)); 56 | if (matcher.getNode("comma2") != null) { 57 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), 
matcher.getNode("comma2"), false)); 58 | } else { 59 | leftConstituentWords.addAll(ParseTreeExtractionUtils.getFollowingWords(leaf.getParseTree(), matcher.getNode("sbar"), false)); 60 | } 61 | Leaf leftConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(leftConstituentWords)); 62 | 63 | // the right, subordinate constituent 64 | List rightConstituentWords = new ArrayList<>(); 65 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(matcher.getNode("s"))); 66 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getContainingWords(matcher.getNode("prep"))); 67 | rightConstituentWords.addAll(ParseTreeExtractionUtils.getWordsInBetween(leaf.getParseTree(), matcher.getNode("np"), matcher.getNode("comma"), true, false)); 68 | Leaf rightConstituent = new Leaf(getClass().getSimpleName(), WordsUtils.wordsToProperSentenceString(rightConstituentWords)); 69 | 70 | // relation 71 | List cuePhraseWords = ParseTreeExtractionUtils.getPrecedingWords(matcher.getNode("sbar"), matcher.getNode("s"), false); 72 | Relation relation = classifer.classifySubordinating(cuePhraseWords).orElse(Relation.DESCRIBING_DEFINITION); 73 | 74 | //TODO not always doDiscourseExtraction? 
75 | Extraction res = new Extraction( 76 | getClass().getSimpleName(), 77 | false, 78 | cuePhraseWords, 79 | relation, 80 | true, 81 | Arrays.asList(leftConstituent, rightConstituent) 82 | ); 83 | 84 | return Optional.of(res); 85 | } 86 | 87 | return Optional.empty(); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /DiscourseSimplification/src/main/resources/cue_phrases.conf: -------------------------------------------------------------------------------- 1 | cue_phrases { 2 | default_phrases { 3 | matching = contained 4 | phrases { 5 | // CONTRAST 6 | "although" = CONTRAST 7 | "but" = CONTRAST 8 | "but now" = CONTRAST 9 | "despite" = CONTRAST 10 | "even though" = CONTRAST 11 | "even when" = CONTRAST 12 | "except when" = CONTRAST 13 | "however" = CONTRAST 14 | "instead" = CONTRAST 15 | "rather" = CONTRAST 16 | "still" = CONTRAST 17 | "though" = CONTRAST 18 | "thus" = CONTRAST 19 | "until recently" = CONTRAST 20 | "while" = CONTRAST 21 | "yet" = CONTRAST 22 | 23 | // LIST 24 | "and" = LIST 25 | "in addition" = LIST 26 | "in addition to" = LIST 27 | "moreover" = LIST 28 | 29 | // DISJUNCTION 30 | "or" = DISJUNCTION 31 | 32 | // CAUSE 33 | // "largely because" = CAUSE_C 34 | "because" = CAUSE_C // changed from EXPLANATION TO CAUSE 35 | "since" = CAUSE_C // changed from TEMPORAL_AFTER TO CAUSE 36 | 37 | // RESULT 38 | "as a result" = RESULT_C 39 | "as a result of" = RESULT_C 40 | 41 | // TEMPORAL_AFTER 42 | "after" = TEMPORAL_AFTER_C 43 | "and after" = TEMPORAL_AFTER_C 44 | "next" = TEMPORAL_AFTER_C 45 | "then" = TEMPORAL_AFTER_C 46 | 47 | // TEMPORAL_BEFORE 48 | "before" = TEMPORAL_BEFORE_C 49 | "previously" = TEMPORAL_BEFORE_C // changed from BACKGROUND TO TEMPORAL_BEFORE 50 | 51 | // BACKGROUND 52 | "as" = BACKGROUND 53 | "now" = BACKGROUND 54 | "once" = BACKGROUND 55 | "when" = BACKGROUND 56 | "with" = BACKGROUND 57 | "without" = BACKGROUND 58 | 59 | // CONDITION 60 | "if" = CONDITION 61 | "in case" = 
CONDITION 62 | "unless" = CONDITION 63 | "until" = CONDITION 64 | 65 | // ELABORATION 66 | "more provocatively" = ELABORATION 67 | "even before" = ELABORATION 68 | "for example" = ELABORATION 69 | "further" = ELABORATION 70 | "recently" = ELABORATION 71 | "since(\\W(.*?\\W)?)now" = ELABORATION 72 | "so" = ELABORATION 73 | "so far" = ELABORATION 74 | "where" = ELABORATION 75 | "whereby" = ELABORATION 76 | "whether" = ELABORATION 77 | 78 | // // EXPLANATION 79 | // "simply because" = EXPLANATION 80 | // "because of" = EXPLANATION 81 | // "indeed" = EXPLANATION 82 | // "so(\\W(.*?\\W)?)that" = EXPLANATION 83 | } 84 | } 85 | 86 | subordinating_phrases { 87 | matching = contained 88 | phrases = ${cue_phrases.default_phrases.phrases} 89 | phrases { 90 | // CAUSE 91 | // "largely because" = CAUSE 92 | "because" = CAUSE // changed from EXPLANATION TO CAUSE 93 | "since" = CAUSE // changed from TEMPORAL_AFTER TO CAUSE 94 | 95 | // RESULT 96 | "as a result" = RESULT 97 | "as a result of" = RESULT 98 | 99 | // TEMPORAL_AFTER 100 | "after" = TEMPORAL_BEFORE 101 | "and after" = TEMPORAL_BEFORE 102 | "next" = TEMPORAL_AFTER 103 | "then" = TEMPORAL_AFTER 104 | 105 | // TEMPORAL_BEFORE 106 | "before" = TEMPORAL_AFTER 107 | "previously" = TEMPORAL_AFTER // changed from BACKGROUND TO TEMPORAL_BEFORE 108 | } 109 | } 110 | 111 | coordinating_phrases { 112 | matching = contained 113 | phrases = ${cue_phrases.default_phrases.phrases} 114 | } 115 | 116 | adverbial_phrases { 117 | matching = exact 118 | phrases = ${cue_phrases.default_phrases.phrases} 119 | phrases { 120 | // CAUSE 121 | // "largely because(\\W(.*?\\W)?)(this|that)" = CAUSE_C 122 | "because(\\W(.*?\\W)?)(this|that)" = CAUSE_C // changed from EXPLANATION TO CAUSE 123 | 124 | // RESULT 125 | "as a result(\\W(.*?\\W)?)(this|that)" = RESULT_C 126 | "as a result of(\\W(.*?\\W)?)(this|that)" = RESULT_C 127 | 128 | // TEMPORAL_AFTER 129 | "after(\\W(.*?\\W)?)(this|that)" = TEMPORAL_AFTER_C 130 | "and 
after(\\W(.*?\\W)?)(this|that)" = TEMPORAL_AFTER_C 131 | 132 | // TEMPORAL_BEFORE 133 | "before(\\W(.*?\\W)?)(this|that)" = TEMPORAL_BEFORE_C 134 | "previously(\\W(.*?\\W)?)(this|that)" = TEMPORAL_BEFORE_C // changed from BACKGROUND TO TEMPORAL_BEFORE 135 | } 136 | } 137 | } --------------------------------------------------------------------------------