├── topictiling.png ├── de.tudarmstadt.langtech.semantics.segmentation.topictiling ├── scripts │ ├── topictiling.sh │ ├── package.sh │ └── topictiling.bat ├── src │ ├── main │ │ ├── resources │ │ │ └── desc │ │ │ │ └── type │ │ │ │ ├── Segment.xml │ │ │ │ ├── SegmentQuantity.xml │ │ │ │ ├── CohesionIndicator.xml │ │ │ │ ├── SegmentScore.xml │ │ │ │ └── GibbsLdaDescriptor.xml │ │ └── java │ │ │ └── de │ │ │ └── tudarmstadt │ │ │ └── langtech │ │ │ ├── semantics │ │ │ ├── type │ │ │ │ ├── Segment.java │ │ │ │ ├── Segment_Type.java │ │ │ │ ├── SegmentQuantity.java │ │ │ │ ├── SegmentQuantity_Type.java │ │ │ │ ├── SegmentScore.java │ │ │ │ └── SegmentScore_Type.java │ │ │ └── segmentation │ │ │ │ └── segmenter │ │ │ │ ├── annotator │ │ │ │ ├── SimpleSegmenter.java │ │ │ │ ├── OutputSegments.java │ │ │ │ ├── TopicTilingDocumentSegmenterAnnotator.java │ │ │ │ └── TopicTilingSegmenterAnnotator.java │ │ │ │ ├── RunTopicTilingOnFile.java │ │ │ │ ├── TopicTilingTopicDocument.java │ │ │ │ └── TextTilingWindowOptimized.java │ │ │ └── ml │ │ │ └── lda │ │ │ └── type │ │ │ ├── GibbsLdaTopic.java │ │ │ └── GibbsLdaTopic_Type.java │ └── test │ │ └── java │ │ ├── TestSimpleReader.java │ │ └── RunTopicTilingOnFile.java ├── .project ├── .classpath ├── pom.xml └── README.txt ├── split_output.py ├── de.tudarmstadt.langtech.lda ├── .project ├── pom.xml ├── src │ ├── main │ │ ├── java │ │ │ ├── jgibbslda │ │ │ │ ├── Constants.java │ │ │ │ ├── Conversion.java │ │ │ │ ├── LDACmdOption.java │ │ │ │ ├── Pair.java │ │ │ │ ├── LDA.java │ │ │ │ ├── Document.java │ │ │ │ ├── Estimator.java │ │ │ │ ├── Dictionary.java │ │ │ │ ├── LogSaveEstimator.java │ │ │ │ ├── Inferencer.java │ │ │ │ └── LDADataset.java │ │ │ └── de │ │ │ │ └── tudarmstadt │ │ │ │ └── langtech │ │ │ │ └── lda │ │ │ │ ├── consumer │ │ │ │ └── GibbsLdaModelGeneratorConsumer.java │ │ │ │ ├── annotator │ │ │ │ ├── GibbsLdaDocumentBasedTopicIdAnnotator.java │ │ │ │ ├── GibbsLdaSentenceBasedTopicIdAnnotator.java │ │ │ │ ├── GibbsLdaTopicModelAnnotator.java │ │ │ │ └── GibbsLdaTopicIdAnnotator.java │ │ │ │ └── type │ │ │ │ ├── Topic.java │ │ │ │ ├── Topic_Type.java │ │ │ │ ├── TopicDistribution.java │ │ │ │ ├── WordTopicDistribution.java │ │ │ │ ├── TopicDistribution_Type.java │ │ │ │ └── WordTopicDistribution_Type.java │ │ └── resources │ │ │ └── desc │ │ │ └── type │ │ │ └── gibbsldatypes.xml │ └── test │ │ └── java │ │ └── de │ │ └── tudarmstadt │ │ └── langtech │ │ └── lda │ │ └── TestLdaTopicModelAnnotator.java └── .classpath └── README.md /topictiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/riedlma/topictiling/HEAD/topictiling.png -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.sh: -------------------------------------------------------------------------------- 1 | java -Xmx1G -cp $(echo dependency/*jar| tr ' ' ':'):de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile $@ 2 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/package.sh: -------------------------------------------------------------------------------- 1 | outp=topictiling_0.0.2 2 | 3 | cd ../de.tudarmstadt.langtech.lda 4 | mvn package 5 | mvn install 6 | cd 
../de.tudarmstadt.langtech.semantics.segmentation.topictiling 7 | mvn package 8 | mvn dependency:copy-dependencies 9 | 10 | mkdir $outp 11 | cp target/*jar $outp 12 | cp -r target/dependency $outp 13 | cp scripts/top*sh $outp 14 | cp scripts/top*bat $outp 15 | 16 | cp README.txt $outp -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | SegmentDescriptor 4 | 5 | 1.0 6 | 7 | 8 | 9 | de.tudarmstadt.langtech.semantics.type.Segment 10 | 11 | uima.tcas.Annotation 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /split_output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | topic_output = sys.argv[1] 5 | output_folder = sys.argv[2] 6 | 7 | if not os.path.exists(output_folder): 8 | os.makedirs(output_folder) 9 | 10 | out = "" 11 | out_filename = "" 12 | for l in open(topic_output): 13 | out+=l 14 | if l.strip()=="": 15 | out="" 16 | if l.strip().startswith(""): 17 | docname = l.strip().replace("","").replace("","") 18 | if l.strip().startswith(""): 19 | fw = open(os.path.join(output_folder,docname),"w") 20 | fw.write(out) 21 | fw.close() 22 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | de.tudarmstadt.langtech.lda 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | de.tudarmstadt.langtech.semantics.segmentation.topictiling 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentQuantity.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | SegmentQuantity 4 | 5 | 1.0 6 | 7 | 8 | 9 | de.tudarmstadt.langtech.semantics.type.SegmentQuantity 10 | Saves the number of segments a document should consist of according to a given gold-standard. 11 | uima.tcas.Annotation 12 | 13 | 14 | segmentCount 15 | 16 | uima.cas.Integer 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/CohesionIndicator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | CohesionIndicator 4 | 5 | 1.0 6 | 7 | 8 | 9 | de.tudarmstadt.ukp.dkpro.semantics.type.CohesionIndicator 10 | Marks a range that is relevant for cohesion. This may be, for instance, a Lemma. 
11 | uima.tcas.Annotation 12 | 13 | 14 | stringRepresentation 15 | 16 | uima.cas.String 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml: -------------------------------------------------------------------------------- 1 | 2 | SegmentScore 3 | 4 | 1.0 5 | 6 | 7 | 8 | de.tudarmstadt.langtech.semantics.type.SegmentScore 9 | 10 | uima.tcas.Annotation 11 | 12 | 13 | score 14 | 15 | uima.cas.Double 16 | 17 | 18 | similarityScores 19 | 20 | uima.cas.String 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | GibbsLdaDescriptor 4 | 5 | 1.0 6 | 7 | 8 | 9 | de.tudarmstadt.langtech.ml.lda.type.GibbsLdaTopic 10 | 11 | uima.tcas.Annotation 12 | 13 | 14 | topic 15 | 16 | uima.cas.Integer 17 | 18 | 19 | termId 20 | 21 | uima.cas.Integer 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | de.tudarmstadt.ukp.dkpro 4 | de.tudarmstadt.ukp.dkpro.lda 5 | 0.0.1-SNAPSHOT 6 | 7 | 8 | args4j 9 | args4j 10 | 2.0.16 11 | jar 12 | compile 13 | 14 | 15 | de.tudarmstadt.ukp.dkpro.core 16 | de.tudarmstadt.ukp.dkpro.core.api.metadata-asl 17 | 1.4.0 18 | 19 | 20 | de.tudarmstadt.ukp.dkpro.core 21 | de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl 22 | 1.4.0 23 | 24 | 25 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/TestSimpleReader.java: -------------------------------------------------------------------------------- 1 | import java.io.File; 2 | import java.io.IOException; 3 | 4 | import org.apache.commons.io.FileUtils; 5 | import org.apache.uima.UIMAException; 6 | import org.apache.uima.analysis_engine.AnalysisEngine; 7 | import org.apache.uima.jcas.JCas; 8 | import org.uimafit.factory.AnalysisEngineFactory; 9 | import org.uimafit.factory.JCasFactory; 10 | import org.uimafit.pipeline.SimplePipeline; 11 | import org.uimafit.util.JCasUtil; 12 | 13 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter; 14 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 15 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 16 | 17 | public class TestSimpleReader { 18 | public static void main(String[] args) throws UIMAException, IOException { 19 | // String f = "test.txt"; 20 | // JCas jcas = JCasFactory.createJCas(); 21 | // jcas.setDocumentText(FileUtils.readFileToString(new File(f))); 22 | // AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class); 23 | // SimplePipeline.runPipeline(jcas, segmenter); 24 | // for(Sentence s:JCasUtil.select(jcas, Sentence.class)){ 25 | // System.out.println(s.getCoveredText()); 26 | // for (Token t: JCasUtil.selectCovered( Token.class,s)){ 27 | // System.out.println(t.getCoveredText()); 28 | // } 29 | // } 30 | 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- 
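Note: the test class above (TestSimpleReader.java) is fully commented out. As a reference, a minimal runnable sketch of the same pipeline — assuming a local plain-text input file such as the test.txt mentioned in the commented code — could look like this; it builds a JCas from the file, runs SimpleSegmenter with its default boundaries, and prints the resulting sentences and tokens:

// Hedged sketch of the commented-out TestSimpleReader pipeline; "test.txt" is a
// placeholder input file taken from the comments and is not shipped with the project.
import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.JCasFactory;
import org.uimafit.pipeline.SimplePipeline;
import org.uimafit.util.JCasUtil;
import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class SimpleSegmenterExample {
    public static void main(String[] args) throws Exception {
        // Read the raw text and attach it to a fresh JCas
        JCas jcas = JCasFactory.createJCas();
        jcas.setDocumentText(FileUtils.readFileToString(new File("test.txt")));
        // SimpleSegmenter splits sentences on '\n' and tokens on ' ' by default
        AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
        SimplePipeline.runPipeline(jcas, segmenter);
        // Print each detected sentence followed by its tokens
        for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
            System.out.println(s.getCoveredText());
            for (Token t : JCasUtil.selectCovered(Token.class, s)) {
                System.out.println("  " + t.getCoveredText());
            }
        }
    }
}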
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 27 | */ 28 | 29 | package jgibbslda; 30 | 31 | public class Constants { 32 | public static final long BUFFER_SIZE_LONG = 1000000; 33 | public static final short BUFFER_SIZE_SHORT = 512; 34 | 35 | public static final int MODEL_STATUS_UNKNOWN = 0; 36 | public static final int MODEL_STATUS_EST = 1; 37 | public static final int MODEL_STATUS_ESTC = 2; 38 | public static final int MODEL_STATUS_INF = 3; 39 | } 40 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Conversion.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | 29 | package jgibbslda; 30 | 31 | public class Conversion { 32 | public static String ZeroPad( int number, int width ) 33 | { 34 | StringBuffer result = new StringBuffer(""); 35 | for( int i = 0; i < width-Integer.toString(number).length(); i++ ) 36 | result.append( "0" ); 37 | result.append( Integer.toString(number) ); 38 | 39 | return result.toString(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/consumer/GibbsLdaModelGeneratorConsumer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 22 | */ 23 | 24 | package de.tudarmstadt.langtech.lda.consumer; 25 | 26 | import jgibbslda.Estimator; 27 | import jgibbslda.LDACmdOption; 28 | 29 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 30 | import org.apache.uima.jcas.JCas; 31 | import org.uimafit.component.JCasConsumer_ImplBase; 32 | 33 | public class GibbsLdaModelGeneratorConsumer extends JCasConsumer_ImplBase { 34 | 35 | @Override 36 | public void process(JCas aJCas) 37 | throws AnalysisEngineProcessException { 38 | LDACmdOption options = new LDACmdOption(); 39 | Estimator es = new Estimator(); 40 | es.init(options); 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | de.tudarmstadt.langtech.semantics.segmentation 4 | de.tudarmstadt.langtech.semantics.segmentation.topictiling 5 | 0.0.2 6 | 7 | 8 | de.tudarmstadt.ukp.dkpro.core 9 | 10 | de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl 11 | 12 | 1.4.0 13 | 14 | 15 | args4j 16 | args4j 17 | 2.0.16 18 | 19 | 20 | de.tudarmstadt.ukp.dkpro.core 21 | 22 | de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl 23 | 24 | 1.4.0 25 | 26 | 27 | de.tudarmstadt.ukp.dkpro.core 28 | 29 | de.tudarmstadt.ukp.dkpro.core.io.text-asl 30 | 31 | 1.4.0 32 | 33 | 34 | org.uimafit 35 | uimafit 36 | 1.4.0 37 | 38 | 39 | de.tudarmstadt.ukp.dkpro 40 | de.tudarmstadt.ukp.dkpro.lda 41 | 0.0.1-SNAPSHOT 42 | 43 | 44 | 
-------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDACmdOption.java: -------------------------------------------------------------------------------- 1 | package jgibbslda; 2 | 3 | import org.kohsuke.args4j.*; 4 | 5 | public class LDACmdOption { 6 | 7 | @Option(name="-est", usage="Specify whether we want to estimate model from scratch") 8 | public boolean est = false; 9 | 10 | @Option(name="-estc", usage="Specify whether we want to continue the last estimation") 11 | public boolean estc = false; 12 | 13 | @Option(name="-inf", usage="Specify whether we want to do inference") 14 | public boolean inf = true; 15 | 16 | @Option(name="-dir", usage="Specify directory") 17 | public String dir = ""; 18 | 19 | @Option(name="-dfile", usage="Specify data file") 20 | public String dfile = ""; 21 | 22 | @Option(name="-model", usage="Specify the model name") 23 | public String modelName = ""; 24 | 25 | @Option(name="-alpha", usage="Specify alpha") 26 | public double alpha = -1.0; 27 | 28 | @Option(name="-beta", usage="Specify beta") 29 | public double beta = -1.0; 30 | 31 | @Option(name="-ntopics", usage="Specify the number of topics") 32 | public int K = 100; 33 | 34 | @Option(name="-niters", usage="Specify the number of iterations") 35 | public int niters = 1000; 36 | 37 | @Option(name="-savestep", usage="Specify the number of steps to save the model since the last save") 38 | public int savestep = 100; 39 | 40 | @Option(name="-twords", usage="Specify the number of most likely words to be printed for each topic") 41 | public int twords = 100; 42 | 43 | @Option(name="-withrawdata", usage="Specify whether we include raw data in the input") 44 | public boolean withrawdata = false; 45 | 46 | @Option(name="-wordmap", usage="Specify the wordmap file") 47 | public String wordMapFileName = "wordmap.txt"; 48 | } 49 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Pair.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | 29 | package jgibbslda; 30 | 31 | import java.util.Comparator; 32 | 33 | public class Pair implements Comparable { 34 | public Object first; 35 | public Comparable second; 36 | public static boolean naturalOrder = false; 37 | 38 | public Pair(Object k, Comparable v){ 39 | first = k; 40 | second = v; 41 | } 42 | 43 | public Pair(Object k, Comparable v, boolean naturalOrder){ 44 | first = k; 45 | second = v; 46 | Pair.naturalOrder = naturalOrder; 47 | } 48 | 49 | public int compareTo(Pair p){ 50 | if (naturalOrder) 51 | return this.second.compareTo(p.second); 52 | else return -this.second.compareTo(p.second); 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/resources/desc/type/gibbsldatypes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | gibbsldatypes 4 | 5 | 1.0 6 | 7 | 8 | 9 | de.tudarmstadt.ukp.dkpro.lda.type.Topic 10 | 11 | uima.tcas.Annotation 12 | 13 | 14 | topicId 15 | 16 | uima.cas.Integer 17 | 18 | 19 | topicModeId 20 | 21 | uima.cas.Integer 22 | 23 | 24 | 25 | 26 | de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution 27 | 28 | uima.tcas.Annotation 29 | 30 | 31 | topicDistribution 32 | 33 | uima.cas.DoubleArray 34 | 35 | 36 | 37 | 38 | de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution 39 | 40 | uima.tcas.Annotation 41 | 42 | 43 | topicDistribution 44 | 45 | uima.cas.DoubleArray 46 | true 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */ 4 | package de.tudarmstadt.langtech.semantics.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | 12 | 13 | /** 14 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013 15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml 16 | * @generated */ 17 | public class Segment extends Annotation { 18 | /** @generated 19 | * @ordered 20 | */ 21 | @SuppressWarnings ("hiding") 22 | public final static int typeIndexID = JCasRegistry.register(Segment.class); 23 | /** @generated 24 | * @ordered 25 | */ 26 | @SuppressWarnings ("hiding") 27 | public final static int type = typeIndexID; 28 | /** @generated */ 29 | @Override 30 | public int getTypeIndexID() {return typeIndexID;} 31 | 32 | /** Never called. 
Disable default constructor 33 | * @generated */ 34 | protected Segment() {/* intentionally empty block */} 35 | 36 | /** Internal - constructor used by generator 37 | * @generated */ 38 | public Segment(int addr, TOP_Type type) { 39 | super(addr, type); 40 | readObject(); 41 | } 42 | 43 | /** @generated */ 44 | public Segment(JCas jcas) { 45 | super(jcas); 46 | readObject(); 47 | } 48 | 49 | /** @generated */ 50 | public Segment(JCas jcas, int begin, int end) { 51 | super(jcas); 52 | setBegin(begin); 53 | setEnd(end); 54 | readObject(); 55 | } 56 | 57 | /** 58 | * Write your own initialization here 59 | * 60 | @generated modifiable */ 61 | private void readObject() {/*default - does nothing empty block */} 62 | 63 | } 64 | 65 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.bat: -------------------------------------------------------------------------------- 1 | java -cp dependency/ant-1.8.1.jar:dependency/ant-launcher-1.8.1.jar:dependency/aopalliance-1.0.jar:dependency/args4j-2.0.16.jar:dependency/commons-compress-1.4.1.jar:dependency/commons-io-2.0.1.jar:dependency/commons-lang-2.6.jar:dependency/commons-logging-1.1.0.jboss.jar:dependency/commons-logging-1.1.1.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.coref-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.io-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.metadata-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.ner-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.parameter-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.resources-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.syntax-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.io.text-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp .dkpro.core.stanfordnlp-gpl-1.4.0.jar:dependency/icu4j-4.0.1.jar:dependency/jVinci-2.4.0.jar:dependency/joda-time-2.0.jar:dependency/lda.jar:dependency/org.apache.log4j-1.2.13.v200706111418.jar:dependency/serializer-2.7.1.jar:dependency/spring-aop-3.1.0.RELEASE.jar:dependency/spring-asm-3.1.0.RELEASE.jar:dependency/spring-beans-3.1.0.RELEASE.jar:dependency/spring-context-3.1.0.RELEASE.jar:dependency/spring-core-3.1.0.RELEASE.jar:dependency/spring-expression-3.1.0.RELEASE.jar:dependency/stanford-corenlp-1.3.3.jar:dependency/uimafit-1.4.0.jar:dependency/uimaj-adapter-vinci-2.4.0.jar:dependency/uimaj-core-2.4.0.jar:dependency/uimaj-cpe-2.4.0.jar:dependency/uimaj-document-annotation-2.4.0.jar:dependency/uimaj-tools-2.4.0.jar:dependency/xalan-2.7.1.jar:dependency/xercesImpl-2.9.1.jar:dependency/xml-apis-1.3.03.jar:dependency/xom-1.2.5.jar:dependency/xz-1.0.jar:de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile 2 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/SimpleSegmenter.java: -------------------------------------------------------------------------------- 1 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator; 2 | 3 | import java.text.BreakIterator; 4 | 5 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 6 | import 
org.apache.uima.jcas.JCas; 7 | import org.uimafit.component.JCasAnnotator_ImplBase; 8 | import org.uimafit.descriptor.ConfigurationParameter; 9 | 10 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 11 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 12 | 13 | public class SimpleSegmenter extends JCasAnnotator_ImplBase{ 14 | public static final String PARAM_TOKEN_BOUNDARY="TokenBoundary"; 15 | public static final String PARAM_SENTENCE_BOUNDARY="SentenceBoundary"; 16 | @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY,mandatory=false) 17 | private char sentenceBoundary = '\n'; 18 | @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY,mandatory=false) 19 | private char tokenBoundary = ' '; 20 | 21 | 22 | @Override 23 | public void process(JCas aJCas) throws AnalysisEngineProcessException { 24 | String txt = aJCas.getDocumentText(); 25 | int prevToken = 0; 26 | int prevSentence = 0; 27 | System.out.println(txt); 28 | int i =0; 29 | for (i=0;i0){ 32 | Sentence s = new Sentence(aJCas,prevSentence,i); 33 | s.addToIndexes(); 34 | prevSentence=i+1; 35 | Token t = new Token(aJCas,prevToken,i); 36 | t.addToIndexes(); 37 | prevToken=i+1; 38 | } 39 | if (txt.charAt(i)==tokenBoundary && i-prevToken>0){ 40 | Token t = new Token(aJCas,prevToken,i); 41 | t.addToIndexes(); 42 | prevToken=i+1; 43 | } 44 | 45 | } 46 | if (i-prevSentence>0){ 47 | Sentence s = new Sentence(aJCas,prevSentence,i); 48 | s.addToIndexes(); 49 | Token t = new Token(aJCas,prevToken,i); 50 | t.addToIndexes(); 51 | } 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaDocumentBasedTopicIdAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | package de.tudarmstadt.langtech.lda.annotator; 25 | 26 | import static org.uimafit.util.JCasUtil.select; 27 | import static org.uimafit.util.JCasUtil.selectCovered; 28 | 29 | import java.util.ArrayList; 30 | import java.util.Collection; 31 | import java.util.List; 32 | 33 | import org.apache.uima.jcas.JCas; 34 | 35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 37 | 38 | 39 | 40 | public class GibbsLdaDocumentBasedTopicIdAnnotator 41 | extends GibbsLdaTopicIdAnnotator { 42 | 43 | @Override 44 | public List[] getDocuments(JCas jcas) { 45 | Collection sentences = select(jcas, Sentence.class); 46 | @SuppressWarnings("unchecked") 47 | List[] arr = new ArrayList[1]; 48 | arr[0]= new ArrayList(); 49 | for (Sentence s : sentences) { 50 | for (Token t : selectCovered(Token.class, s)) { 51 | arr[0].add(t.getCoveredText()); 52 | } 53 | } 54 | 55 | return arr; 56 | } 57 | 58 | 59 | } 60 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */ 3 | package de.tudarmstadt.langtech.semantics.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.jcas.tcas.Annotation_Type; 13 | 14 | /** 15 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013 16 | * @generated */ 17 | public class Segment_Type extends Annotation_Type { 18 | /** @generated */ 19 | @Override 20 | protected FSGenerator getFSGenerator() {return fsGenerator;} 21 | /** @generated */ 22 | private final FSGenerator fsGenerator = 23 | new FSGenerator() { 24 | public FeatureStructure createFS(int addr, CASImpl cas) { 25 | if (Segment_Type.this.useExistingInstance) { 26 | // Return eq fs instance if already created 27 | FeatureStructure fs = Segment_Type.this.jcas.getJfsFromCaddr(addr); 28 | if (null == fs) { 29 | fs = new Segment(addr, Segment_Type.this); 30 | Segment_Type.this.jcas.putJfsFromCaddr(addr, fs); 31 | return fs; 32 | } 33 | return fs; 34 | } else return new Segment(addr, Segment_Type.this); 35 | } 36 | }; 37 | /** @generated */ 38 | @SuppressWarnings ("hiding") 39 | public final static int typeIndexID = Segment.typeIndexID; 40 | /** @generated 41 | @modifiable */ 42 | @SuppressWarnings ("hiding") 43 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.Segment"); 44 | 45 | 46 | 47 | /** initialize variables to correspond with Cas Type and Features 48 | * @generated */ 49 | public Segment_Type(JCas jcas, Type casType) { 50 | super(jcas, casType); 51 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 52 | 53 | } 54 | } 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaSentenceBasedTopicIdAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin 
Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 22 | */ 23 | 24 | package de.tudarmstadt.langtech.lda.annotator; 25 | 26 | import static org.uimafit.util.JCasUtil.select; 27 | import static org.uimafit.util.JCasUtil.selectCovered; 28 | 29 | import java.util.ArrayList; 30 | import java.util.Collection; 31 | import java.util.List; 32 | 33 | import org.apache.uima.jcas.JCas; 34 | 35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 37 | 38 | public class GibbsLdaSentenceBasedTopicIdAnnotator extends 39 | GibbsLdaTopicIdAnnotator { 40 | 41 | 42 | public List[] getDocuments(JCas jcas) { 43 | Collection sentences = select(jcas, Sentence.class); 44 | @SuppressWarnings("unchecked") 45 | List[] arr = new ArrayList[sentences.size()]; 46 | int i = 0; 47 | for (Sentence s : select(jcas, Sentence.class)) { 48 | System.out.println(s.getCoveredText()); 49 | } 50 | for (Sentence s : sentences) { 51 | StringBuffer line = new StringBuffer(); 52 | arr[i] = new ArrayList(); 53 | for (Token t : selectCovered(Token.class, s)) { 54 | line.append(t.getCoveredText()); 55 | line.append(" "); 56 | arr[i].add(t.getCoveredText()); 57 | } 58 | i++; 59 | } 60 | 61 | return arr; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDA.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | 29 | package jgibbslda; 30 | 31 | import org.kohsuke.args4j.*; 32 | 33 | public class LDA { 34 | 35 | public static void main(String args[]){ 36 | LDACmdOption option = new LDACmdOption(); 37 | CmdLineParser parser = new CmdLineParser(option); 38 | 39 | try { 40 | if (args.length == 0){ 41 | showHelp(parser); 42 | return; 43 | } 44 | 45 | parser.parseArgument(args); 46 | 47 | if (option.est || option.estc){ 48 | Estimator estimator = new Estimator(); 49 | estimator.init(option); 50 | estimator.estimate(); 51 | } 52 | else if (option.inf){ 53 | Inferencer inferencer = new Inferencer(); 54 | inferencer.init(option); 55 | 56 | Model newModel = inferencer.inference(); 57 | 58 | for (int i = 0; i < newModel.phi.length; ++i){ 59 | //phi: K * V 60 | System.out.println("-----------------------\ntopic" + i + " : "); 61 | for (int j = 0; j < 10; ++j){ 62 | System.out.println(inferencer.globalDict.id2word.get(j) + "\t" + newModel.phi[i][j]); 63 | } 64 | } 65 | } 66 | } 67 | catch (CmdLineException cle){ 68 | System.out.println("Command line error: " + cle.getMessage()); 69 | showHelp(parser); 70 | return; 71 | } 72 | catch (Exception e){ 73 | System.out.println("Error in main: " + e.getMessage()); 74 | e.printStackTrace(); 75 | return; 76 | } 77 | } 78 | 79 | public static void showHelp(CmdLineParser parser){ 80 | System.out.println("LDA [options ...] [arguments...]"); 81 | parser.printUsage(System.out); 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Document.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | 29 | package jgibbslda; 30 | 31 | import java.util.Vector; 32 | 33 | public class Document { 34 | 35 | //---------------------------------------------------- 36 | //Instance Variables 37 | //---------------------------------------------------- 38 | public int [] words; 39 | public String rawStr; 40 | public int length; 41 | 42 | //---------------------------------------------------- 43 | //Constructors 44 | //---------------------------------------------------- 45 | public Document(){ 46 | words = null; 47 | rawStr = ""; 48 | length = 0; 49 | } 50 | 51 | public Document(int length){ 52 | this.length = length; 53 | rawStr = ""; 54 | words = new int[length]; 55 | } 56 | 57 | public Document(int length, int [] words){ 58 | this.length = length; 59 | rawStr = ""; 60 | 61 | this.words = new int[length]; 62 | for (int i =0 ; i < length; ++i){ 63 | this.words[i] = words[i]; 64 | } 65 | } 66 | 67 | public Document(int length, int [] words, String rawStr){ 68 | this.length = length; 69 | this.rawStr = rawStr; 70 | 71 | this.words = new int[length]; 72 | for (int i =0 ; i < length; ++i){ 73 | this.words[i] = words[i]; 74 | } 75 | } 76 | 77 | public Document(Vector doc){ 78 | this.length = doc.size(); 79 | rawStr = ""; 80 | this.words = new int[length]; 81 | for (int i = 0; i < length; i++){ 82 | this.words[i] = doc.get(i); 83 | } 84 | } 85 | 86 | public Document(Vector doc, String rawStr){ 87 | this.length = doc.size(); 88 | this.rawStr = rawStr; 89 | this.words = new int[length]; 90 | for (int i = 0; i < length; ++i){ 91 | this.words[i] = doc.get(i); 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */ 4 | package de.tudarmstadt.langtech.semantics.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | 12 | 13 | /** Saves the number of segments a document should consist of according to a given gold-standard. 14 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013 15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml 16 | * @generated */ 17 | public class SegmentQuantity extends Annotation { 18 | /** @generated 19 | * @ordered 20 | */ 21 | @SuppressWarnings ("hiding") 22 | public final static int typeIndexID = JCasRegistry.register(SegmentQuantity.class); 23 | /** @generated 24 | * @ordered 25 | */ 26 | @SuppressWarnings ("hiding") 27 | public final static int type = typeIndexID; 28 | /** @generated */ 29 | @Override 30 | public int getTypeIndexID() {return typeIndexID;} 31 | 32 | /** Never called. 
Disable default constructor 33 | * @generated */ 34 | protected SegmentQuantity() {/* intentionally empty block */} 35 | 36 | /** Internal - constructor used by generator 37 | * @generated */ 38 | public SegmentQuantity(int addr, TOP_Type type) { 39 | super(addr, type); 40 | readObject(); 41 | } 42 | 43 | /** @generated */ 44 | public SegmentQuantity(JCas jcas) { 45 | super(jcas); 46 | readObject(); 47 | } 48 | 49 | /** @generated */ 50 | public SegmentQuantity(JCas jcas, int begin, int end) { 51 | super(jcas); 52 | setBegin(begin); 53 | setEnd(end); 54 | readObject(); 55 | } 56 | 57 | /** 58 | * Write your own initialization here 59 | * 60 | @generated modifiable */ 61 | private void readObject() {/*default - does nothing empty block */} 62 | 63 | 64 | 65 | //*--------------* 66 | //* Feature: segmentCount 67 | 68 | /** getter for segmentCount - gets 69 | * @generated */ 70 | public int getSegmentCount() { 71 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null) 72 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity"); 73 | return jcasType.ll_cas.ll_getIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount);} 74 | 75 | /** setter for segmentCount - sets 76 | * @generated */ 77 | public void setSegmentCount(int v) { 78 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null) 79 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity"); 80 | jcasType.ll_cas.ll_setIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount, v);} 81 | } 82 | 83 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */ 4 | package de.tudarmstadt.langtech.lda.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | 12 | 13 | /** 14 | * Updated by JCasGen Thu Apr 12 12:36:02 CEST 2012 15 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml 16 | * @generated */ 17 | public class Topic extends Annotation { 18 | /** @generated 19 | * @ordered 20 | */ 21 | public final static int typeIndexID = JCasRegistry.register(Topic.class); 22 | /** @generated 23 | * @ordered 24 | */ 25 | public final static int type = typeIndexID; 26 | /** @generated */ 27 | public int getTypeIndexID() {return typeIndexID;} 28 | 29 | /** Never called. 
Disable default constructor 30 | * @generated */ 31 | protected Topic() {} 32 | 33 | /** Internal - constructor used by generator 34 | * @generated */ 35 | public Topic(int addr, TOP_Type type) { 36 | super(addr, type); 37 | readObject(); 38 | } 39 | 40 | /** @generated */ 41 | public Topic(JCas jcas) { 42 | super(jcas); 43 | readObject(); 44 | } 45 | 46 | /** @generated */ 47 | public Topic(JCas jcas, int begin, int end) { 48 | super(jcas); 49 | setBegin(begin); 50 | setEnd(end); 51 | readObject(); 52 | } 53 | 54 | /** 55 | * Write your own initialization here 56 | * 57 | @generated modifiable */ 58 | private void readObject() {} 59 | 60 | //*--------------* 61 | //* Feature: topicId 62 | 63 | /** getter for topicId - gets 64 | * @generated */ 65 | public int getTopicId() { 66 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null) 67 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 68 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId);} 69 | 70 | /** setter for topicId - sets 71 | * @generated */ 72 | public void setTopicId(int v) { 73 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null) 74 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 75 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId, v);} 76 | 77 | 78 | //*--------------* 79 | //* Feature: topicModeId 80 | 81 | /** getter for topicModeId - gets 82 | * @generated */ 83 | public int getTopicModeId() { 84 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null) 85 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 86 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId);} 87 | 88 | /** setter for topicModeId - sets 89 | * @generated */ 90 | public void setTopicModeId(int v) { 91 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null) 92 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 93 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId, v);} 94 | } 95 | 96 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */ 3 | package de.tudarmstadt.langtech.semantics.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** Saves the number of segments a document should consist of according to a given gold-standard. 
17 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013 18 | * @generated */ 19 | public class SegmentQuantity_Type extends Annotation_Type { 20 | /** @generated */ 21 | @Override 22 | protected FSGenerator getFSGenerator() {return fsGenerator;} 23 | /** @generated */ 24 | private final FSGenerator fsGenerator = 25 | new FSGenerator() { 26 | public FeatureStructure createFS(int addr, CASImpl cas) { 27 | if (SegmentQuantity_Type.this.useExistingInstance) { 28 | // Return eq fs instance if already created 29 | FeatureStructure fs = SegmentQuantity_Type.this.jcas.getJfsFromCaddr(addr); 30 | if (null == fs) { 31 | fs = new SegmentQuantity(addr, SegmentQuantity_Type.this); 32 | SegmentQuantity_Type.this.jcas.putJfsFromCaddr(addr, fs); 33 | return fs; 34 | } 35 | return fs; 36 | } else return new SegmentQuantity(addr, SegmentQuantity_Type.this); 37 | } 38 | }; 39 | /** @generated */ 40 | @SuppressWarnings ("hiding") 41 | public final static int typeIndexID = SegmentQuantity.typeIndexID; 42 | /** @generated 43 | @modifiable */ 44 | @SuppressWarnings ("hiding") 45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity"); 46 | 47 | /** @generated */ 48 | final Feature casFeat_segmentCount; 49 | /** @generated */ 50 | final int casFeatCode_segmentCount; 51 | /** @generated */ 52 | public int getSegmentCount(int addr) { 53 | if (featOkTst && casFeat_segmentCount == null) 54 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity"); 55 | return ll_cas.ll_getIntValue(addr, casFeatCode_segmentCount); 56 | } 57 | /** @generated */ 58 | public void setSegmentCount(int addr, int v) { 59 | if (featOkTst && casFeat_segmentCount == null) 60 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity"); 61 | ll_cas.ll_setIntValue(addr, casFeatCode_segmentCount, v);} 62 | 63 | 64 | 65 | 66 | 67 | /** initialize variables to correspond with Cas Type and Features 68 | * @generated */ 69 | public SegmentQuantity_Type(JCas jcas, Type casType) { 70 | super(jcas, casType); 71 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 72 | 73 | 74 | casFeat_segmentCount = jcas.getRequiredFeatureDE(casType, "segmentCount", "uima.cas.Integer", featOkTst); 75 | casFeatCode_segmentCount = (null == casFeat_segmentCount) ? 
JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_segmentCount).getCode(); 76 | 77 | } 78 | } 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/test/java/de/tudarmstadt/langtech/lda/TestLdaTopicModelAnnotator.java: -------------------------------------------------------------------------------- 1 | package de.tudarmstadt.langtech.lda; 2 | 3 | import static org.uimafit.factory.AnalysisEngineFactory.createPrimitive; 4 | 5 | import java.io.IOException; 6 | import java.text.BreakIterator; 7 | 8 | import org.apache.uima.UIMAException; 9 | import org.apache.uima.analysis_engine.AnalysisEngine; 10 | import org.apache.uima.jcas.JCas; 11 | import org.uimafit.component.xwriter.CASDumpWriter; 12 | import org.uimafit.factory.AnalysisEngineFactory; 13 | import org.uimafit.factory.JCasFactory; 14 | import org.uimafit.pipeline.SimplePipeline; 15 | 16 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaDocumentBasedTopicIdAnnotator; 17 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaSentenceBasedTopicIdAnnotator; 18 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaTopicIdAnnotator; 19 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; 20 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 21 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 22 | 23 | public class TestLdaTopicModelAnnotator { 24 | public static void main(String[] args) throws UIMAException, IOException { 25 | JCas jcas = getJCas(); 26 | 27 | //sentence wise 28 | AnalysisEngine ae = AnalysisEngineFactory.createPrimitive(GibbsLdaSentenceBasedTopicIdAnnotator.class, 29 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final", 30 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model", 31 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true, 32 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true, 33 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100 34 | ); 35 | 36 | //document wise 37 | AnalysisEngine ae2 = AnalysisEngineFactory.createPrimitive(GibbsLdaDocumentBasedTopicIdAnnotator.class, 38 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final", 39 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model", 40 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true, 41 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true, 42 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100 43 | ); 44 | 45 | 46 | AnalysisEngine out = createPrimitive(CASDumpWriter.class); 47 | SimplePipeline.runPipeline(jcas, ae,out); 48 | } 49 | 50 | private static JCas getJCas() throws UIMAException { 51 | JCas jcas = JCasFactory.createJCas(); 52 | jcas.setDocumentLanguage("en"); 53 | String text = "This is some example document. 
And there is more text"; 54 | jcas.setDocumentText(text); 55 | DocumentMetaData metaData = new DocumentMetaData(jcas); 56 | metaData.setDocumentTitle("Titel"); 57 | metaData.addToIndexes(); 58 | BreakIterator boundary = BreakIterator.getWordInstance(); 59 | 60 | 61 | // print each sentence in reverse order 62 | boundary.setText(text); 63 | int start = boundary.first(); 64 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { 65 | Token t = new Token(jcas, start, end); 66 | t.addToIndexes(); 67 | } 68 | boundary = BreakIterator.getSentenceInstance(); 69 | boundary.setText(text); 70 | 71 | start = boundary.first(); 72 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { 73 | Sentence t = new Sentence(jcas, start, end); 74 | t.addToIndexes(); 75 | } 76 | return jcas; 77 | 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */ 4 | package de.tudarmstadt.langtech.ml.lda.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | 12 | 13 | /** 14 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013 15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml 16 | * @generated */ 17 | public class GibbsLdaTopic extends Annotation { 18 | /** @generated 19 | * @ordered 20 | */ 21 | @SuppressWarnings ("hiding") 22 | public final static int typeIndexID = JCasRegistry.register(GibbsLdaTopic.class); 23 | /** @generated 24 | * @ordered 25 | */ 26 | @SuppressWarnings ("hiding") 27 | public final static int type = typeIndexID; 28 | /** @generated */ 29 | @Override 30 | public int getTypeIndexID() {return typeIndexID;} 31 | 32 | /** Never called. 
Disable default constructor 33 | * @generated */ 34 | protected GibbsLdaTopic() {/* intentionally empty block */} 35 | 36 | /** Internal - constructor used by generator 37 | * @generated */ 38 | public GibbsLdaTopic(int addr, TOP_Type type) { 39 | super(addr, type); 40 | readObject(); 41 | } 42 | 43 | /** @generated */ 44 | public GibbsLdaTopic(JCas jcas) { 45 | super(jcas); 46 | readObject(); 47 | } 48 | 49 | /** @generated */ 50 | public GibbsLdaTopic(JCas jcas, int begin, int end) { 51 | super(jcas); 52 | setBegin(begin); 53 | setEnd(end); 54 | readObject(); 55 | } 56 | 57 | /** 58 | * Write your own initialization here 59 | * 60 | @generated modifiable */ 61 | private void readObject() {/*default - does nothing empty block */} 62 | 63 | 64 | 65 | //*--------------* 66 | //* Feature: topic 67 | 68 | /** getter for topic - gets 69 | * @generated */ 70 | public int getTopic() { 71 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null) 72 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 73 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic);} 74 | 75 | /** setter for topic - sets 76 | * @generated */ 77 | public void setTopic(int v) { 78 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null) 79 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 80 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic, v);} 81 | 82 | 83 | //*--------------* 84 | //* Feature: termId 85 | 86 | /** getter for termId - gets 87 | * @generated */ 88 | public int getTermId() { 89 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null) 90 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 91 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId);} 92 | 93 | /** setter for termId - sets 94 | * @generated */ 95 | public void setTermId(int v) { 96 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null) 97 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 98 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId, v);} 99 | } 100 | 101 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */ 4 | package de.tudarmstadt.langtech.semantics.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | 12 | 13 | /** 14 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015 15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml 16 | * @generated */ 17 | public class SegmentScore extends Annotation { 18 | /** @generated 19 | * @ordered 20 | */ 21 | @SuppressWarnings ("hiding") 22 | public final static int typeIndexID = JCasRegistry.register(SegmentScore.class); 23 | /** @generated 24 | * @ordered 25 | 
*/ 26 | @SuppressWarnings ("hiding") 27 | public final static int type = typeIndexID; 28 | /** @generated */ 29 | @Override 30 | public int getTypeIndexID() {return typeIndexID;} 31 | 32 | /** Never called. Disable default constructor 33 | * @generated */ 34 | protected SegmentScore() {/* intentionally empty block */} 35 | 36 | /** Internal - constructor used by generator 37 | * @generated */ 38 | public SegmentScore(int addr, TOP_Type type) { 39 | super(addr, type); 40 | readObject(); 41 | } 42 | 43 | /** @generated */ 44 | public SegmentScore(JCas jcas) { 45 | super(jcas); 46 | readObject(); 47 | } 48 | 49 | /** @generated */ 50 | public SegmentScore(JCas jcas, int begin, int end) { 51 | super(jcas); 52 | setBegin(begin); 53 | setEnd(end); 54 | readObject(); 55 | } 56 | 57 | /** 58 | * Write your own initialization here 59 | * 60 | @generated modifiable */ 61 | private void readObject() {/*default - does nothing empty block */} 62 | 63 | 64 | 65 | //*--------------* 66 | //* Feature: score 67 | 68 | /** getter for score - gets 69 | * @generated */ 70 | public double getScore() { 71 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null) 72 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 73 | return jcasType.ll_cas.ll_getDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score);} 74 | 75 | /** setter for score - sets 76 | * @generated */ 77 | public void setScore(double v) { 78 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null) 79 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 80 | jcasType.ll_cas.ll_setDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score, v);} 81 | 82 | 83 | //*--------------* 84 | //* Feature: similarityScores 85 | 86 | /** getter for similarityScores - gets 87 | * @generated */ 88 | public String getSimilarityScores() { 89 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null) 90 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 91 | return jcasType.ll_cas.ll_getStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores);} 92 | 93 | /** setter for similarityScores - sets 94 | * @generated */ 95 | public void setSimilarityScores(String v) { 96 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null) 97 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 98 | jcasType.ll_cas.ll_setStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores, v);} 99 | } 100 | 101 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/OutputSegments.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 
14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 22 | */ 23 | 24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator; 25 | 26 | import java.io.FileNotFoundException; 27 | import java.io.PrintStream; 28 | import java.util.Collection; 29 | 30 | import org.apache.commons.lang.StringEscapeUtils; 31 | import org.apache.uima.UimaContext; 32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 33 | import org.apache.uima.jcas.JCas; 34 | import org.apache.uima.resource.ResourceInitializationException; 35 | import org.uimafit.component.JCasAnnotator_ImplBase; 36 | import org.uimafit.descriptor.ConfigurationParameter; 37 | import org.uimafit.util.JCasUtil; 38 | 39 | import de.tudarmstadt.langtech.semantics.type.SegmentScore; 40 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; 41 | 42 | public class OutputSegments extends JCasAnnotator_ImplBase { 43 | public static final String PARAM_OUTPUT = "Output"; 44 | @ConfigurationParameter(name = PARAM_OUTPUT, mandatory = false) 45 | private String output; 46 | private PrintStream ps; 47 | @Override 48 | public void initialize(UimaContext context) 49 | throws ResourceInitializationException { 50 | super.initialize(context); 51 | if(output==null){ 52 | ps = System.out; 53 | }else{ 54 | try { 55 | ps = new PrintStream(output); 56 | } catch (FileNotFoundException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | ps.println(""); 61 | } 62 | 63 | @Override 64 | public void process(JCas aJCas) throws AnalysisEngineProcessException { 65 | ps.println(""); 66 | ps.println(""+DocumentMetaData.get(aJCas).getDocumentTitle()+""); 67 | ps.println(""); 68 | Collection ss = JCasUtil.select(aJCas, SegmentScore.class); 69 | int i = 0; 70 | for (SegmentScore s : ss) { 71 | if(i==0){ 72 | if(s.getBegin()!=0){ 73 | ps.println(""); 74 | ps.println(""); 75 | ps.println(""); 76 | ps.println(StringEscapeUtils.escapeXml(aJCas.getDocumentText().substring(0,s.getBegin()))); 77 | ps.println(""); 78 | ps.println(""); 79 | } 80 | } 81 | ps.println(""); 82 | // ps.println(""+s.getSimilarityScores()+""); 83 | ps.println(""+s.getScore()+""); 84 | ps.println(""); 85 | ps.println(StringEscapeUtils.escapeXml(s.getCoveredText())); 86 | ps.println(""); 87 | ps.println(""); 88 | i+=1; 89 | } 90 | ps.println(""); 91 | ps.println(""); 92 | } 93 | @Override 94 | public void collectionProcessComplete() 95 | throws AnalysisEngineProcessException { 96 | ps.println(""); 97 | ps.close(); 98 | super.collectionProcessComplete(); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */ 3 | package de.tudarmstadt.langtech.lda.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import 
org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** 17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012 18 | * @generated */ 19 | public class Topic_Type extends Annotation_Type { 20 | /** @generated */ 21 | protected FSGenerator getFSGenerator() {return fsGenerator;} 22 | /** @generated */ 23 | private final FSGenerator fsGenerator = 24 | new FSGenerator() { 25 | public FeatureStructure createFS(int addr, CASImpl cas) { 26 | if (Topic_Type.this.useExistingInstance) { 27 | // Return eq fs instance if already created 28 | FeatureStructure fs = Topic_Type.this.jcas.getJfsFromCaddr(addr); 29 | if (null == fs) { 30 | fs = new Topic(addr, Topic_Type.this); 31 | Topic_Type.this.jcas.putJfsFromCaddr(addr, fs); 32 | return fs; 33 | } 34 | return fs; 35 | } else return new Topic(addr, Topic_Type.this); 36 | } 37 | }; 38 | /** @generated */ 39 | public final static int typeIndexID = Topic.typeIndexID; 40 | /** @generated 41 | @modifiable */ 42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 43 | 44 | 45 | 46 | /** @generated */ 47 | final Feature casFeat_topicId; 48 | /** @generated */ 49 | final int casFeatCode_topicId; 50 | /** @generated */ 51 | public int getTopicId(int addr) { 52 | if (featOkTst && casFeat_topicId == null) 53 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 54 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicId); 55 | } 56 | /** @generated */ 57 | public void setTopicId(int addr, int v) { 58 | if (featOkTst && casFeat_topicId == null) 59 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 60 | ll_cas.ll_setIntValue(addr, casFeatCode_topicId, v);} 61 | 62 | 63 | 64 | /** @generated */ 65 | final Feature casFeat_topicModeId; 66 | /** @generated */ 67 | final int casFeatCode_topicModeId; 68 | /** @generated */ 69 | public int getTopicModeId(int addr) { 70 | if (featOkTst && casFeat_topicModeId == null) 71 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 72 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicModeId); 73 | } 74 | /** @generated */ 75 | public void setTopicModeId(int addr, int v) { 76 | if (featOkTst && casFeat_topicModeId == null) 77 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic"); 78 | ll_cas.ll_setIntValue(addr, casFeatCode_topicModeId, v);} 79 | 80 | 81 | 82 | 83 | 84 | /** initialize variables to correspond with Cas Type and Features 85 | * @generated */ 86 | public Topic_Type(JCas jcas, Type casType) { 87 | super(jcas, casType); 88 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 89 | 90 | 91 | casFeat_topicId = jcas.getRequiredFeatureDE(casType, "topicId", "uima.cas.Integer", featOkTst); 92 | casFeatCode_topicId = (null == casFeat_topicId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicId).getCode(); 93 | 94 | 95 | casFeat_topicModeId = jcas.getRequiredFeatureDE(casType, "topicModeId", "uima.cas.Integer", featOkTst); 96 | casFeatCode_topicModeId = (null == casFeat_topicModeId) ? 
JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicModeId).getCode(); 97 | 98 | } 99 | } 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */ 3 | package de.tudarmstadt.langtech.ml.lda.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** 17 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013 18 | * @generated */ 19 | public class GibbsLdaTopic_Type extends Annotation_Type { 20 | /** @generated */ 21 | @Override 22 | protected FSGenerator getFSGenerator() {return fsGenerator;} 23 | /** @generated */ 24 | private final FSGenerator fsGenerator = 25 | new FSGenerator() { 26 | public FeatureStructure createFS(int addr, CASImpl cas) { 27 | if (GibbsLdaTopic_Type.this.useExistingInstance) { 28 | // Return eq fs instance if already created 29 | FeatureStructure fs = GibbsLdaTopic_Type.this.jcas.getJfsFromCaddr(addr); 30 | if (null == fs) { 31 | fs = new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this); 32 | GibbsLdaTopic_Type.this.jcas.putJfsFromCaddr(addr, fs); 33 | return fs; 34 | } 35 | return fs; 36 | } else return new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this); 37 | } 38 | }; 39 | /** @generated */ 40 | @SuppressWarnings ("hiding") 41 | public final static int typeIndexID = GibbsLdaTopic.typeIndexID; 42 | /** @generated 43 | @modifiable */ 44 | @SuppressWarnings ("hiding") 45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 46 | 47 | /** @generated */ 48 | final Feature casFeat_topic; 49 | /** @generated */ 50 | final int casFeatCode_topic; 51 | /** @generated */ 52 | public int getTopic(int addr) { 53 | if (featOkTst && casFeat_topic == null) 54 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 55 | return ll_cas.ll_getIntValue(addr, casFeatCode_topic); 56 | } 57 | /** @generated */ 58 | public void setTopic(int addr, int v) { 59 | if (featOkTst && casFeat_topic == null) 60 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 61 | ll_cas.ll_setIntValue(addr, casFeatCode_topic, v);} 62 | 63 | 64 | 65 | /** @generated */ 66 | final Feature casFeat_termId; 67 | /** @generated */ 68 | final int casFeatCode_termId; 69 | /** @generated */ 70 | public int getTermId(int addr) { 71 | if (featOkTst && casFeat_termId == null) 72 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 73 | return ll_cas.ll_getIntValue(addr, casFeatCode_termId); 74 | } 75 | /** @generated */ 76 | public void setTermId(int addr, int v) { 77 | if (featOkTst && casFeat_termId == null) 78 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic"); 79 | ll_cas.ll_setIntValue(addr, casFeatCode_termId, v);} 80 | 81 | 82 | 83 | 84 | 85 | /** initialize variables to 
correspond with Cas Type and Features 86 | * @generated */ 87 | public GibbsLdaTopic_Type(JCas jcas, Type casType) { 88 | super(jcas, casType); 89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 90 | 91 | 92 | casFeat_topic = jcas.getRequiredFeatureDE(casType, "topic", "uima.cas.Integer", featOkTst); 93 | casFeatCode_topic = (null == casFeat_topic) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topic).getCode(); 94 | 95 | 96 | casFeat_termId = jcas.getRequiredFeatureDE(casType, "termId", "uima.cas.Integer", featOkTst); 97 | casFeatCode_termId = (null == casFeat_termId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_termId).getCode(); 98 | 99 | } 100 | } 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicModelAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | 25 | package de.tudarmstadt.langtech.lda.annotator; 26 | 27 | import java.util.ArrayList; 28 | import java.util.List; 29 | 30 | import jgibbslda.Dictionary; 31 | import jgibbslda.Inferencer; 32 | import jgibbslda.LDACmdOption; 33 | import jgibbslda.Model; 34 | 35 | import org.apache.uima.UimaContext; 36 | import org.apache.uima.resource.ResourceInitializationException; 37 | import org.uimafit.component.JCasAnnotator_ImplBase; 38 | import org.uimafit.descriptor.ConfigurationParameter; 39 | 40 | /** 41 | * @author Martin Riedl 42 | */ 43 | public abstract class GibbsLdaTopicModelAnnotator extends JCasAnnotator_ImplBase{ 44 | public static final String PARAM_LDA_MODEL_DIR = "LdaModelDir"; 45 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName"; 46 | public static final String PARAM_LDA_INFERENCE_ITERATIONS = "LdaInferenceIterations"; 47 | public static final String PARAM_LDA_INFERENCE_SAVE_PATH = "LdaInferenceSavePath"; 48 | 49 | 50 | 51 | @ConfigurationParameter(name = PARAM_LDA_INFERENCE_SAVE_PATH, mandatory = false) 52 | private String ldaInferenceSavePath; 53 | private String ldaInferenceSaveName; 54 | 55 | public String getLdaInferenceSaveName() { 56 | return ldaInferenceSaveName; 57 | } 58 | public void setLdaInferenceSaveName(String ldaInferenceSaveName) { 59 | this.ldaInferenceSaveName = ldaInferenceSaveName; 60 | } 61 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIR, mandatory = true) 62 | private String ldaModelDir; 63 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true) 64 | private String ldaModelName; 65 | @ConfigurationParameter(name = PARAM_LDA_INFERENCE_ITERATIONS, mandatory = false, description = "Inference iterations used to built topic distribution for new model", defaultValue = "100") 66 | private int ldaInferenceIteration; 67 | 68 | private Inferencer inferencer; 69 | 70 | // public Model inference(String[] documents) { 71 | // Model m = inferencer.inference(documents); 72 | // if(ldaInferenceSavePath!=null){ 73 | // m.dir = ldaInferenceSavePath; 74 | // m.saveModel("inference_"+ldaInferenceSaveName); 75 | // } 76 | // return m; 77 | // } 78 | 79 | public Model inference(List[] documents) { 80 | Model m = inferencer.inference(documents); 81 | if(ldaInferenceSavePath!=null){ 82 | m.dir = ldaInferenceSavePath; 83 | m.saveModel("inference_"+ldaInferenceSaveName); 84 | } 85 | return m; 86 | } 87 | public int getInferenceNiters() { 88 | return inferencer.niters; 89 | } 90 | 91 | public ArrayList getInferenceModeValues() { 92 | return inferencer.values; 93 | } 94 | 95 | public Dictionary getInferencerGlobalDict(){ 96 | return inferencer.globalDict; 97 | } 98 | 99 | 100 | @Override 101 | public void initialize(UimaContext context) 102 | throws ResourceInitializationException { 103 | super.initialize(context); 104 | LDACmdOption options = new LDACmdOption(); 105 | options.dir = ldaModelDir; 106 | options.modelName = ldaModelName; 107 | options.niters = ldaInferenceIteration; 108 | //Initiliaze inferencer 109 | inferencer = new Inferencer(); 110 | inferencer.init(options); 111 | } 112 | 113 | 114 | 115 | } 116 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/README.txt: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------- 2 | | TopicTiling | 3 | ---------------------------------------------------- 4 | 5 | Topic Tiling is a LDA based Text Segmentation 
algorithm. 6 | This algorithm is based on the well-known TextTiling 7 | algorithm, and segments documents using the Latent 8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs 9 | the segmentation in linear time and is thus computationally 10 | less expensive than other LDA-based segmentation methods. 11 | 12 | USE: 13 | 14 | The tool has been developed and tested on Unix-based systems. 15 | As TopicTiling is written in Java, it should also run on Windows 16 | machines. To execute TopicTiling, uncompress the zip file and run 17 | topictiling.sh (Unix-based systems) or 18 | topictiling.bat (Windows-based systems). The output is given in 19 | an XML format with suggested topical boundaries. 20 | 21 | HINT FOR NON-LATIN LANGUAGES: 22 | If you want to process non-Latin languages (e.g. Chinese or Arabic) with TopicTiling, 23 | you have to provide tokenized text (both for TopicTiling and GibbsLDA) 24 | and in addition use the flag -s, which disables the Stanford tokenization 25 | and instead uses a simple whitespace tokenizer that expects one sentence 26 | per line. 27 | 28 | 29 | The parameters of the script are shown when no parameters are given: 30 | 31 | [java] Option "-fd" is required 32 | [java] java -jar myprogram.jar [options...] arguments... 33 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used 34 | [java] (default false) 35 | [java] -fd VAL : Directory of the test files 36 | [java] -fp VAL : File pattern for the test files 37 | [java] -i N : Number of inference iterations used to annotate words with topic 38 | [java] IDs (default 100) 39 | [java] -m : Use mode counting (true/false) (default=true) 40 | [java] -out VAL : File the content is written to (otherwise stdout will be used) 41 | [java] -ri N : Use the repeated inference method 42 | [java] -rs N : Use the repeated segmentation 43 | [java] -s : Use simple segmentation (default=false) 44 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used) 45 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used) 46 | [java] -w N : Window size used to calculate the sentence similarity 47 | 48 | The parameters -fp, -fd, -tmd and -tmn are the ones that have to be specified, 49 | and -ri should be set to about 5 repeated inferences. 50 |
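An example call could look like the following sketch; the directory names, the file pattern and the output file are placeholders and not part of the distribution, and model-final is the name under which (J)GibbsLDA saves the final estimated model:

    ./topictiling.sh -tmd /path/to/lda_model -tmn model-final -fd /path/to/test_documents -fp "*.txt" -ri 5 -out segments.xml

Here -tmd/-tmn point to the trained topic model, -fd/-fp select the documents to be segmented, -ri 5 uses five repeated inferences and -out writes the resulting XML to a file instead of stdout.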
51 | For the algorithm it is important to have a trained LDA model. The model should 52 | be trained on a domain similar to the data you apply the algorithm to. You have to train it 53 | yourself using GibbsLDA++ or JGibbsLDA (http://gibbslda.sourceforge.net/). They 54 | both have the same output format. The output of the algorithm is given in XML 55 | and looks like: 56 | 57 | 58 | 59 | 60 | score 61 | 62 | 63 | … 64 | 65 | 66 | 67 | The code returns all possible boundary positions (all maxima). If the number of 68 | segments is known, select the N highest depthScore values as boundary positions. 69 | 70 | 71 | LICENSE: 72 | 73 | The software is released under GPL 3.0 74 | 75 | PAPERS: 76 | 77 | 78 | Riedl, M., Biemann, C. (2012): Text Segmentation with Topic Models. Journal for Language Technology and Computational Linguistics (JLCL), Vol. 27, No. 1, pp. 47--70, August 2012 (pdf) 79 | Riedl, M., Biemann, C. (2012): How Text Segmentation Algorithms Gain from Topic Models, Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2012), Montreal, Canada. (pdf) 80 | Riedl, M., Biemann, C. (2012): TopicTiling: A Text Segmentation Algorithm based on LDA, Proceedings of the Student Research Workshop of the 50th Meeting of the Association for Computational Linguistics, Jeju, Republic of Korea. (pdf) 81 | Riedl, M., Biemann, C. (2012): Sweeping through the Topic Space: Bad luck? Roll again! In Proceedings of the Joint Workshop on Unsupervised and Semi-Supervised Learning in NLP held in conjunction with EACL 2012, Avignon, France (pdf) 82 | 83 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Wed Apr 11 15:17:37 CEST 2012 */ 4 | package de.tudarmstadt.langtech.lda.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | import org.apache.uima.jcas.cas.DoubleArray; 12 | 13 | 14 | /** 15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012 16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml 17 | * @generated */ 18 | public class TopicDistribution extends Annotation { 19 | /** @generated 20 | * @ordered 21 | */ 22 | public final static int typeIndexID = JCasRegistry.register(TopicDistribution.class); 23 | /** @generated 24 | * @ordered 25 | */ 26 | public final static int type = typeIndexID; 27 | /** @generated */ 28 | public int getTypeIndexID() {return typeIndexID;} 29 | 30 | /** Never called. Disable default constructor 31 | * @generated */ 32 | protected TopicDistribution() {} 33 | 34 | /** Internal - constructor used by generator 35 | * @generated */ 36 | public TopicDistribution(int addr, TOP_Type type) { 37 | super(addr, type); 38 | readObject(); 39 | } 40 | 41 | /** @generated */ 42 | public TopicDistribution(JCas jcas) { 43 | super(jcas); 44 | readObject(); 45 | } 46 | 47 | /** @generated */ 48 | public TopicDistribution(JCas jcas, int begin, int end) { 49 | super(jcas); 50 | setBegin(begin); 51 | setEnd(end); 52 | readObject(); 53 | } 54 | 55 | /** 56 | * Write your own initialization here 57 | * 58 | @generated modifiable */ 59 | private void readObject() {} 60 | 61 | 62 | 63 | //*--------------* 64 | //* Feature: topicDistribution 65 | 66 | /** getter for topicDistribution - gets 67 | * @generated */ 68 | public DoubleArray getTopicDistribution() { 69 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));} 72 | 73 | /** setter for topicDistribution - sets 74 | * @generated */ 75 | public void setTopicDistribution(DoubleArray v) { 76 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 78 | jcasType.ll_cas.ll_setRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));} 79 | 80 | /** indexed getter for topicDistribution - gets an indexed value - 81 | * @generated */
82 | public double getTopicDistribution(int i) { 83 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i); 86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);} 87 | 88 | /** indexed setter for topicDistribution - sets an indexed value - 89 | * @generated */ 90 | public void setTopicDistribution(int i, double v) { 91 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i); 94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);} 95 | } 96 | 97 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */ 3 | package de.tudarmstadt.langtech.semantics.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** 17 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015 18 | * @generated */ 19 | public class SegmentScore_Type extends Annotation_Type { 20 | /** @generated */ 21 | @Override 22 | protected FSGenerator getFSGenerator() {return fsGenerator;} 23 | /** @generated */ 24 | private final FSGenerator fsGenerator = 25 | new FSGenerator() { 26 | public FeatureStructure createFS(int addr, CASImpl cas) { 27 | if (SegmentScore_Type.this.useExistingInstance) { 28 | // Return eq fs instance if already created 29 | FeatureStructure fs = SegmentScore_Type.this.jcas.getJfsFromCaddr(addr); 30 | if (null == fs) { 31 | fs = new SegmentScore(addr, SegmentScore_Type.this); 32 | SegmentScore_Type.this.jcas.putJfsFromCaddr(addr, fs); 33 | return fs; 34 | } 35 | return fs; 36 | } else return new SegmentScore(addr, SegmentScore_Type.this); 37 | } 38 | }; 39 | /** @generated */ 40 | @SuppressWarnings ("hiding") 41 | public final static int typeIndexID = SegmentScore.typeIndexID; 42 | /** @generated 43 | @modifiable */ 44 | @SuppressWarnings ("hiding") 45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.langtech.semantics.type.SegmentScore"); 46 | 47 | /** @generated */ 48 | final Feature casFeat_score; 49 | /** @generated */ 50 | final int casFeatCode_score; 51 | /** @generated */ 52 | public double getScore(int addr) { 53 | if (featOkTst && 
casFeat_score == null) 54 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 55 | return ll_cas.ll_getDoubleValue(addr, casFeatCode_score); 56 | } 57 | /** @generated */ 58 | public void setScore(int addr, double v) { 59 | if (featOkTst && casFeat_score == null) 60 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 61 | ll_cas.ll_setDoubleValue(addr, casFeatCode_score, v);} 62 | 63 | 64 | 65 | /** @generated */ 66 | final Feature casFeat_similarityScores; 67 | /** @generated */ 68 | final int casFeatCode_similarityScores; 69 | /** @generated */ 70 | public String getSimilarityScores(int addr) { 71 | if (featOkTst && casFeat_similarityScores == null) 72 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 73 | return ll_cas.ll_getStringValue(addr, casFeatCode_similarityScores); 74 | } 75 | /** @generated */ 76 | public void setSimilarityScores(int addr, String v) { 77 | if (featOkTst && casFeat_similarityScores == null) 78 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore"); 79 | ll_cas.ll_setStringValue(addr, casFeatCode_similarityScores, v);} 80 | 81 | 82 | 83 | 84 | 85 | /** initialize variables to correspond with Cas Type and Features 86 | * @generated */ 87 | public SegmentScore_Type(JCas jcas, Type casType) { 88 | super(jcas, casType); 89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 90 | 91 | 92 | casFeat_score = jcas.getRequiredFeatureDE(casType, "score", "uima.cas.Double", featOkTst); 93 | casFeatCode_score = (null == casFeat_score) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_score).getCode(); 94 | 95 | 96 | casFeat_similarityScores = jcas.getRequiredFeatureDE(casType, "similarityScores", "uima.cas.String", featOkTst); 97 | casFeatCode_similarityScores = (null == casFeat_similarityScores) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_similarityScores).getCode(); 98 | 99 | } 100 | } 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */ 4 | package de.tudarmstadt.langtech.lda.type; 5 | 6 | import org.apache.uima.jcas.JCas; 7 | import org.apache.uima.jcas.JCasRegistry; 8 | import org.apache.uima.jcas.cas.TOP_Type; 9 | 10 | import org.apache.uima.jcas.tcas.Annotation; 11 | import org.apache.uima.jcas.cas.DoubleArray; 12 | 13 | 14 | /** 15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012 16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml 17 | * @generated */ 18 | public class WordTopicDistribution extends Annotation { 19 | /** @generated 20 | * @ordered 21 | */ 22 | public final static int typeIndexID = JCasRegistry.register(WordTopicDistribution.class); 23 | /** @generated 24 | * @ordered 25 | */ 26 | public final static int type = typeIndexID; 27 | /** @generated */ 28 | public int getTypeIndexID() {return typeIndexID;} 29 | 30 | /** Never called. 
Disable default constructor 31 | * @generated */ 32 | protected WordTopicDistribution() {} 33 | 34 | /** Internal - constructor used by generator 35 | * @generated */ 36 | public WordTopicDistribution(int addr, TOP_Type type) { 37 | super(addr, type); 38 | readObject(); 39 | } 40 | 41 | /** @generated */ 42 | public WordTopicDistribution(JCas jcas) { 43 | super(jcas); 44 | readObject(); 45 | } 46 | 47 | /** @generated */ 48 | public WordTopicDistribution(JCas jcas, int begin, int end) { 49 | super(jcas); 50 | setBegin(begin); 51 | setEnd(end); 52 | readObject(); 53 | } 54 | 55 | /** 56 | * Write your own initialization here 57 | * 58 | @generated modifiable */ 59 | private void readObject() {} 60 | 61 | 62 | 63 | //*--------------* 64 | //* Feature: topicDistribution 65 | 66 | /** getter for topicDistribution - gets 67 | * @generated */ 68 | public DoubleArray getTopicDistribution() { 69 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));} 72 | 73 | /** setter for topicDistribution - sets 74 | * @generated */ 75 | public void setTopicDistribution(DoubleArray v) { 76 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 78 | jcasType.ll_cas.ll_setRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));} 79 | 80 | /** indexed getter for topicDistribution - gets an indexed value - 81 | * @generated */ 82 | public double getTopicDistribution(int i) { 83 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i); 86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);} 87 | 88 | /** indexed setter for topicDistribution - sets an indexed value - 89 | * @generated */ 90 | public void setTopicDistribution(int i, double v) { 91 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null) 92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i); 94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);} 95 | } 96 | 97 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Wed Apr 11 
15:17:37 CEST 2012 */ 3 | package de.tudarmstadt.langtech.lda.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** 17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012 18 | * @generated */ 19 | public class TopicDistribution_Type extends Annotation_Type { 20 | /** @generated */ 21 | protected FSGenerator getFSGenerator() {return fsGenerator;} 22 | /** @generated */ 23 | private final FSGenerator fsGenerator = 24 | new FSGenerator() { 25 | public FeatureStructure createFS(int addr, CASImpl cas) { 26 | if (TopicDistribution_Type.this.useExistingInstance) { 27 | // Return eq fs instance if already created 28 | FeatureStructure fs = TopicDistribution_Type.this.jcas.getJfsFromCaddr(addr); 29 | if (null == fs) { 30 | fs = new TopicDistribution(addr, TopicDistribution_Type.this); 31 | TopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs); 32 | return fs; 33 | } 34 | return fs; 35 | } else return new TopicDistribution(addr, TopicDistribution_Type.this); 36 | } 37 | }; 38 | /** @generated */ 39 | public final static int typeIndexID = TopicDistribution.typeIndexID; 40 | /** @generated 41 | @modifiable */ 42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 43 | 44 | /** @generated */ 45 | final Feature casFeat_topicDistribution; 46 | /** @generated */ 47 | final int casFeatCode_topicDistribution; 48 | /** @generated */ 49 | public int getTopicDistribution(int addr) { 50 | if (featOkTst && casFeat_topicDistribution == null) 51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution); 53 | } 54 | /** @generated */ 55 | public void setTopicDistribution(int addr, int v) { 56 | if (featOkTst && casFeat_topicDistribution == null) 57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);} 59 | 60 | /** @generated */ 61 | public double getTopicDistribution(int addr, int i) { 62 | if (featOkTst && casFeat_topicDistribution == null) 63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 64 | if (lowLevelTypeChecks) 65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true); 66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 68 | } 69 | 70 | /** @generated */ 71 | public void setTopicDistribution(int addr, int i, double v) { 72 | if (featOkTst && casFeat_topicDistribution == null) 73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution"); 74 | if (lowLevelTypeChecks) 75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true); 76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 77 | 
ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v); 78 | } 79 | 80 | 81 | 82 | 83 | /** initialize variables to correspond with Cas Type and Features 84 | * @generated */ 85 | public TopicDistribution_Type(JCas jcas, Type casType) { 86 | super(jcas, casType); 87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 88 | 89 | 90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst); 91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode(); 92 | 93 | } 94 | } 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution_Type.java: -------------------------------------------------------------------------------- 1 | 2 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */ 3 | package de.tudarmstadt.langtech.lda.type; 4 | 5 | import org.apache.uima.jcas.JCas; 6 | import org.apache.uima.jcas.JCasRegistry; 7 | import org.apache.uima.cas.impl.CASImpl; 8 | import org.apache.uima.cas.impl.FSGenerator; 9 | import org.apache.uima.cas.FeatureStructure; 10 | import org.apache.uima.cas.impl.TypeImpl; 11 | import org.apache.uima.cas.Type; 12 | import org.apache.uima.cas.impl.FeatureImpl; 13 | import org.apache.uima.cas.Feature; 14 | import org.apache.uima.jcas.tcas.Annotation_Type; 15 | 16 | /** 17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012 18 | * @generated */ 19 | public class WordTopicDistribution_Type extends Annotation_Type { 20 | /** @generated */ 21 | protected FSGenerator getFSGenerator() {return fsGenerator;} 22 | /** @generated */ 23 | private final FSGenerator fsGenerator = 24 | new FSGenerator() { 25 | public FeatureStructure createFS(int addr, CASImpl cas) { 26 | if (WordTopicDistribution_Type.this.useExistingInstance) { 27 | // Return eq fs instance if already created 28 | FeatureStructure fs = WordTopicDistribution_Type.this.jcas.getJfsFromCaddr(addr); 29 | if (null == fs) { 30 | fs = new WordTopicDistribution(addr, WordTopicDistribution_Type.this); 31 | WordTopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs); 32 | return fs; 33 | } 34 | return fs; 35 | } else return new WordTopicDistribution(addr, WordTopicDistribution_Type.this); 36 | } 37 | }; 38 | /** @generated */ 39 | public final static int typeIndexID = WordTopicDistribution.typeIndexID; 40 | /** @generated 41 | @modifiable */ 42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 43 | 44 | /** @generated */ 45 | final Feature casFeat_topicDistribution; 46 | /** @generated */ 47 | final int casFeatCode_topicDistribution; 48 | /** @generated */ 49 | public int getTopicDistribution(int addr) { 50 | if (featOkTst && casFeat_topicDistribution == null) 51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution); 53 | } 54 | /** @generated */ 55 | public void setTopicDistribution(int addr, int v) { 56 | if (featOkTst && casFeat_topicDistribution == null) 57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);} 59 | 60 | /** 
@generated */ 61 | public double getTopicDistribution(int addr, int i) { 62 | if (featOkTst && casFeat_topicDistribution == null) 63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 64 | if (lowLevelTypeChecks) 65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true); 66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 68 | } 69 | 70 | /** @generated */ 71 | public void setTopicDistribution(int addr, int i, double v) { 72 | if (featOkTst && casFeat_topicDistribution == null) 73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution"); 74 | if (lowLevelTypeChecks) 75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true); 76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i); 77 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v); 78 | } 79 | 80 | 81 | 82 | 83 | /** initialize variables to correspond with Cas Type and Features 84 | * @generated */ 85 | public WordTopicDistribution_Type(JCas jcas, Type casType) { 86 | super(jcas, casType); 87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 88 | 89 | 90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst); 91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode(); 92 | 93 | } 94 | } 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/RunTopicTilingOnFile.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.uima.UIMAException; 6 | import org.apache.uima.analysis_engine.AnalysisEngine; 7 | import org.apache.uima.collection.CollectionReader; 8 | import org.apache.uima.resource.ResourceInitializationException; 9 | import org.kohsuke.args4j.CmdLineException; 10 | import org.kohsuke.args4j.CmdLineParser; 11 | import org.kohsuke.args4j.Option; 12 | import org.uimafit.factory.AnalysisEngineFactory; 13 | import org.uimafit.factory.CollectionReaderFactory; 14 | import org.uimafit.pipeline.SimplePipeline; 15 | 16 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments; 17 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator; 18 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; 19 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter; 20 | 21 | public class RunTopicTilingOnFile { 22 | 23 | private static class Options { 24 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true) 25 | String topicModelDirectory; 26 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true) 27 | String topicModelName; 28 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false) 29 | boolean useDirectNeighbor=false; 30 | @Option(name="-i",usage="Number of 
inference iterations used to annotate words with topic IDs (default 100)",required=false) 31 | int inferenceIterations=100; 32 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false) 33 | boolean modeCounting=true; 34 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false) 35 | int windowSize=1; 36 | @Option(name="-ri",usage="Use the repeated inference method",required = false) 37 | int repeatedInference=1; 38 | @Option(name="-rs",usage="Use the repeated segmentation",required = false) 39 | int repeatedSegmentation=1; 40 | @Option(name="-fd",usage="Directory fo the test files",required = true) 41 | public String fileDirectory; 42 | @Option(name="-fp",usage="File pattern for the test files",required = true) 43 | public String filePattern; 44 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false) 45 | public String output=null; 46 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true) 47 | // public String segmentNumber; 48 | } 49 | 50 | public static void main(final String[] args) 51 | throws ResourceInitializationException, UIMAException, IOException { 52 | Options options = new Options(); 53 | CmdLineParser parser = new CmdLineParser(options); 54 | try { 55 | parser.parseArgument(args); 56 | } catch( CmdLineException e ) { 57 | System.err.println(e.getMessage()); 58 | System.err.println("java -jar myprogram.jar [options...] arguments..."); 59 | parser.printUsage(System.err); 60 | return; 61 | } 62 | 63 | new RunTopicTilingOnFile(options); 64 | 65 | } 66 | 67 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException { 68 | String neighbor = "HIGHEST_NEIGHBOR"; 69 | if (opt.useDirectNeighbor) 70 | neighbor = "DIRECT_NEIGHBOR"; 71 | final CollectionReader reader = CollectionReaderFactory.createCollectionReader( 72 | TextReader.class, 73 | TextReader.PARAM_PATH, opt.fileDirectory 74 | , 75 | 76 | 77 | TextReader.PARAM_PATTERNS, new String[] { "[+]"+opt.filePattern } 78 | ); 79 | 80 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class); 81 | AnalysisEngine topicTiling = AnalysisEngineFactory 82 | .createPrimitive( 83 | TopicTilingSegmenterAnnotator.class, 84 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY, 85 | opt.topicModelDirectory, 86 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME, 87 | opt.topicModelName, 88 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION, 89 | opt.inferenceIterations, 90 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE, 91 | opt.repeatedInference, 92 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION, 93 | opt.repeatedSegmentation, 94 | TopicTilingSegmenterAnnotator.PARAM_WINDOW, 95 | opt.windowSize, 96 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE, 97 | neighbor, 98 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING, 99 | opt.modeCounting); 100 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output); 101 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments); 102 | 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Estimator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 
2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 27 | */ 28 | 29 | package jgibbslda; 30 | 31 | import java.io.File; 32 | 33 | public class Estimator { 34 | 35 | // output model 36 | protected Model trnModel; 37 | LDACmdOption option; 38 | 39 | public boolean init(LDACmdOption option){ 40 | this.option = option; 41 | trnModel = new Model(); 42 | 43 | if (option.est){ 44 | if (!trnModel.initNewModel(option)) 45 | return false; 46 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName); 47 | } 48 | else if (option.estc){ 49 | if (!trnModel.initEstimatedModel(option)) 50 | return false; 51 | } 52 | 53 | return true; 54 | } 55 | 56 | public void estimate(){ 57 | System.out.println("Sampling " + trnModel.niters + " iteration!"); 58 | 59 | int lastIter = trnModel.liter; 60 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){ 61 | System.out.println("Iteration " + trnModel.liter + " ..."); 62 | 63 | // for all z_i 64 | for (int m = 0; m < trnModel.M; m++){ 65 | for (int n = 0; n < trnModel.data.docs[m].length; n++){ 66 | // z_i = z[m][n] 67 | // sample from p(z_i|z_-i, w) 68 | int topic = sampling(m, n); 69 | trnModel.z[m].set(n, topic); 70 | }// end for each word 71 | }// end for each document 72 | 73 | if (option.savestep > 0){ 74 | if (trnModel.liter % option.savestep == 0){ 75 | System.out.println("Saving the model at iteration " + trnModel.liter + " ..."); 76 | computeTheta(); 77 | computePhi(); 78 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5)); 79 | } 80 | } 81 | }// end iterations 82 | 83 | System.out.println("Gibbs sampling completed!\n"); 84 | System.out.println("Saving the final model!\n"); 85 | computeTheta(); 86 | computePhi(); 87 | trnModel.liter--; 88 | trnModel.saveModel("model-final"); 89 | } 90 | 91 | /** 92 | * Do sampling 93 | * @param m document number 94 | * @param n word number 95 | * @return topic id 96 | */ 97 | public int sampling(int m, int n){ 98 | // remove z_i from the count variable 99 | int topic = trnModel.z[m].get(n); 100 | int w = trnModel.data.docs[m].words[n]; 101 | 102 | trnModel.nw[w][topic] -= 1; 103 | trnModel.nd[m][topic] -= 1; 104 | trnModel.nwsum[topic] -= 1; 105 | trnModel.ndsum[m] -= 1; 106 | 107 | double Vbeta = trnModel.V * trnModel.beta; 108 | double Kalpha = trnModel.K * trnModel.alpha; 109 | 110 | //do multinominal sampling via cumulative method 111 | for (int k = 0; k < trnModel.K; k++){ 112 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + 
Vbeta) * 113 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha); 114 | } 115 | 116 | // cumulate multinomial parameters 117 | for (int k = 1; k < trnModel.K; k++){ 118 | trnModel.p[k] += trnModel.p[k - 1]; 119 | } 120 | 121 | // scaled sample because of unnormalized p[] 122 | double u = Math.random() * trnModel.p[trnModel.K - 1]; 123 | 124 | for (topic = 0; topic < trnModel.K; topic++){ 125 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p 126 | break; 127 | } 128 | 129 | // add newly estimated z_i to count variables 130 | trnModel.nw[w][topic] += 1; 131 | trnModel.nd[m][topic] += 1; 132 | trnModel.nwsum[topic] += 1; 133 | trnModel.ndsum[m] += 1; 134 | 135 | return topic; 136 | } 137 | 138 | public void computeTheta(){ 139 | for (int m = 0; m < trnModel.M; m++){ 140 | for (int k = 0; k < trnModel.K; k++){ 141 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha); 142 | } 143 | } 144 | } 145 | 146 | public void computePhi(){ 147 | for (int k = 0; k < trnModel.K; k++){ 148 | for (int w = 0; w < trnModel.V; w++){ 149 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta); 150 | } 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Dictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | package jgibbslda; 29 | 30 | import java.io.BufferedReader; 31 | import java.io.BufferedWriter; 32 | import java.io.FileInputStream; 33 | import java.io.FileOutputStream; 34 | import java.io.InputStreamReader; 35 | import java.io.OutputStreamWriter; 36 | import java.util.HashMap; 37 | import java.util.Iterator; 38 | import java.util.Map; 39 | import java.util.StringTokenizer; 40 | 41 | public class Dictionary { 42 | public Map word2id; 43 | public Map id2word; 44 | 45 | //-------------------------------------------------- 46 | // constructors 47 | //-------------------------------------------------- 48 | 49 | public Dictionary(){ 50 | word2id = new HashMap(); 51 | id2word = new HashMap(); 52 | } 53 | 54 | //--------------------------------------------------- 55 | // get/set methods 56 | //--------------------------------------------------- 57 | 58 | public String getWord(int id){ 59 | return id2word.get(id); 60 | } 61 | 62 | public Integer getID (String word){ 63 | return word2id.get(word); 64 | } 65 | 66 | //---------------------------------------------------- 67 | // checking methods 68 | //---------------------------------------------------- 69 | /** 70 | * check if this dictionary contains a specified word 71 | */ 72 | public boolean contains(String word){ 73 | return word2id.containsKey(word); 74 | } 75 | 76 | public boolean contains(int id){ 77 | return id2word.containsKey(id); 78 | } 79 | //--------------------------------------------------- 80 | // manupulating methods 81 | //--------------------------------------------------- 82 | /** 83 | * add a word into this dictionary 84 | * return the corresponding id 85 | */ 86 | public int addWord(String word){ 87 | if (!contains(word)){ 88 | int id = word2id.size(); 89 | 90 | word2id.put(word, id); 91 | id2word.put(id,word); 92 | 93 | return id; 94 | } 95 | else return getID(word); 96 | } 97 | 98 | //--------------------------------------------------- 99 | // I/O methods 100 | //--------------------------------------------------- 101 | /** 102 | * read dictionary from file 103 | */ 104 | public boolean readWordMap(String wordMapFile){ 105 | try{ 106 | BufferedReader reader = new BufferedReader(new InputStreamReader( 107 | new FileInputStream(wordMapFile), "UTF-8")); 108 | String line; 109 | 110 | //read the number of words 111 | line = reader.readLine(); 112 | int nwords = Integer.parseInt(line); 113 | 114 | //read map 115 | for (int i = 0; i < nwords; ++i){ 116 | line = reader.readLine(); 117 | StringTokenizer tknr = new StringTokenizer(line, " \t\n\r"); 118 | 119 | if (tknr.countTokens() != 2) continue; 120 | 121 | String word = tknr.nextToken(); 122 | String id = tknr.nextToken(); 123 | int intID = Integer.parseInt(id); 124 | 125 | id2word.put(intID, word); 126 | word2id.put(word, intID); 127 | } 128 | 129 | reader.close(); 130 | return true; 131 | } 132 | catch (Exception e){ 133 | System.out.println("Error while reading dictionary:" + e.getMessage()); 134 | e.printStackTrace(); 135 | return false; 136 | } 137 | } 138 | 139 | public boolean writeWordMap(String wordMapFile){ 140 | try{ 141 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( 142 | new FileOutputStream(wordMapFile), "UTF-8")); 143 | 144 | //write number of words 145 | writer.write(word2id.size() + "\n"); 146 | 147 | //write word to id 148 | Iterator it = word2id.keySet().iterator(); 149 | while (it.hasNext()){ 150 | String key = it.next(); 151 | Integer value = word2id.get(key); 152 | 153 | writer.write(key + " " + value + "\n"); 154 | } 155 
| 156 | writer.close(); 157 | return true; 158 | } 159 | catch (Exception e){ 160 | System.out.println("Error while writing word map " + e.getMessage()); 161 | e.printStackTrace(); 162 | return false; 163 | } 164 | 165 | 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LogSaveEstimator.java: -------------------------------------------------------------------------------- 1 | package jgibbslda; 2 | 3 | /* 4 | * Copyright (C) 2007 by 5 | * 6 | * Xuan-Hieu Phan 7 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 8 | * Graduate School of Information Sciences 9 | * Tohoku University 10 | * 11 | * Cam-Tu Nguyen 12 | * ncamtu@gmail.com 13 | * College of Technology 14 | * Vietnam National University, Hanoi 15 | * 16 | * Martin Riedl 17 | * riedl@cs.tu-darmstadt.de 18 | * FG Language Technology 19 | * Technische Universität Darmstadt, Germany 20 | * 21 | * JGibbsLDA is a free software; you can redistribute it and/or modify 22 | * it under the terms of the GNU General Public License as published 23 | * by the Free Software Foundation; either version 2 of the License, 24 | * or (at your option) any later version. 25 | * 26 | * JGibbsLDA is distributed in the hope that it will be useful, but 27 | * WITHOUT ANY WARRANTY; without even the implied warranty of 28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | * GNU General Public License for more details. 30 | * 31 | * You should have received a copy of the GNU General Public License 32 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 33 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 34 | */ 35 | 36 | 37 | import java.io.File; 38 | import java.util.Arrays; 39 | 40 | public class LogSaveEstimator { 41 | 42 | // output model 43 | protected Model trnModel; 44 | LDACmdOption option; 45 | public int[] savesteps; 46 | 47 | public boolean init(LDACmdOption option){ 48 | this.option = option; 49 | trnModel = new Model(); 50 | 51 | if (option.est){ 52 | if (!trnModel.initNewModel(option)) 53 | return false; 54 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName); 55 | } 56 | else if (option.estc){ 57 | if (!trnModel.initEstimatedModel(option)) 58 | return false; 59 | } 60 | 61 | return true; 62 | } 63 | 64 | public void estimate(){ 65 | System.out.println("Sampling " + trnModel.niters + " iteration!"); 66 | 67 | 68 | int lastIter = trnModel.liter; 69 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){ 70 | System.out.println("Iteration " + trnModel.liter + " ..."); 71 | 72 | // for all z_i 73 | for (int m = 0; m < trnModel.M; m++){ 74 | 75 | for (int n = 0; n < trnModel.data.docs[m].length; n++){ 76 | // z_i = z[m][n] 77 | // sample from p(z_i|z_-i, w) 78 | int topic = sampling(m, n); 79 | trnModel.z[m].set(n, topic); 80 | }// end for each word 81 | }// end for each document 82 | 83 | // if (option.savestep > 0){ 84 | if (Arrays.binarySearch(savesteps,trnModel.liter)>=0){ 85 | System.out.println("Saving the model at iteration " + trnModel.liter + " ..."); 86 | computeTheta(); 87 | computePhi(); 88 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5)); 89 | } 90 | // } 91 | }// end iterations 92 | 93 | System.out.println("Gibbs sampling completed!\n"); 94 | System.out.println("Saving the final model!\n"); 95 | computeTheta(); 96 | computePhi(); 97 | trnModel.liter--; 98 | 
trnModel.saveModel("model-final"); 99 | } 100 | 101 | /** 102 | * Do sampling 103 | * @param m document number 104 | * @param n word number 105 | * @return topic id 106 | */ 107 | public int sampling(int m, int n){ 108 | // remove z_i from the count variable 109 | int topic = trnModel.z[m].get(n); 110 | int w = trnModel.data.docs[m].words[n]; 111 | trnModel.nw[w][topic] -= 1; 112 | trnModel.nd[m][topic] -= 1; 113 | trnModel.nwsum[topic] -= 1; 114 | trnModel.ndsum[m] -= 1; 115 | 116 | double Vbeta = trnModel.V * trnModel.beta; 117 | double Kalpha = trnModel.K * trnModel.alpha; 118 | 119 | //do multinominal sampling via cumulative method 120 | for (int k = 0; k < trnModel.K; k++){ 121 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + Vbeta) * 122 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha); 123 | } 124 | 125 | // cumulate multinomial parameters 126 | for (int k = 1; k < trnModel.K; k++){ 127 | trnModel.p[k] += trnModel.p[k - 1]; 128 | } 129 | 130 | // scaled sample because of unnormalized p[] 131 | double u = Math.random() * trnModel.p[trnModel.K - 1]; 132 | 133 | for (topic = 0; topic < trnModel.K; topic++){ 134 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p 135 | break; 136 | } 137 | 138 | // add newly estimated z_i to count variables 139 | trnModel.nw[w][topic] += 1; 140 | trnModel.nd[m][topic] += 1; 141 | trnModel.nwsum[topic] += 1; 142 | trnModel.ndsum[m] += 1; 143 | 144 | return topic; 145 | } 146 | 147 | public void computeTheta(){ 148 | for (int m = 0; m < trnModel.M; m++){ 149 | for (int k = 0; k < trnModel.K; k++){ 150 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha); 151 | } 152 | } 153 | } 154 | 155 | public void computePhi(){ 156 | for (int k = 0; k < trnModel.K; k++){ 157 | for (int w = 0; w < trnModel.V; w++){ 158 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta); 159 | } 160 | } 161 | } 162 | } 163 | 164 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/RunTopicTilingOnFile.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | 25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter; 26 | 27 | 28 | import java.io.IOException; 29 | 30 | import org.apache.uima.UIMAException; 31 | import org.apache.uima.analysis_engine.AnalysisEngine; 32 | import org.apache.uima.collection.CollectionReader; 33 | import org.apache.uima.resource.ResourceInitializationException; 34 | import org.kohsuke.args4j.CmdLineException; 35 | import org.kohsuke.args4j.CmdLineParser; 36 | import org.kohsuke.args4j.Option; 37 | import org.uimafit.factory.AnalysisEngineFactory; 38 | import org.uimafit.factory.CollectionReaderFactory; 39 | import org.uimafit.pipeline.SimplePipeline; 40 | 41 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments; 42 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter; 43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator; 44 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader; 45 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter; 46 | 47 | public class RunTopicTilingOnFile { 48 | 49 | private static class Options { 50 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true) 51 | String topicModelDirectory; 52 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true) 53 | String topicModelName; 54 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false) 55 | boolean useDirectNeighbor=false; 56 | @Option(name="-d",usage="Print debugging output (default false)",required=false) 57 | boolean debug=false; 58 | @Option(name="-i",usage="Number of inference iterations used to annotate words with topic IDs (default 100)",required=false) 59 | int inferenceIterations=100; 60 | @Option(name="-s",usage="Use simple segmentation (default=false)",required=false) 61 | boolean useSimpleSegmentation=false; 62 | 63 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false) 64 | boolean modeCounting=true; 65 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false) 66 | int windowSize=1; 67 | @Option(name="-ri",usage="Use the repeated inference method",required = false) 68 | int repeatedInference=1; 69 | @Option(name="-rs",usage="Use the repeated segmentation",required = false) 70 | int repeatedSegmentation=1; 71 | @Option(name="-fd",usage="Directory fo the test files",required = true) 72 | public String fileDirectory; 73 | @Option(name="-fp",usage="File pattern for the test files",required = true) 74 | public String filePattern; 75 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false) 76 | public String output=null; 77 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true) 78 | // public String segmentNumber; 79 | } 80 | 81 | public static void main(final String[] args) 82 | throws ResourceInitializationException, UIMAException, IOException { 83 | Options options = new Options(); 84 | CmdLineParser parser = new CmdLineParser(options); 85 | try { 86 | parser.parseArgument(args); 87 | } catch( CmdLineException e ) { 88 | System.err.println(e.getMessage()); 89 | System.err.println("java -jar myprogram.jar [options...] 
arguments..."); 90 | parser.printUsage(System.err); 91 | return; 92 | } 93 | 94 | new RunTopicTilingOnFile(options); 95 | 96 | } 97 | 98 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException { 99 | String neighbor = "HIGHEST_NEIGHBOR"; 100 | if (opt.useDirectNeighbor) 101 | neighbor = "DIRECT_NEIGHBOR"; 102 | final CollectionReader reader = CollectionReaderFactory.createCollectionReader( 103 | TextReader.class, 104 | TextReader.PARAM_PATH, opt.fileDirectory, 105 | TextReader.PARAM_PATTERNS, new String[] { "[+]" + opt.filePattern }); 106 | 107 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class); 108 | if(opt.useSimpleSegmentation){ 109 | segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class); 110 | } 111 | AnalysisEngine topicTiling = AnalysisEngineFactory 112 | .createPrimitive( 113 | TopicTilingSegmenterAnnotator.class, 114 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY, 115 | opt.topicModelDirectory, 116 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME, 117 | opt.topicModelName, 118 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION, 119 | opt.inferenceIterations, 120 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE, 121 | opt.repeatedInference, 122 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION, 123 | opt.repeatedSegmentation, 124 | TopicTilingSegmenterAnnotator.PARAM_WINDOW, 125 | opt.windowSize, 126 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE, 127 | neighbor, 128 | TopicTilingSegmenterAnnotator.PARAM_DEBUG, 129 | opt.debug, 130 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING, 131 | opt.modeCounting); 132 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output); 133 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments); 134 | 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Inferencer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * Martin Riedl 15 | * riedl@cs.tu-darmstadt.de 16 | * FG Language Technology 17 | * Technische Universität Darmstadt, Germany 18 | * 19 | * JGibbsLDA is a free software; you can redistribute it and/or modify 20 | * it under the terms of the GNU General Public License as published 21 | * by the Free Software Foundation; either version 2 of the License, 22 | * or (at your option) any later version. 23 | * 24 | * JGibbsLDA is distributed in the hope that it will be useful, but 25 | * WITHOUT ANY WARRANTY; without even the implied warranty of 26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 | * GNU General Public License for more details. 28 | * 29 | * You should have received a copy of the GNU General Public License 30 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 31 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
32 | */ 33 | 34 | package jgibbslda; 35 | 36 | import java.util.ArrayList; 37 | import java.util.List; 38 | 39 | import org.apache.uima.UIMAFramework; 40 | import org.apache.uima.util.Level; 41 | import org.apache.uima.util.Logger; 42 | 43 | 44 | 45 | 46 | public class Inferencer { 47 | // Train model 48 | public Model trnModel; 49 | public Dictionary globalDict; 50 | private LDACmdOption option; 51 | public ArrayList values ; 52 | private Model newModel; 53 | public int niters = 100; 54 | public static Logger logger = UIMAFramework.getLogger(Inferencer.class); 55 | //----------------------------------------------------- 56 | // Init method 57 | //----------------------------------------------------- 58 | public boolean init(LDACmdOption option){ 59 | this.option = option; 60 | trnModel = new Model(); 61 | 62 | if (!trnModel.initEstimatedModel(option)) 63 | return false; 64 | 65 | globalDict = trnModel.data.localDict; 66 | computeTrnTheta(); 67 | computeTrnPhi(); 68 | 69 | return true; 70 | } 71 | 72 | //inference new model ~ getting data from a specified dataset 73 | public Model inference( LDADataset newData){ 74 | logger.log(Level.FINE,"init new model"); 75 | Model newModel = new Model(); 76 | 77 | newModel.initNewModel(option, newData, trnModel); 78 | this.newModel = newModel; 79 | 80 | //initialiaze for repeated mode (RIEDL) 81 | values = new ArrayList(); 82 | for (int doc = 0; doc < newModel.z.length; doc++) { 83 | values.add(new int[newModel.z[doc].size()][newModel.K]); 84 | } 85 | 86 | //----------------------- 87 | logger.log(Level.FINE,"Sampling " + niters + " iteration for inference!"); 88 | // TopicTiling.printDim(newModel.z); 89 | 90 | 91 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){ 92 | //System.out.println("Iteration " + newModel.liter + " ..."); 93 | 94 | // for all newz_i 95 | for (int m = 0; m < newModel.M; ++m){//num of docs 96 | for (int n = 0; n < newModel.data.docs[m].length; n++){ 97 | // (newz_i = newz[m][n] 98 | // sample from p(z_i|z_-1,w) 99 | int topic = infSampling(m, n); 100 | newModel.z[m].set(n, topic); 101 | //MR 102 | values.get(m)[n][topic]++; 103 | //END MR 104 | } 105 | }//end foreach new doc 106 | 107 | }// end iterations 108 | 109 | 110 | logger.log(Level.FINE,"Gibbs sampling for inference completed!"); 111 | 112 | computeNewTheta(); 113 | computeNewPhi(); 114 | newModel.liter--; 115 | 116 | return this.newModel; 117 | } 118 | 119 | public Model inference(String [] strs){ 120 | //System.out.println("inference"); 121 | // Model newModel = new Model(); 122 | 123 | //System.out.println("read dataset"); 124 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict); 125 | 126 | return inference(dataset); 127 | } 128 | 129 | public Model inference(List [] strs){ 130 | //System.out.println("inference"); 131 | // Model newModel = new Model(); 132 | 133 | //System.out.println("read dataset"); 134 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict); 135 | 136 | return inference(dataset); 137 | } 138 | 139 | //inference new model ~ getting dataset from file specified in option 140 | public Model inference(){ 141 | //System.out.println("inference"); 142 | 143 | newModel = new Model(); 144 | if (!newModel.initNewModel(option, trnModel)) return null; 145 | 146 | logger.log(Level.INFO,"Sampling " + niters + " iteration for inference!"); 147 | 148 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){ 149 | //System.out.println("Iteration " + newModel.liter + " ..."); 150 | 151 | // for all newz_i 152 | 
for (int m = 0; m < newModel.M; ++m){ 153 | for (int n = 0; n < newModel.data.docs[m].length; n++){ 154 | // (newz_i = newz[m][n] 155 | // sample from p(z_i|z_-1,w) 156 | int topic = infSampling(m, n); 157 | newModel.z[m].set(n, topic); 158 | 159 | } 160 | }//end foreach new doc 161 | 162 | }// end iterations 163 | 164 | logger.log(Level.FINE,"Gibbs sampling for inference completed!"); 165 | logger.log(Level.FINE,"Saving the inference outputs!"); 166 | 167 | computeNewTheta(); 168 | computeNewPhi(); 169 | newModel.liter--; 170 | newModel.saveModel(newModel.dfile + "." + newModel.modelName); 171 | 172 | return newModel; 173 | } 174 | 175 | /** 176 | * do sampling for inference 177 | * m: document number 178 | * n: word number? 179 | */ 180 | protected int infSampling(int m, int n){ 181 | // remove z_i from the count variables 182 | int topic = newModel.z[m].get(n); 183 | int _w = newModel.data.docs[m].words[n]; 184 | int w = newModel.data.lid2gid.get(_w); 185 | newModel.nw[_w][topic] -= 1; 186 | newModel.nd[m][topic] -= 1; 187 | newModel.nwsum[topic] -= 1; 188 | newModel.ndsum[m] -= 1; 189 | 190 | double Vbeta = trnModel.V * newModel.beta; 191 | double Kalpha = trnModel.K * newModel.alpha; 192 | 193 | // do multinomial sampling via cummulative method 194 | for (int k = 0; k < newModel.K; k++){ 195 | newModel.p[k] = (trnModel.nw[w][k] + newModel.nw[_w][k] + newModel.beta)/(trnModel.nwsum[k] + newModel.nwsum[k] + Vbeta) * 196 | (newModel.nd[m][k] + newModel.alpha)/(newModel.ndsum[m] + Kalpha); 197 | } 198 | 199 | // cummulate multinomial parameters 200 | for (int k = 1; k < newModel.K; k++){ 201 | newModel.p[k] += newModel.p[k - 1]; 202 | } 203 | 204 | // scaled sample because of unnormalized p[] 205 | double u = Math.random() * newModel.p[newModel.K - 1]; 206 | 207 | for (topic = 0; topic < newModel.K; topic++){ 208 | if (newModel.p[topic] > u) 209 | break; 210 | } 211 | 212 | // add newly estimated z_i to count variables 213 | newModel.nw[_w][topic] += 1; 214 | newModel.nd[m][topic] += 1; 215 | newModel.nwsum[topic] += 1; 216 | newModel.ndsum[m] += 1; 217 | 218 | return topic; 219 | } 220 | 221 | protected void computeNewTheta(){ 222 | for (int m = 0; m < newModel.M; m++){ 223 | for (int k = 0; k < newModel.K; k++){ 224 | newModel.theta[m][k] = (newModel.nd[m][k] + newModel.alpha) / (newModel.ndsum[m] + newModel.K * newModel.alpha); 225 | }//end foreach topic 226 | }//end foreach new document 227 | } 228 | 229 | protected void computeNewPhi(){ 230 | for (int k = 0; k < newModel.K; k++){ 231 | for (int _w = 0; _w < newModel.V; _w++){ 232 | Integer id = newModel.data.lid2gid.get(_w); 233 | 234 | if (id != null){ 235 | newModel.phi[k][_w] = (trnModel.nw[id][k] + newModel.nw[_w][k] + newModel.beta) / (newModel.nwsum[k] + newModel.nwsum[k] + trnModel.V * newModel.beta); 236 | } 237 | }//end foreach word 238 | }// end foreach topic 239 | } 240 | 241 | protected void computeTrnTheta(){ 242 | for (int m = 0; m < trnModel.M; m++){ 243 | for (int k = 0; k < trnModel.K; k++){ 244 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha); 245 | } 246 | } 247 | } 248 | 249 | protected void computeTrnPhi(){ 250 | for (int k = 0; k < trnModel.K; k++){ 251 | for (int w = 0; w < trnModel.V; w++){ 252 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta); 253 | } 254 | } 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # TopicTiling 2 | 3 | ![TopicTiling](topictiling.png) 4 | 5 | TopicTiling is an LDA-based text segmentation algorithm. 6 | The algorithm is inspired by the well-known [TextTiling](http://www.aclweb.org/anthology/J97-1003) 7 | algorithm developed by [Marti Hearst](http://people.ischool.berkeley.edu/~hearst/), and segments documents using the Latent 8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs 9 | the segmentation in linear time and is thus computationally 10 | less expensive than other LDA-based segmentation methods. 11 | 12 | I have moved the project from SourceForge to GitHub. While the code is still the same, I have updated the documentation on this page. 13 | 14 | For the LDA computation we use [JGibbLda](http://jgibblda.sourceforge.net/) in a slightly modified version, which is why this project is licensed under the GPL. 15 | 16 | 17 | Table of Contents 18 | ================ 19 | 20 | 21 | * [Usage of the binaries](#usage-of-the-binaries) 22 | * [Usage for non-Latin languages](#usage-for-non-latin-languages) 23 | * [Usage of the source code](#usage-of-the-source-code) 24 | * [Compute a topic model](#compute-a-topic-model) 25 | * [Split output file by documents](#split-output-file-by-documents) 26 | * [Citation](#citation) 27 | * [License](#license) 28 | 29 | 30 | 31 | 32 | Usage of the binaries 33 | =============== 34 | 35 | The tool has been developed and tested on Unix-based systems. 36 | As TopicTiling is written in Java, it should also run on Windows 37 | machines. 38 | 39 | To start TopicTiling, you have to download the binary ([zip](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.zip)|[tar.gz](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.tar.gz)) and decompress the archive. To execute the segmentation method, open the command line and navigate to the uncompressed folder: 40 | 41 | ``` 42 | cd topictiling_v1.0 43 | ``` 44 | 45 | We provide a batch script to start the segmentation on Windows: 46 | ``` 47 | topictiling.bat 48 | ``` 49 | and a shell script to start the segmentation on Unix-based operating systems: 50 | ``` 51 | sh topictiling.sh 52 | ``` 53 | 54 | These commands will output all parameters of TopicTiling: 55 | 56 | 57 | ``` 58 | [java] Option "-fd" is required 59 | [java] java -jar myprogram.jar [options...] arguments... 60 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used 61 | [java] (default false) 62 | [java] -fd VAL : Directory fo the test files 63 | [java] -fp VAL : File pattern for the test files 64 | [java] -i N : Number of inference iterations used to annotate words with topic 65 | [java] IDs (default 100) 66 | [java] -m : Use mode counting (true/false) (default=true) 67 | [java] -out VAL : File the content is written to (otherwise stdout will be used) 68 | [java] -ri N : Use the repeated inference method 69 | [java] -rs N : Use the repeated segmentation 70 | [java] -s : Use simple segmentation (default=false) 71 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used) 72 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used) 73 | [java] -w N : Window size used to calculate the sentence similarity 74 | ``` 75 | 76 | We recommend using mode counting (*-m*). In each inference iteration of LDA, a topicId is assigned to each word. In the default implementation this assignment is done via sampling, so a word may receive a different topicId in each inference step. To stabilize the topicId assignment, we store the topicId assigned in each inference iteration and, at the end, use the one that has been sampled most often.
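To make this idea concrete, the following is a minimal, illustrative Java sketch of mode counting; it is not the code shipped with TopicTiling (the actual logic lives in the modified JGibbLDA inferencer and in `GibbsLdaTopicIdAnnotator.getTopicListFromRepeated`), and the class and method names are purely hypothetical:

```
// Illustrative sketch only, not TopicTiling's implementation:
// counts[t][k] holds how often topic k was sampled for token t
// over all inference iterations; the mode assignment for each
// token is the topic with the highest count.
public class ModeCountingSketch {
    public static int[] modeTopics(int[][] counts) {
        int[] mode = new int[counts.length];
        for (int t = 0; t < counts.length; t++) {
            int best = 0;
            for (int k = 1; k < counts[t].length; k++) {
                if (counts[t][k] > counts[t][best]) {
                    best = k;
                }
            }
            mode[t] = best;
        }
        return mode;
    }
}
```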
77 | 78 | In order to test TopicTiling, you also require a topic model that has been computed with either [JGibbLDA](http://jgibblda.sourceforge.net/) or [GibbsLda++](http://gibbslda.sourceforge.net/). A short description of the computation is given [here](#compute-a-topic-model). 79 | 80 | Once you have computed a topic model, you should have a folder called *topicmodel* containing the following files: 81 | ``` 82 | topicmodel/model-final.others 83 | topicmodel/model-final.phi 84 | topicmodel/model-final.tassign 85 | topicmodel/model-final.theta 86 | topicmodel/model-final.twords 87 | topicmodel/wordmap.txt 88 | ``` 89 | 90 | 91 | For the segmentation, we advise repeating the inference five times (*-ri 5*) (see [paper](http://www.aclweb.org/anthology/W12-0703)). To start the segmentation, you can then use the following command, assuming that the files you want to segment are stored in the folder *files_to_segment* and have the file ending "txt": 92 | 93 | ``` 94 | sh topictiling.sh -ri 5 -tmd topicmodel -tmn model-final -fp "*txt" -fd files_to_segment 95 | ``` 96 | 97 | The output of the algorithm is in XML format: 98 | 99 | ``` 100 | 101 | 102 | 103 | score 104 | 105 | 106 | … 107 | 108 | 109 | ``` 110 | 111 | The code returns all maxima where a boundary might be set. If you know the number of segments, you can simply select the N segments with the highest depthScore values and ignore the remaining ones. 112 | 113 | 114 | Usage for non-Latin languages 115 | =============== 116 | The current version uses the Stanford segmenter for tokenization. However, this tokenizer does not work well for languages that do not use Latin characters (e.g. Chinese, Arabic, Hebrew, Japanese). To segment such languages, tokenize the texts beforehand and use the parameter *-s*, which disables the tokenization and expects all words to be separated by white spaces. 117 | 118 | Usage of the source code 119 | =============== 120 | Import both projects into Eclipse. The LDA project contains JGibbLda with slight modifications so that the mode method can be computed. Additionally, it contains UIMA annotators, so it can be used within a UIMA pipeline. The project also has dependencies on DKPro and uimaFIT. To run the TopicTiling algorithm, execute the class TopicTilingTopicDocument. 121 | 122 | Compute a topic model 123 | =============== 124 | 125 | To compute the topic model with LDA, documents are required that represent the domain of the texts the segmentation method will be applied to. For the computation you can use either [JGibbLDA](http://jgibblda.sourceforge.net/) (written in Java) or the faster C++ version [GibbsLda++](http://gibbslda.sourceforge.net/). To get an impression of the effect of the different LDA parameters, you can have a look at our paper: [Sweeping through the Topic Space: Bad luck? Roll again!](http://www.aclweb.org/anthology/W12-0703). In general, we advise training a topic model with 100 topics, alpha set to 50/(number of topics), and beta set to 0.01.
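As a rough illustration only (the exact invocation depends on the tool and version you use, so please check the JGibbLDA or GibbsLda++ documentation), a training run with GibbsLda++ that produces a folder like the *topicmodel* shown above might look as follows; *topicmodel/train.dat* is a placeholder for your training corpus, which contains the number of documents in the first line and one document per line:

```
lda -est -alpha 0.5 -beta 0.01 -ntopics 100 -niters 1000 -savestep 500 -twords 20 -dfile topicmodel/train.dat
```

With 100 topics, alpha = 50/100 = 0.5 follows the advice above; the resulting model-final.* files and wordmap.txt should then appear next to the training file and can be passed to TopicTiling via *-tmd* and *-tmn*.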
126 | 127 | 128 | Split output file by documents 129 | =============== 130 | 131 | The standard TopicTiling method returns one file containing the segments of all documents. If you want one file with segments per document, you can use the Python script *split_output.py* from the repository. For this, the output of TopicTiling should be redirected to a file (e.g. *output_file*). The Python script expects two parameters: the output file of TopicTiling (*output_file*) and a folder that will be created and in which all single-document files are stored (*output_folder*): 132 | ``` 133 | python split_output.py output_file output_folder 134 | ``` 135 | 136 | 137 | Citation 138 | =============== 139 | If you use TopicTiling, please cite one of the following publications: 140 | 141 | ``` 142 | 143 | @article{Riedl:jlcl, 144 | author = {Martin Riedl and Chris Biemann}, 145 | title = {{Text Segmentation with Topic Models }}, 146 | journal = {Journal for Language Technology and Computational Linguistics (JLCL)}, 147 | year={2012}, 148 | volume={27}, 149 | number={47-69}, 150 | pages={13-24}, 151 | url={http://www.jlcl.org/2012_Heft1/jlcl2012-1-3.pdf} 152 | } 153 | 154 | @inproceedings{riedl12_acl, 155 | author = {Martin Riedl and Chris Biemann}, 156 | title = {TopicTiling: A Text Segmentation Algorithm based on LDA}, 157 | year = {2012}, 158 | address = {Jeju, Republic of Korea}, 159 | booktitle = {Proceedings of the Student Research Workshop of the 50th Meeting of the Association for 160 | Computational Linguistics}, 161 | pages = {37--42}, 162 | url={http://www.aclweb.org/anthology/W12-3307}, 163 | } 164 | 165 | ``` 166 | 167 | 168 | 169 | License 170 | =============== 171 | As JGibbLDA, which is contained in the current repository, is published under the GPL 2.0 license, TopicTiling has to be licensed under the GPL as well. 172 | 173 | TopicTiling is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation. 174 | 175 | TopicTiling is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingDocumentSegmenterAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator; 25 | 26 | import java.text.DecimalFormat; 27 | import java.util.ArrayList; 28 | import java.util.Iterator; 29 | import java.util.List; 30 | 31 | import org.apache.uima.UimaContext; 32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 33 | import org.apache.uima.jcas.JCas; 34 | import org.apache.uima.resource.ResourceInitializationException; 35 | import org.uimafit.component.JCasAnnotator_ImplBase; 36 | import org.uimafit.descriptor.ConfigurationParameter; 37 | import org.uimafit.util.JCasUtil; 38 | 39 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized; 40 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTilingTopicDocument; 41 | import de.tudarmstadt.langtech.semantics.type.Segment; 42 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity; 43 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; 44 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 45 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 46 | 47 | public class TopicTilingDocumentSegmenterAnnotator 48 | extends JCasAnnotator_ImplBase { 49 | private boolean printSegments = false; 50 | 51 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory"; 52 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName"; 53 | public static final String PARAM_WINDOW = "TopicTilingWindow"; 54 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation"; 55 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration"; 56 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference"; 57 | 58 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true) 59 | private String ldaModelDirectory; 60 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true) 61 | private String ldaModelName; 62 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true) 63 | private int window; 64 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true) 65 | private int repeatInferences; 66 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true) 67 | private int repeatSegmentation; 68 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true) 69 | private int inferenceIteration; 70 | 71 | 72 | 73 | @Override 74 | public void initialize(UimaContext context) 75 | throws ResourceInitializationException { 76 | super.initialize(context); 77 | 78 | } 79 | 80 | @Override 81 | public void process(JCas jcas) 82 | throws AnalysisEngineProcessException { 83 | 84 | List> s = new ArrayList>(); 85 | 86 | // int i = 0; 87 | Iterator segments = JCasUtil.select(jcas, Segment.class) 88 | .iterator(); 89 | Segment seg = null; 90 | if (segments.hasNext()) 91 | seg = segments.next(); 92 | 93 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) { 94 | 95 | s.add(JCasUtil.selectCovered(Token.class, ss)); 96 | 97 | } 98 | 99 | TopicTilingTopicDocument tttd ; 100 | 101 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) { 102 | 103 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInferences, inferenceIteration); 104 | } else { 105 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class) 106 | .iterator().next().getSegmentCount(); 107 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, 
window, repeatSegmentation, repeatInferences, inferenceIteration,segNum); 108 | } 109 | 110 | 111 | List segmentPositions = tttd.segment(s); 112 | // print(jcas,segmentPositions); 113 | // printRcode(jcas, segmentCounts, wtt2, segmentPositionsWnew); 114 | annotateSegments(jcas, segmentPositions); 115 | } 116 | 117 | private void printRcode(JCas jcas, int segmentCount, 118 | TextTilingWindowOptimized tt, List segments) { 119 | // if (!printRcode) 120 | // return; 121 | DocumentMetaData metaData = DocumentMetaData.get(jcas); 122 | ; 123 | String main = metaData.getDocumentTitle() 124 | + ": Cosine Similarity between sentences "; 125 | if (segmentCount < 0) 126 | main = main + " (segments given: " + segmentCount + ")"; 127 | StringBuffer buffer = new StringBuffer(); 128 | buffer.append("#Cosine Similarity\n"); 129 | buffer.append("pdf(file='" + metaData.getDocumentTitle() 130 | + ".pdf',20,7);\n"); 131 | buffer.append(toListInR(tt.similarityScores, "cos")); 132 | buffer.append(toListInR(segments, "estSeg")); 133 | buffer.append(toListInR(getGoldSegments(jcas), "seg")); 134 | buffer.append(toListInR(tt.minimaPosition, "canSeg")); 135 | buffer.append(toListInR(tt.depthScores, "depth")); 136 | buffer.append("plot(0:" 137 | + (tt.similarityScores.size() - 1) 138 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='" 139 | + main + "');\n"); 140 | buffer.append("abline(v=seg,col='red',lty=5);\n"); 141 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n"); 142 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n"); 143 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size() 144 | + "),col='green',pch=22);\n"); 145 | buffer.append("points(canSeg,rep(max(cos)*0.9," 146 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n"); 147 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length=" 148 | + tt.depthScores.size() + "),labels=depth);\n"); 149 | buffer.append("dev.off();dev.off()"); 150 | System.out.println(buffer.toString()); 151 | 152 | } 153 | 154 | private List getGoldSegments(JCas jcas) { 155 | 156 | List ret = new ArrayList(); 157 | Iterator segIt = JCasUtil.iterator(jcas, Segment.class); 158 | int sentenceCount = -1; 159 | while (segIt.hasNext()) { 160 | Segment seg = segIt.next(); 161 | for (Sentence s : JCasUtil.selectCovered(jcas, Sentence.class, seg)) { 162 | sentenceCount++; 163 | } 164 | ret.add(sentenceCount); 165 | } 166 | return ret; 167 | } 168 | 169 | private StringBuffer toListInR(List list, String name) { 170 | StringBuffer buffer = new StringBuffer(); 171 | buffer.append(name); 172 | buffer.append("=c("); 173 | for (T sc : list) { 174 | if (sc instanceof Double) { 175 | DecimalFormat df = new DecimalFormat("#.##"); 176 | buffer.append(df.format(sc).replace(",", ".")); 177 | } else { 178 | buffer.append(sc); 179 | } 180 | buffer.append(","); 181 | } 182 | if (list.size() > 0) 183 | buffer.deleteCharAt(buffer.length() - 1); 184 | buffer.append(");\n"); 185 | return buffer; 186 | } 187 | 188 | 189 | /** 190 | * expects a list with the sentencenumber that will be segmented 191 | * 192 | * @param jcas 193 | * @param sentenceBreaks 194 | */ 195 | private void annotateSegments(JCas jcas, List sentenceBreaks) { 196 | Iterator sentenceItr = JCasUtil 197 | .iterator(jcas, Sentence.class); 198 | int sentenceCount = -1; 199 | int prevBreak = 0; 200 | if (printSegments) { 201 | System.out.println("Annotated Segments"); 202 | System.out.println(sentenceBreaks.toString()); 203 | } 204 | 205 | for (final int 
sBreak : sentenceBreaks) { 206 | final Segment seg = new Segment(jcas); 207 | 208 | Sentence segmentSentence = null; 209 | 210 | int beginOffset = 0; 211 | int endOffset = 0; 212 | 213 | // move sentenceItr to last sentence in segment 214 | for (; sentenceCount < sBreak; sentenceCount++) { 215 | segmentSentence = sentenceItr.next(); 216 | 217 | if (sentenceCount == prevBreak) { 218 | beginOffset = segmentSentence.getBegin(); 219 | } 220 | } 221 | 222 | if (segmentSentence != null) { 223 | endOffset = segmentSentence.getEnd(); 224 | } 225 | 226 | seg.setBegin(beginOffset); 227 | seg.setEnd(endOffset); 228 | seg.addToIndexes(); 229 | 230 | if (printSegments) { 231 | System.out.println(sBreak + "\t" + sentenceCount + "\t" 232 | + beginOffset + "\t" + endOffset); 233 | } 234 | prevBreak = sBreak; 235 | } 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicIdAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | package de.tudarmstadt.langtech.lda.annotator; 25 | 26 | import static org.uimafit.util.JCasUtil.select; 27 | 28 | import java.util.ArrayList; 29 | import java.util.HashMap; 30 | import java.util.List; 31 | import java.util.Random; 32 | 33 | import jgibbslda.Model; 34 | 35 | import org.apache.uima.UIMAFramework; 36 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 37 | import org.apache.uima.jcas.JCas; 38 | import org.apache.uima.jcas.cas.DoubleArray; 39 | import org.apache.uima.util.Level; 40 | import org.apache.uima.util.Logger; 41 | import org.uimafit.descriptor.ConfigurationParameter; 42 | 43 | import de.tudarmstadt.langtech.lda.type.Topic; 44 | import de.tudarmstadt.langtech.lda.type.TopicDistribution; 45 | import de.tudarmstadt.langtech.lda.type.WordTopicDistribution; 46 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; 47 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 48 | 49 | public abstract class GibbsLdaTopicIdAnnotator extends 50 | GibbsLdaTopicModelAnnotator { 51 | public static final String PARAM_LDA_REPEAT_INFERENCE = "LdaRepeatInference"; 52 | public static final String PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION = "LdaAnnotateDocumentTopicDistribution"; 53 | public static final String PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION = "LdaAnnotateWordTopicDistribution"; 54 | 55 | private static final Logger log = UIMAFramework 56 | .getLogger(GibbsLdaTopicIdAnnotator.class); 57 | @ConfigurationParameter(name = PARAM_LDA_REPEAT_INFERENCE, mandatory = false, defaultValue = "1") 58 | private int ldaRepeatInference; 59 | 60 | @ConfigurationParameter(name = PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false") 61 | private boolean ldaAnnotateDocumentTopicDistribution = false; 62 | 63 | @ConfigurationParameter(name = PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false") 64 | private boolean ldaAnnotateWordTopicDistribution = false; 65 | 66 | /** 67 | * Function iterates over all tokens and assigns a topic ID. This can only 68 | * be performed, when the token is within the model. 
69 | * 70 | * @param jcas 71 | * @param z 72 | */ 73 | 74 | private void annotateTokenWithTopicId(JCas jcas, List[] modelZ, 75 | List[] modelModeZ, List[] documents) { 76 | 77 | int si = 0; 78 | int ti = 0; 79 | int zti = 0; 80 | int actDocumentSize = 0; 81 | List wordTokens = null; 82 | StringBuffer output = new StringBuffer(); 83 | 84 | if (documents.length > 0) { 85 | wordTokens = documents[0]; 86 | actDocumentSize = wordTokens.size(); 87 | } 88 | 89 | for (Token t : select(jcas, Token.class)) { 90 | if (zti >= actDocumentSize) { 91 | ti = 0; 92 | zti = 0; 93 | si++; 94 | wordTokens = documents[si]; 95 | actDocumentSize = wordTokens.size(); 96 | } 97 | String token = t.getCoveredText(); 98 | assert token.equals(wordTokens.get(zti)); 99 | // System.out.print("indices: " + si + "\t" + ti + "\tsize: " 100 | // + modelZ[si].size() + " " + modelModeZ[si].size()); 101 | // System.out.println("\t" + token + " "+ wordTokens.get(zti)); 102 | if (getInferencerGlobalDict().word2id.containsKey(token)) { 103 | int topicId = modelZ[si].get(ti); 104 | int topicModeId = modelModeZ[si].get(ti); 105 | Topic topic = new Topic(jcas, t.getBegin(), t.getEnd()); 106 | topic.setTopicId(topicId); 107 | topic.setTopicModeId(topicModeId); 108 | topic.addToIndexes(); 109 | 110 | ti++; 111 | 112 | output.append(token).append(":").append(topicId).append(":") 113 | .append(topicModeId); 114 | 115 | } else { 116 | output.append(token).append(":NA"); 117 | } 118 | output.append(" "); 119 | zti++; 120 | 121 | } 122 | log.log(Level.FINE, output.toString()); 123 | } 124 | 125 | @Override 126 | public void process(JCas jcas) throws AnalysisEngineProcessException { 127 | final List[] documents = getDocuments(jcas); 128 | DocumentMetaData metaData = DocumentMetaData.get(jcas); 129 | super.setLdaInferenceSaveName(metaData.getDocumentTitle()); 130 | Model m = inference(documents); 131 | // if no inference is repeated z contains the topic IDs that are used 132 | List[] modelZ = m.z; 133 | List[] modelModeZ; 134 | 135 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(), 136 | documents, getInferenceNiters(), 1); 137 | if (ldaRepeatInference > 1) { 138 | // initialize save structure for word wise topic stabilization 139 | ArrayList values = new ArrayList(); 140 | for (int k = 0; k < documents.length; k++) { 141 | values.add(new int[modelZ[k].size()][m.K]); 142 | } 143 | for (int k = 1; k < ldaRepeatInference; k++) { 144 | for (int p = 0; p < documents.length; p++) { 145 | for (int t = 0; t < modelZ[p].size(); t++) { 146 | int topic = modelZ[p].get(t); 147 | values.get(p)[t][topic]++; 148 | } 149 | } 150 | m = inference(documents); 151 | modelZ = m.z; 152 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(), 153 | documents, getInferenceNiters(), 1); 154 | } 155 | } 156 | annotateTokenWithTopicId(jcas, modelZ, modelModeZ, documents); 157 | if (ldaAnnotateDocumentTopicDistribution) 158 | annotateDocumentsWithTopicDistribution(jcas, documents, m); 159 | if(ldaAnnotateWordTopicDistribution) 160 | annotateWordsWithTopicDistribution(jcas,m); 161 | } 162 | 163 | private void annotateWordsWithTopicDistribution(JCas jcas, Model m) { 164 | 165 | HashMap map = new HashMap(); 166 | for(int wi =0;wi< m.phi.length;wi++){ 167 | double[] topics=m.phi[wi]; 168 | String word = getInferencerGlobalDict().id2word.get(wi); 169 | DoubleArray arr = new DoubleArray(jcas, topics.length); 170 | for(int ti=0;ti[] documents, Model m) { 189 | int si = 0; 190 | int ti = 0; 191 | int start = -1; 192 | int docSize = documents[si].size(); 
193 | for (Token t : select(jcas, Token.class)) { 194 | if (start < 0) { 195 | docSize = documents[si].size(); 196 | start = t.getBegin(); 197 | } 198 | ti++; 199 | if (ti == docSize) { 200 | TopicDistribution td = new TopicDistribution(jcas, start, 201 | t.getEnd()); 202 | start = -1; 203 | DoubleArray arr = new DoubleArray(jcas, m.K); 204 | for (int i = 0; i < m.theta[si].length; i++) { 205 | arr.set(i, m.theta[si][i]); 206 | } 207 | td.setTopicDistribution(arr); 208 | td.addToIndexes(); 209 | 210 | si++; 211 | 212 | ti = 0; 213 | } 214 | 215 | } 216 | } 217 | 218 | private List[] getTopicListFromRepeated(ArrayList values, 219 | List[] partsArray, int max, int min) { 220 | @SuppressWarnings("unchecked") 221 | List[] newZ = new ArrayList[values.size()]; 222 | Random r = new Random(); 223 | for (int s = 0; s < values.size(); s++) { 224 | int[][] sentence = values.get(s); 225 | newZ[s] = new ArrayList(); 226 | for (int t = 0; t < sentence.length; t++) { 227 | List candidates = getTopicCandidates(sentence[t], max, 228 | min); 229 | if (candidates.size() > 0) { 230 | int topic = candidates.get(r.nextInt(candidates.size())); 231 | newZ[s].add(topic); 232 | } else { 233 | System.out.println("No Candidates found"); 234 | 235 | System.out.println(); 236 | } 237 | 238 | } 239 | 240 | } 241 | return newZ; 242 | 243 | } 244 | 245 | private List getTopicCandidates(int[] topics, int max, int min) { 246 | ArrayList candidates = new ArrayList(); 247 | for (int m = max; m >= min; m--) { 248 | 249 | for (int t = 0; t < topics.length; t++) { 250 | if (topics[t] == m) { 251 | candidates.add(t); 252 | } 253 | } 254 | if (candidates.size() > 0) { 255 | return candidates; 256 | } 257 | } 258 | return new ArrayList(); 259 | } 260 | 261 | public abstract List[] getDocuments(JCas jcas); 262 | } 263 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDADataset.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2007 by 3 | * 4 | * Xuan-Hieu Phan 5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com 6 | * Graduate School of Information Sciences 7 | * Tohoku University 8 | * 9 | * Cam-Tu Nguyen 10 | * ncamtu@gmail.com 11 | * College of Technology 12 | * Vietnam National University, Hanoi 13 | * 14 | * JGibbsLDA is a free software; you can redistribute it and/or modify 15 | * it under the terms of the GNU General Public License as published 16 | * by the Free Software Foundation; either version 2 of the License, 17 | * or (at your option) any later version. 18 | * 19 | * JGibbsLDA is distributed in the hope that it will be useful, but 20 | * WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU General Public License 25 | * along with JGibbsLDA; if not, write to the Free Software Foundation, 26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
27 | */ 28 | package jgibbslda; 29 | 30 | import java.io.BufferedReader; 31 | import java.io.FileInputStream; 32 | import java.io.InputStreamReader; 33 | import java.util.HashMap; 34 | import java.util.List; 35 | import java.util.Map; 36 | import java.util.Vector; 37 | 38 | public class LDADataset { 39 | //--------------------------------------------------------------- 40 | // Instance Variables 41 | //--------------------------------------------------------------- 42 | 43 | public Dictionary localDict; // local dictionary 44 | public Document [] docs; // a list of documents 45 | public int M; // number of documents 46 | public int V; // number of words 47 | 48 | // map from local coordinates (id) to global ones 49 | // null if the global dictionary is not set 50 | public Map lid2gid; 51 | 52 | //link to a global dictionary (optional), null for train data, not null for test data 53 | public Dictionary globalDict; 54 | 55 | //-------------------------------------------------------------- 56 | // Constructor 57 | //-------------------------------------------------------------- 58 | public LDADataset(){ 59 | localDict = new Dictionary(); 60 | M = 0; 61 | V = 0; 62 | docs = null; 63 | 64 | globalDict = null; 65 | lid2gid = null; 66 | } 67 | 68 | public LDADataset(int M){ 69 | localDict = new Dictionary(); 70 | this.M = M; 71 | this.V = 0; 72 | docs = new Document[M]; 73 | 74 | globalDict = null; 75 | lid2gid = null; 76 | } 77 | 78 | public LDADataset(int M, Dictionary globalDict){ 79 | localDict = new Dictionary(); 80 | this.M = M; 81 | this.V = 0; 82 | docs = new Document[M]; 83 | 84 | this.globalDict = globalDict; 85 | lid2gid = new HashMap(); 86 | } 87 | 88 | //------------------------------------------------------------- 89 | //Public Instance Methods 90 | //------------------------------------------------------------- 91 | /** 92 | * set the document at the index idx if idx is greater than 0 and less than M 93 | * @param doc document to be set 94 | * @param idx index in the document array 95 | */ 96 | public void setDoc(Document doc, int idx){ 97 | if (0 <= idx && idx < M){ 98 | docs[idx] = doc; 99 | } 100 | } 101 | /** 102 | * set the document at the index idx if idx is greater than 0 and less than M 103 | * @param str string contains doc 104 | * @param idx index in the document array 105 | */ 106 | public void setDoc(String str, int idx){ 107 | if (0 <= idx && idx < M){ 108 | String [] words = str.split("[ \\t\\n]"); 109 | 110 | Vector ids = new Vector(); 111 | 112 | for (String word : words){ 113 | int _id = localDict.word2id.size(); 114 | 115 | if (localDict.contains(word)) 116 | _id = localDict.getID(word); 117 | 118 | if (globalDict != null){ 119 | //get the global id 120 | Integer id = globalDict.getID(word); 121 | //System.out.println(id); 122 | 123 | if (id != null){ 124 | localDict.addWord(word); 125 | 126 | lid2gid.put(_id, id); 127 | ids.add(_id); 128 | } 129 | else { //not in global dictionary 130 | //do nothing currently 131 | } 132 | } 133 | else { 134 | localDict.addWord(word); 135 | ids.add(_id); 136 | } 137 | } 138 | 139 | Document doc = new Document(ids, str); 140 | docs[idx] = doc; 141 | V = localDict.word2id.size(); 142 | } 143 | } 144 | 145 | 146 | public void setDoc(List words, int idx){ 147 | String str = ""; 148 | if (0 <= idx && idx < M){ 149 | 150 | Vector ids = new Vector(); 151 | 152 | for (String word : words){ 153 | str+=word+" "; 154 | int _id = localDict.word2id.size(); 155 | 156 | if (localDict.contains(word)) 157 | _id = localDict.getID(word); 158 | 159 
| if (globalDict != null){ 160 | //get the global id 161 | Integer id = globalDict.getID(word); 162 | //System.out.println(id); 163 | 164 | if (id != null){ 165 | localDict.addWord(word); 166 | 167 | lid2gid.put(_id, id); 168 | ids.add(_id); 169 | } 170 | else { //not in global dictionary 171 | //do nothing currently 172 | } 173 | } 174 | else { 175 | localDict.addWord(word); 176 | ids.add(_id); 177 | } 178 | } 179 | 180 | Document doc = new Document(ids, str); 181 | docs[idx] = doc; 182 | V = localDict.word2id.size(); 183 | } 184 | } 185 | //--------------------------------------------------------------- 186 | // I/O methods 187 | //--------------------------------------------------------------- 188 | 189 | /** 190 | * read a dataset from a file, creating a new dictionary 191 | * @return dataset if successful and null otherwise 192 | */ 193 | public static LDADataset readDataSet(String filename){ 194 | try { 195 | BufferedReader reader = new BufferedReader(new InputStreamReader( 196 | new FileInputStream(filename), "UTF-8")); 197 | 198 | LDADataset data = readDataSet(reader); 199 | 200 | reader.close(); 201 | return data; 202 | } 203 | catch (Exception e){ 204 | System.out.println("Read Dataset Error: " + e.getMessage()); 205 | e.printStackTrace(); 206 | return null; 207 | } 208 | } 209 | 210 | /** 211 | * read a dataset from a file with a known vocabulary 212 | * @param filename file from which we read the dataset 213 | * @param dict the dictionary 214 | * @return dataset if successful and null otherwise 215 | */ 216 | public static LDADataset readDataSet(String filename, Dictionary dict){ 217 | try { 218 | BufferedReader reader = new BufferedReader(new InputStreamReader( 219 | new FileInputStream(filename), "UTF-8")); 220 | LDADataset data = readDataSet(reader, dict); 221 | 222 | reader.close(); 223 | return data; 224 | } 225 | catch (Exception e){ 226 | System.out.println("Read Dataset Error: " + e.getMessage()); 227 | e.printStackTrace(); 228 | return null; 229 | } 230 | } 231 | 232 | /** 233 | * read a dataset from a stream, creating a new dictionary 234 | * @return dataset if successful and null otherwise 235 | */ 236 | public static LDADataset readDataSet(BufferedReader reader){ 237 | try { 238 | //read the number of documents 239 | String line; 240 | line = reader.readLine(); 241 | int M = Integer.parseInt(line); 242 | 243 | LDADataset data = new LDADataset(M); 244 | for (int i = 0; i < M; ++i){ 245 | line = reader.readLine(); 246 | 247 | data.setDoc(line, i); 248 | } 249 | 250 | return data; 251 | } 252 | catch (Exception e){ 253 | System.out.println("Read Dataset Error: " + e.getMessage()); 254 | e.printStackTrace(); 255 | return null; 256 | } 257 | } 258 | 259 | /** 260 | * read a dataset from a stream with respect to a specified dictionary 261 | * @param reader stream from which we read the dataset 262 | * @param dict the dictionary 263 | * @return dataset if successful and null otherwise 264 | */ 265 | public static LDADataset readDataSet(BufferedReader reader, Dictionary dict){ 266 | try { 267 | //read the number of documents 268 | String line; 269 | line = reader.readLine(); 270 | int M = Integer.parseInt(line); 271 | System.out.println("NewM:" + M); 272 | 273 | LDADataset data = new LDADataset(M, dict); 274 | for (int i = 0; i < M; ++i){ 275 | line = reader.readLine(); 276 | 277 | data.setDoc(line, i); 278 | } 279 | 280 | return data; 281 | } 282 | catch (Exception e){ 283 | System.out.println("Read Dataset Error: " + e.getMessage()); 284 | e.printStackTrace(); 285 | return null; 286 | } 287 | } 288 | 289
| /** 290 | * read a dataset from an array of strings, creating a new dictionary 291 | * @param strs array of documents, one document per entry 292 | * @return dataset if successful and null otherwise 293 | */ 294 | public static LDADataset readDataSet(String [] strs){ 295 | LDADataset data = new LDADataset(strs.length); 296 | 297 | for (int i = 0 ; i < strs.length; ++i){ 298 | data.setDoc(strs[i], i); 299 | } 300 | return data; 301 | } 302 | 303 | /** 304 | * read a dataset from an array of strings with respect to a specified dictionary 305 | * @param strs array of documents, one document per entry 306 | * @param dict the dictionary 307 | * @return dataset if successful and null otherwise 308 | */ 309 | public static LDADataset readDataSet(String [] strs, Dictionary dict){ 310 | //System.out.println("readDataset..."); 311 | LDADataset data = new LDADataset(strs.length, dict); 312 | 313 | for (int i = 0 ; i < strs.length; ++i){ 314 | //System.out.println("set doc " + i); 315 | data.setDoc(strs[i], i); 316 | } 317 | return data; 318 | } 319 | 320 | public static LDADataset readDataSet(List [] strs, Dictionary dict){ 321 | //System.out.println("readDataset..."); 322 | LDADataset data = new LDADataset(strs.length, dict); 323 | 324 | for (int i = 0 ; i < strs.length; ++i){ 325 | //System.out.println("set doc " + i); 326 | data.setDoc(strs[i], i); 327 | } 328 | return data; 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TopicTilingTopicDocument.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see .
22 | */ 23 | 24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter; 25 | 26 | import java.util.ArrayList; 27 | import java.util.Collection; 28 | import java.util.Collections; 29 | import java.util.HashMap; 30 | import java.util.List; 31 | import java.util.Map.Entry; 32 | 33 | import jgibbslda.Inferencer; 34 | import jgibbslda.LDACmdOption; 35 | import jgibbslda.Model; 36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 37 | 38 | public class TopicTilingTopicDocument { 39 | public List similarityScores; 40 | public List minimaPosition; 41 | public List depthScores; 42 | private Inferencer inf; 43 | private LDACmdOption opt; 44 | 45 | private int segmentNumber = -1; 46 | 47 | private int window = 1; 48 | private String ldaModelDirectory; 49 | private String ldaModelName; 50 | private int repeatSegmentation = 1; 51 | private int inferenceIterations = 100; 52 | private int repeatInference = 1; 53 | 54 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration) { 55 | this(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInference, inferenceIteration, -1); 56 | } 57 | 58 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration, int segmentNumber) { 59 | 60 | super(); 61 | this.ldaModelDirectory = ldaModelDirectory; 62 | this.ldaModelName = ldaModelName; 63 | this.window = window; 64 | this.repeatInference = repeatInference; 65 | this.repeatSegmentation = repeatSegmentation; 66 | this.inferenceIterations = inferenceIteration; 67 | 68 | opt = new LDACmdOption(); 69 | opt.dir = this.ldaModelDirectory; 70 | opt.modelName = this.ldaModelName; 71 | this.segmentNumber = segmentNumber; 72 | 73 | } 74 | 75 | public List segment(List> sentences) { 76 | HashMap map = new HashMap(); 77 | if (segmentNumber < 0) { 78 | return segment2(sentences); 79 | } 80 | for (int i = 0; i < repeatSegmentation; i++) { 81 | 82 | List segments = segment2(sentences); 83 | System.out.println(segments); 84 | for (int value : segments) { 85 | int count = 0; 86 | if (map.containsKey(value)) { 87 | count = map.get(value); 88 | } 89 | map.put(value, count + 1); 90 | 91 | } 92 | } 93 | System.out.println(map); 94 | List segments = new ArrayList(); 95 | for (int i = repeatSegmentation; i >= 0; i--) { 96 | for (Entry e : map.entrySet()) { 97 | if (e.getValue() == i) { 98 | segments.add(e.getKey()); 99 | if (segments.size() == segmentNumber) { 100 | Collections.sort(segments); 101 | return segments; 102 | } 103 | } 104 | 105 | } 106 | } 107 | Collections.sort(segments); 108 | return segments; 109 | } 110 | 111 | public List segment2(List> sentences) { 112 | 113 | similarityScores = getSimilarityScores(sentences); 114 | System.out.println("SIM_TOPIC_TILING_DT: "+similarityScores); 115 | minimaPosition = getMinima(); 116 | depthScores = getDepthScores(); 117 | List segments = new ArrayList(); 118 | if (segmentNumber < 0) 119 | segments = getSegments(); 120 | else 121 | segments = getSegmentsNumberGiven(); 122 | // add the last sentence as boundary if it is not set 123 | 124 | if (segments.size() > 1 && segments.get(segments.size() - 1) != sentences.size()) { 125 | segments.add(sentences.size() - 1); 126 | } else { 127 | System.err.println("segment size:" + segments.size()); 128 | System.err.println("similarites: " + similarityScores); 129 | } 130 | return segments; 131 | } 132 | 133 
| private List getSegmentsNumberGiven() { 134 | List segments = new ArrayList(minimaPosition); 135 | List depths = depthScores; 136 | List depths2 = new ArrayList(depthScores); 137 | if (depths.size() > segmentNumber) { 138 | 139 | Collections.sort(depths); 140 | double min = depths.get(depths.size() - segmentNumber + 1);// save 141 | 142 | for (int i = segments.size() - 1; i >= 0; i--) { 143 | if (depths2.get(i) < min) { 144 | segments.remove(i); 145 | } 146 | } 147 | } 148 | 149 | return segments; 150 | } 151 | 152 | public List getSegments() { 153 | // copy minima list 154 | List segments = new ArrayList(minimaPosition); 155 | 156 | double mean = calculateMean(depthScores); 157 | double variance = calculateVariance(depthScores, mean); 158 | double threshold = mean - variance / 2.0; 159 | 160 | for (int i = segments.size() - 1; i >= 0; i--) { 161 | if (depthScores.get(i) < threshold) { 162 | segments.remove(i); 163 | } 164 | } 165 | return segments; 166 | } 167 | 168 | private double calculateVariance(List vals, double mean) { 169 | double variance = 0.0; 170 | for (double d : vals) { 171 | variance += (d - mean) * (d - mean); 172 | } 173 | variance /= vals.size(); 174 | return variance; 175 | } 176 | 177 | private double calculateMean(List vals) { 178 | double mean = 0.0; 179 | for (double d : vals) { 180 | mean += d; 181 | } 182 | mean /= vals.size(); 183 | return mean; 184 | } 185 | 186 | private List getDepthScores() { 187 | List depths = new ArrayList(); 188 | for (int i : minimaPosition) { 189 | depths.add(getDepths(i)); 190 | } 191 | return depths; 192 | } 193 | 194 | // //left and right neighbor 195 | private double getDepths(int minimumPosition) { 196 | int i = minimumPosition; 197 | double depths = similarityScores.get(i - 1) - similarityScores.get(i) 198 | + similarityScores.get(i + 1) - similarityScores.get(i); 199 | return depths; 200 | } 201 | 202 | 203 | private List getMinima() { 204 | List minima = new ArrayList(); 205 | double prev = 0; 206 | double curr = 0; 207 | double next = 1; 208 | for (int i = 1; i < similarityScores.size() - 1; i++) { 209 | if (next != curr) { 210 | prev = similarityScores.get(i - 1); 211 | } 212 | curr = similarityScores.get(i); 213 | next = similarityScores.get(i + 1); 214 | if (curr < next && curr < prev) { 215 | minima.add(i); 216 | } 217 | } 218 | return minima; 219 | 220 | } 221 | 222 | private List getSimilarityScores(List> sentences) { 223 | List similarities = new ArrayList(); 224 | List parts = new ArrayList(); 225 | for (int i = 0; i < sentences.size(); i++) { 226 | parts.add(getPrev(sentences, i)); 227 | } 228 | for (int i = window - 1; i > 0; i--) { 229 | parts.add(getPrev(sentences, sentences.size() - 1, i)); 230 | } 231 | String[] partsArray = new String[parts.size()]; 232 | int i = 0; 233 | for (String ss : parts) { 234 | partsArray[i++] = ss; 235 | } 236 | double[][] topicDocument = null; 237 | for (i = 0; i < repeatInference; i++) { 238 | Model m = inference(partsArray); 239 | if (topicDocument == null) { 240 | topicDocument = new double[partsArray.length][m.K]; 241 | for (int j = 0; j < partsArray.length; j++) { 242 | for (int k = 0; k < m.K; k++) { 243 | topicDocument[j][k] = 1.0; 244 | } 245 | } 246 | } 247 | for (int j = 0; j < partsArray.length; j++) { 248 | for (int k = 0; k < m.K; k++) { 249 | topicDocument[j][k] *= m.theta[j][k]; 250 | } 251 | } 252 | } 253 | for (i = 0; i < partsArray.length - window; i++) { 254 | double[] v1 = topicDocument[i]; 255 | double[] v2 = topicDocument[i + window]; 256 | double sim = 
calculateDotProduct(v1, v2); 257 | similarities.add(sim); 258 | } 259 | // System.out.println(similarities.size()); 260 | return similarities; 261 | } 262 | 263 | private List getTopicCandidates(int[] topics) { 264 | ArrayList candidates = new ArrayList(); 265 | for (int m = repeatInference; m >= 0; m--) { 266 | 267 | for (int t = 0; t < topics.length; t++) { 268 | if (topics[t] == m) { 269 | candidates.add(t); 270 | } 271 | } 272 | if (candidates.size() > 0) { 273 | return candidates; 274 | } 275 | } 276 | return null; 277 | } 278 | 279 | private int[] getVector(int topicNumber, Collection topicAssigment) { 280 | int[] vec = new int[topicNumber]; 281 | for (int k : topicAssigment) { 282 | vec[k]++; 283 | } 284 | return vec; 285 | } 286 | 287 | private Model inference(String[] sentences) { 288 | inf = new Inferencer(); 289 | inf.init(opt); 290 | 291 | inf.niters = inferenceIterations; 292 | // inf.niters = Integer.parseInt(prop.getProperty("infIteration")); 293 | Model m = inf.inference(sentences); 294 | return m; 295 | } 296 | 297 | private String getPrev(List> sentences, int i) { 298 | 299 | return getPrev(sentences, i, window); 300 | } 301 | 302 | private String getPrev(List> sentences, int i, int window) { 303 | String result = ""; 304 | for (int k = i; k >= 0 && k > (i - window); k--) { 305 | for (Token t : sentences.get(k)) { 306 | result += t.getCoveredText() + " "; 307 | } 308 | } 309 | return result; 310 | } 311 | 312 | private double calculateDotProduct(int[] curr, int[] next) { 313 | int xy = 0; 314 | int sumX = 0; 315 | int sumY = 0; 316 | if (curr.length != next.length) { 317 | throw new IllegalArgumentException("Cosine Similarity: X != Y"); 318 | } 319 | for (int i = 0; i < curr.length; i++) { 320 | int xi = curr[i]; 321 | int yi = next[i]; 322 | 323 | xy += xi * yi; 324 | sumX += xi * xi; 325 | sumY += yi * yi; 326 | } 327 | 328 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY)); 329 | } 330 | 331 | private double calculateDotProduct(double[] curr, double[] next) { 332 | double xy = 0; 333 | double sumX = 0; 334 | double sumY = 0; 335 | if (curr.length != next.length) { 336 | throw new IllegalArgumentException("Cosine Similarity: X != Y"); 337 | } 338 | for (int i = 0; i < curr.length; i++) { 339 | double xi = curr[i]; 340 | double yi = next[i]; 341 | 342 | xy += xi * yi; 343 | sumX += xi * xi; 344 | sumY += yi * yi; 345 | } 346 | 347 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY)); 348 | } 349 | 350 | } 351 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TextTilingWindowOptimized.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 
19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 22 | */ 23 | 24 | 25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter; 26 | 27 | import java.io.FileNotFoundException; 28 | import java.io.FileReader; 29 | import java.io.IOException; 30 | import java.util.ArrayList; 31 | import java.util.Collection; 32 | import java.util.Collections; 33 | import java.util.HashMap; 34 | import java.util.List; 35 | import java.util.Map.Entry; 36 | import java.util.Properties; 37 | import java.util.Random; 38 | 39 | import jgibbslda.Inferencer; 40 | import jgibbslda.LDACmdOption; 41 | import jgibbslda.Model; 42 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 43 | 44 | public class TextTilingWindowOptimized { 45 | private int segmentNumber = -1; 46 | private int window = 1; 47 | private int additionalVectorSize = 1; 48 | public List similarityScores; 49 | public List minimaPosition; 50 | public List depthScores; 51 | private Inferencer inf; 52 | private String ldaModel; 53 | private LDACmdOption opt; 54 | private Properties prop; 55 | private int segmentIteration = 5; 56 | private int inferenceIterationRepeating = 1; 57 | private int inferenceIteration; 58 | 59 | public TextTilingWindowOptimized(String ldaModel) { 60 | this(ldaModel, -1); 61 | } 62 | 63 | public TextTilingWindowOptimized(String ldaModel, int segmentNumber) { 64 | super(); 65 | this.ldaModel = ldaModel; 66 | opt = new LDACmdOption(); 67 | opt.dir = ldaModel; 68 | // opt.modelName = "model-final"; 69 | this.segmentNumber = segmentNumber; 70 | prop = new Properties(); 71 | try { 72 | prop.load(new FileReader("topictiling_config")); 73 | } catch (FileNotFoundException e) { 74 | // TODO Auto-generated catch block 75 | e.printStackTrace(); 76 | } catch (IOException e) { 77 | // TODO Auto-generated catch block 78 | e.printStackTrace(); 79 | } 80 | opt.modelName = prop.getProperty("model_name"); 81 | window = 1; 82 | inferenceIteration = 100; 83 | inferenceIterationRepeating = 1; 84 | segmentIteration = 1; 85 | if (prop.containsKey("window")) 86 | window = Integer.parseInt(prop.getProperty("window")); 87 | if (prop.containsKey("infIteration")) 88 | inferenceIteration = Integer.parseInt(prop.getProperty("infIteration")); 89 | if (prop.containsKey("infIterationRepeating")) 90 | inferenceIterationRepeating = Integer.parseInt(prop.getProperty("infIterationRepeating")); 91 | if (prop.containsKey("segmentIteration")) 92 | segmentIteration = Integer.parseInt(prop.getProperty("segmentIteration")); 93 | System.err.println("window:"+window); 94 | System.err.println("inferenceIteration:"+inferenceIteration); 95 | System.err.println("inferenceIterationRepeating:"+inferenceIterationRepeating); 96 | System.err.println("window:"+window); 97 | } 98 | 99 | public List segment(List> sentences) { 100 | HashMap map = new HashMap(); 101 | if (segmentNumber < 0) { 102 | return segment2(sentences); 103 | } 104 | for (int i = 0; i < segmentIteration; i++) { 105 | 106 | List segments = segment2(sentences); 107 | System.out.println(segments); 108 | for (int value : segments) { 109 | int count = 0; 110 | if (map.containsKey(value)) { 111 | count = map.get(value); 112 | } 113 | map.put(value, count + 1); 114 | 115 | } 116 | } 117 | System.out.println(map); 118 | List segments = new ArrayList(); 119 | for (int i = segmentIteration; i >= 0; i--) { 120 | for (Entry e : map.entrySet()) { 121 | if (e.getValue() == i) { 122 | segments.add(e.getKey()); 123 | if 
(segments.size() == segmentNumber) { 124 | Collections.sort(segments); 125 | return segments; 126 | } 127 | } 128 | 129 | } 130 | } 131 | Collections.sort(segments); 132 | return segments; 133 | } 134 | 135 | public List segment2(List> sentences) { 136 | 137 | similarityScores = getSimilarityScores(sentences); 138 | minimaPosition = getMinima(); 139 | depthScores = getDepthScores(); 140 | 141 | List segments = new ArrayList(); 142 | if (segmentNumber < 0) 143 | segments = getSegments(); 144 | else 145 | segments = getSegmentsNumberGiven(); 146 | // add the last sentence as boundary if it is not set 147 | 148 | if (segments.size()>1&&segments.get(segments.size() - 1) != sentences.size()) { 149 | segments.add(sentences.size() - 1); 150 | }else{ 151 | System.err.println("segment size:"+segments.size()); 152 | System.err.println("similarites: "+similarityScores); 153 | } 154 | // System.out.println(segments); 155 | return segments; 156 | } 157 | 158 | private List getSegmentsNumberGiven() { 159 | List segments = new ArrayList(minimaPosition); 160 | List depths = depthScores; 161 | List depths2 = new ArrayList(depthScores); 162 | if (depths.size() > segmentNumber) { 163 | 164 | Collections.sort(depths); 165 | double min = depths.get(depths.size() - segmentNumber + 1);// save 166 | 167 | for (int i = segments.size() - 1; i >= 0; i--) { 168 | if (depths2.get(i) < min) { 169 | segments.remove(i); 170 | } 171 | } 172 | } 173 | 174 | return segments; 175 | } 176 | 177 | public List getSegments() { 178 | // copy minima list 179 | List segments = new ArrayList(minimaPosition); 180 | 181 | double mean = calculateMean(depthScores); 182 | double variance = calculateVariance(depthScores, mean); 183 | double threshold = mean - variance / 2.0; 184 | 185 | for (int i = segments.size() - 1; i >= 0; i--) { 186 | if (depthScores.get(i) < threshold) { 187 | segments.remove(i); 188 | } 189 | } 190 | return segments; 191 | } 192 | 193 | private double calculateVariance(List vals, double mean) { 194 | double variance = 0.0; 195 | for (double d : vals) { 196 | variance += (d - mean) * (d - mean); 197 | } 198 | variance /= vals.size(); 199 | return variance; 200 | } 201 | 202 | private double calculateMean(List vals) { 203 | double mean = 0.0; 204 | for (double d : vals) { 205 | mean += d; 206 | } 207 | mean /= vals.size(); 208 | return mean; 209 | } 210 | 211 | private List getDepthScores() { 212 | List depths = new ArrayList(); 213 | double depth; 214 | for (int i : minimaPosition) { 215 | 216 | depths.add(getDepths(i)); 217 | } 218 | return depths; 219 | } 220 | 221 | // //left and right neighbor 222 | private double getDepths(int minimumPosition) { 223 | int i = minimumPosition; 224 | double depths = similarityScores.get(i - 1) - similarityScores.get(i) 225 | + similarityScores.get(i + 1) - similarityScores.get(i); 226 | return depths; 227 | } 228 | 229 | private List getMinima() { 230 | List minima = new ArrayList(); 231 | double prev = 0; 232 | double curr = 0; 233 | double next = 1; 234 | for (int i = 1; i < similarityScores.size() - 1; i++) { 235 | if (next != curr) { 236 | prev = similarityScores.get(i - 1); 237 | } 238 | curr = similarityScores.get(i); 239 | next = similarityScores.get(i + 1); 240 | if (curr < next && curr < prev) { 241 | minima.add(i); 242 | } 243 | } 244 | return minima; 245 | 246 | } 247 | 248 | private List getSimilarityScores(List> sentences) { 249 | List similarities = new ArrayList(); 250 | List parts = new ArrayList(); 251 | for (int i = 0; i < sentences.size(); i++) { 252 | 
parts.add(getPrev(sentences, i)); 253 | } 254 | for (int i = window - 1; i > 0; i--) { 255 | parts.add(getPrev(sentences, sentences.size() - 1, i)); 256 | } 257 | String[] partsArray = new String[parts.size()]; 258 | int i = 0; 259 | for (String ss : parts) { 260 | partsArray[i++] = ss; 261 | } 262 | 263 | Model m = inference(partsArray); 264 | if (inferenceIterationRepeating == 1) { 265 | for (i = 0; i < partsArray.length - window; i++) { 266 | int[] v1 = getVector(m.K, m.z[i]); 267 | int[] v2 = getVector(m.K, m.z[i + window]); 268 | double sim = calculateDotProduct(v1, v2); 269 | similarities.add(sim); 270 | } 271 | 272 | } else { 273 | // initialize save structure for word wise topic stabilization 274 | ArrayList values = new ArrayList(); 275 | for (int k = 0; k < partsArray.length; k++) { 276 | values.add(new int[m.z[k].size()][m.K]); 277 | } 278 | for (int k = 1; k < inferenceIterationRepeating; k++) { 279 | for (int p = 0; p < partsArray.length; p++) { 280 | for (int t = 0; t < m.z[p].size(); t++) { 281 | int topic = m.z[p].get(t); 282 | values.get(p)[t][topic]++; 283 | } 284 | } 285 | m = inference(partsArray); 286 | } 287 | 288 | List[] newZ = new ArrayList[partsArray.length]; 289 | Random r = new Random(); 290 | for (int s = 0; s < values.size(); s++) { 291 | int[][] sentence = values.get(s); 292 | newZ[s] = new ArrayList(); 293 | for (int t = 0; t < sentence.length; t++) { 294 | List candidates = getTopicCandidates(sentence[t]); 295 | 296 | int topic = candidates.get(r.nextInt(candidates.size())); 297 | newZ[s].add(topic); 298 | } 299 | 300 | } 301 | for (i = 0; i < newZ.length - window; i++) { 302 | int[] v1 = getVector(m.K, newZ[i]); 303 | int[] v2 = getVector(m.K, newZ[i + window]); 304 | double sim = calculateDotProduct(v1, v2); 305 | similarities.add(sim); 306 | } 307 | 308 | } 309 | 310 | return similarities; 311 | } 312 | 313 | private List getTopicCandidates(int[] topics) { 314 | ArrayList candidates = new ArrayList(); 315 | for (int m = inferenceIterationRepeating; m >= 0; m--) { 316 | 317 | for (int t = 0; t < topics.length; t++) { 318 | if (topics[t] == m) { 319 | candidates.add(t); 320 | } 321 | } 322 | if (candidates.size() > 0) { 323 | return candidates; 324 | } 325 | } 326 | return null; 327 | } 328 | 329 | private int[] getVector(int topicNumber, Collection topicAssigment) { 330 | int[] vec = new int[topicNumber]; 331 | for (int k : topicAssigment) { 332 | vec[k]++; 333 | } 334 | return vec; 335 | } 336 | 337 | private Model inference(String[] sentences) { 338 | inf = new Inferencer(); 339 | inf.init(opt); 340 | 341 | inf.niters = inferenceIteration; 342 | Model m = inf.inference(sentences); 343 | return m; 344 | } 345 | 346 | private double[] norm(int[] v1) { 347 | double sum = 0.0; 348 | for (int v : v1) { 349 | sum += v; 350 | } 351 | double[] vd = new double[v1.length]; 352 | for (int i = 0; i < v1.length; i++) { 353 | vd[i] = v1[i] / sum; 354 | } 355 | return vd; 356 | } 357 | 358 | private int[] getVector(int i, Model m) { 359 | int[] vec = new int[m.K]; 360 | for (int k : m.z[i]) { 361 | vec[k]++; 362 | } 363 | return vec; 364 | } 365 | 366 | private String getPrev(List> sentences, int i) { 367 | 368 | return getPrev(sentences, i, window); 369 | } 370 | 371 | private String getPrev(List> sentences, int i, int window) { 372 | String result = ""; 373 | for (int k = i; k >= 0 && k > (i - window); k--) { 374 | for (Token t : sentences.get(k)) { 375 | result += t.getCoveredText() + " "; 376 | } 377 | } 378 | return result; 379 | } 380 | 381 | private double 
calculateDotProduct(double[] vd1, double[] vd2) { 382 | double xy = 0; 383 | double sumX = 0; 384 | double sumY = 0; 385 | if (vd1.length != vd2.length) { 386 | throw new IllegalArgumentException("Cosine Similarity: X != Y"); 387 | } 388 | for (int i = 0; i < vd1.length; i++) { 389 | double xi = vd1[i]; 390 | double yi = vd2[i]; 391 | 392 | xy += xi * yi; 393 | sumX += xi * xi; 394 | sumY += yi * yi; 395 | } 396 | 397 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY)); 398 | } 399 | 400 | private double calculateDotProduct(int[] curr, int[] next) { 401 | int xy = 0; 402 | int sumX = 0; 403 | int sumY = 0; 404 | if (curr.length != next.length) { 405 | throw new IllegalArgumentException("Cosine Similarity: X != Y"); 406 | } 407 | for (int i = 0; i < curr.length; i++) { 408 | int xi = curr[i]; 409 | int yi = next[i]; 410 | 411 | xy += xi * yi; 412 | sumX += xi * xi; 413 | sumY += yi * yi; 414 | } 415 | 416 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY)); 417 | } 418 | 419 | } 420 | -------------------------------------------------------------------------------- /de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingSegmenterAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Martin Riedl 3 | * riedl@cs.tu-darmstadt.de 4 | * FG Language Technology 5 | * Technische Universität Darmstadt, Germany 6 | * 7 | * 8 | * This file is part of TopicTiling. 9 | * 10 | * TopicTiling is free software: you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation, either version 3 of the License, or 13 | * (at your option) any later version. 14 | * 15 | * TopicTiling is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with TopicTiling. If not, see . 
22 | */ 23 | 24 | 25 | 26 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator; 27 | 28 | import java.text.DecimalFormat; 29 | import java.util.ArrayList; 30 | import java.util.Collection; 31 | import java.util.Iterator; 32 | import java.util.List; 33 | 34 | import org.apache.uima.UimaContext; 35 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 36 | import org.apache.uima.cas.impl.ListUtils; 37 | import org.apache.uima.jcas.JCas; 38 | import org.apache.uima.resource.ResourceInitializationException; 39 | import org.uimafit.component.JCasAnnotator_ImplBase; 40 | import org.uimafit.descriptor.ConfigurationParameter; 41 | import org.uimafit.util.JCasUtil; 42 | 43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized; 44 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTiling; 45 | import de.tudarmstadt.langtech.semantics.type.Segment; 46 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity; 47 | import de.tudarmstadt.langtech.semantics.type.SegmentScore; 48 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; 49 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; 50 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; 51 | 52 | public class TopicTilingSegmenterAnnotator extends JCasAnnotator_ImplBase { 53 | private boolean printSegments = true; 54 | 55 | public static final String PARAM_USE_ASSIGNED_TOPICS = "UseAssgnedTopics"; 56 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory"; 57 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName"; 58 | public static final String PARAM_WINDOW = "TopicTilingWindow"; 59 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation"; 60 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration"; 61 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference"; 62 | public static final String PARAM_DEPTH_SCORE = "DepthScore"; 63 | public static final String PARAM_MODE_COUNTING = "ModeCounting"; 64 | public static final String PARAM_DEBUG="Debug"; 65 | @ConfigurationParameter(name = PARAM_USE_ASSIGNED_TOPICS, mandatory = false) 66 | private boolean useAssignedTopics = false;; 67 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true) 68 | private String ldaModelDirectory; 69 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true) 70 | private String ldaModelName; 71 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true) 72 | private int window; 73 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true) 74 | private int repeatInferences; 75 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true) 76 | private int repeatSegmentation; 77 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true) 78 | private int inferenceIteration; 79 | @ConfigurationParameter(name = PARAM_MODE_COUNTING, mandatory = true) 80 | private boolean modeCounting; 81 | @ConfigurationParameter(name = PARAM_DEBUG, mandatory = false) 82 | private boolean debug; 83 | @ConfigurationParameter(name = PARAM_DEPTH_SCORE, mandatory = true) 84 | private String depthScore; 85 | 86 | @Override 87 | public void initialize(UimaContext context) 88 | throws ResourceInitializationException { 89 | super.initialize(context); 90 | 91 | } 92 | 93 | @Override 94 | public void process(JCas jcas) throws AnalysisEngineProcessException { 95 | 96 | 
List> s = new ArrayList>(); 97 | 98 | // int i = 0; 99 | Iterator segments = JCasUtil.select(jcas, Segment.class) 100 | .iterator(); 101 | Segment seg = null; 102 | if (segments.hasNext()) 103 | seg = segments.next(); 104 | 105 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) { 106 | 107 | s.add(JCasUtil.selectCovered(Token.class, ss)); 108 | 109 | } 110 | 111 | DocumentMetaData meta = DocumentMetaData.get(jcas); 112 | StringBuffer buffer = new StringBuffer(); 113 | buffer.append(meta.getDocumentTitle()); 114 | 115 | buffer.append("\n"); 116 | // TopicTilingTopicDocument tttd ; 117 | 118 | TopicTiling tt; 119 | tt = new TopicTiling(ldaModelDirectory, ldaModelName, window, 120 | repeatSegmentation, repeatInferences, inferenceIteration, 121 | modeCounting, depthScore, useAssignedTopics,debug); 122 | buffer.append("GOL: " + getGoldSegments(jcas) + "\n"); 123 | List segmentPositions; 124 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) { 125 | segmentPositions = tt.segment(s); 126 | } else { 127 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class) 128 | .iterator().next().getSegmentCount(); 129 | segmentPositions = tt.segment(s, segNum); 130 | } 131 | 132 | int j = 0; 133 | for (List ss: s){ 134 | String l = ""; 135 | for (Token t:ss){ 136 | l+=t.getCoveredText()+" "; 137 | } 138 | if(debug)System.out.println(j+"\t"+l); 139 | j++; 140 | } 141 | if(debug)System.out.println(segmentPositions); 142 | annotateSegments(jcas, segmentPositions, tt.depthScores, 143 | tt.minimaPosition,tt.similarityScores); 144 | } 145 | 146 | private void printRcode(JCas jcas, int segmentCount, 147 | TextTilingWindowOptimized tt, List segments) { 148 | DocumentMetaData metaData = DocumentMetaData.get(jcas); 149 | ; 150 | String main = metaData.getDocumentTitle() 151 | + ": Cosine Similarity between sentences "; 152 | if (segmentCount < 0) 153 | main = main + " (segments given: " + segmentCount + ")"; 154 | StringBuffer buffer = new StringBuffer(); 155 | buffer.append("#Cosine Similarity\n"); 156 | buffer.append("pdf(file='" + metaData.getDocumentTitle() 157 | + ".pdf',20,7);\n"); 158 | buffer.append(toListInR(tt.similarityScores, "cos")); 159 | buffer.append(toListInR(segments, "estSeg")); 160 | buffer.append(toListInR(getGoldSegments(jcas), "seg")); 161 | buffer.append(toListInR(tt.minimaPosition, "canSeg")); 162 | buffer.append(toListInR(tt.depthScores, "depth")); 163 | buffer.append("plot(0:" 164 | + (tt.similarityScores.size() - 1) 165 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='" 166 | + main + "');\n"); 167 | buffer.append("abline(v=seg,col='red',lty=5);\n"); 168 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n"); 169 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n"); 170 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size() 171 | + "),col='green',pch=22);\n"); 172 | buffer.append("points(canSeg,rep(max(cos)*0.9," 173 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n"); 174 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length=" 175 | + tt.depthScores.size() + "),labels=depth);\n"); 176 | buffer.append("dev.off();dev.off()"); 177 | System.out.println(buffer.toString()); 178 | 179 | } 180 | 181 | private List getGoldSegments(JCas jcas) { 182 | 183 | List ret = new ArrayList(); 184 | Iterator segIt = JCasUtil.iterator(jcas, Segment.class); 185 | int sentenceCount = -1; 186 | while (segIt.hasNext()) { 187 | Segment seg = segIt.next(); 188 | for (Sentence s : 
JCasUtil.selectCovered(jcas, Sentence.class, seg)) { 189 | sentenceCount++; 190 | } 191 | ret.add(sentenceCount); 192 | } 193 | return ret; 194 | } 195 | 196 | private StringBuffer toListInR(List list, String name) { 197 | StringBuffer buffer = new StringBuffer(); 198 | buffer.append(name); 199 | buffer.append("=c("); 200 | for (T sc : list) { 201 | if (sc instanceof Double) { 202 | DecimalFormat df = new DecimalFormat("#.##"); 203 | buffer.append(df.format(sc).replace(",", ".")); 204 | } else { 205 | buffer.append(sc); 206 | } 207 | buffer.append(","); 208 | } 209 | if (list.size() > 0) 210 | buffer.deleteCharAt(buffer.length() - 1); 211 | buffer.append(");\n"); 212 | return buffer; 213 | } 214 | private String getSimilarityScores(List similarityScores, int from, int to){ 215 | String scores = ""; 216 | int f = from-1; 217 | if (f<0)f=0; 218 | if(debug)System.out.println(f+"\t"+(to-1)); 219 | for(int i =f;i<=to-1;i++){ 220 | scores+=","+similarityScores.get(i); 221 | } 222 | if (scores.length()>0)scores=scores.substring(1); 223 | return scores; 224 | } 225 | private void annotateSegments(JCas jcas, List segmentPositions, 226 | List depthScores, List minimaPosition,List similarityScores) { 227 | List sentences = new ArrayList(JCasUtil.select(jcas, Sentence.class)); 228 | 229 | //add first segment which has no score 230 | int endIdx; 231 | if (segmentPositions.get(segmentPositions.size()-1)!=(sentences.size()-1)){ 232 | segmentPositions.add(sentences.size()-1); 233 | depthScores.add(0.0); 234 | } 235 | int endSentece; 236 | if (segmentPositions.size()>0){ 237 | endIdx=sentences.get(segmentPositions.get(0)).getEnd(); 238 | endSentece=segmentPositions.get(0); 239 | }else{ 240 | endIdx=sentences.get(sentences.size()-1).getEnd(); 241 | endSentece=sentences.size()-1; 242 | } 243 | addSegment(sentences.get(0).getBegin(),endIdx,0.0,getSimilarityScores(similarityScores, 0,endSentece),jcas); 244 | int segEnd; 245 | int segStart; 246 | for(int i=1;i segmentPositions, 263 | List depthScores, List minimaPosition) { 264 | Iterator sentenceItr = JCasUtil 265 | .iterator(jcas, Sentence.class); 266 | int sentenceCount = -1; 267 | int prevBreak = 0; 268 | 269 | for (final int sBreak : segmentPositions) { 270 | final SegmentScore score = new SegmentScore(jcas); 271 | 272 | Sentence segmentSentence = null; 273 | 274 | int beginOffset = 0; 275 | int endOffset = 0; 276 | 277 | // move sentenceItr to last sentence in segment 278 | for (; sentenceCount < sBreak; sentenceCount++) { 279 | segmentSentence = sentenceItr.next(); 280 | 281 | if (sentenceCount == prevBreak) { 282 | beginOffset = segmentSentence.getBegin(); 283 | System.out.println("BeginOffset: "+ beginOffset); 284 | } 285 | } 286 | 287 | if (segmentSentence != null) { 288 | endOffset = segmentSentence.getEnd(); 289 | System.out.println("end offset "+endOffset); 290 | } 291 | score.setBegin(beginOffset); 292 | score.setEnd(endOffset); 293 | int idx = minimaPosition.indexOf(sBreak); 294 | if (idx < 0) { 295 | score.setScore(1.0); 296 | } else { 297 | score.setScore(depthScores.get(idx)); 298 | } 299 | score.addToIndexes(); 300 | if (printSegments) { 301 | System.out.println(sBreak + "\t" + sentenceCount + "\t" 302 | + beginOffset + "\t" + endOffset); 303 | } 304 | prevBreak = sBreak; 305 | } 306 | } 307 | 308 | /** 309 | * expects a list with the sentencenumber that will be segmented 310 | * 311 | * @param jcas 312 | * @param sentenceBreaks 313 | */ 314 | private void annotateSegments(JCas jcas, List sentenceBreaks) { 315 | Iterator sentenceItr = 
JCasUtil 316 | .iterator(jcas, Sentence.class); 317 | int sentenceCount = -1; 318 | int prevBreak = 0; 319 | if (printSegments) { 320 | System.out.println("Annotated Segments"); 321 | System.out.println(sentenceBreaks.toString()); 322 | } 323 | 324 | for (final int sBreak : sentenceBreaks) { 325 | final Segment seg = new Segment(jcas); 326 | 327 | Sentence segmentSentence = null; 328 | 329 | int beginOffset = 0; 330 | int endOffset = 0; 331 | 332 | // move sentenceItr to last sentence in segment 333 | for (; sentenceCount < sBreak; sentenceCount++) { 334 | segmentSentence = sentenceItr.next(); 335 | 336 | if (sentenceCount == prevBreak) { 337 | beginOffset = segmentSentence.getBegin(); 338 | } 339 | } 340 | 341 | if (segmentSentence != null) { 342 | endOffset = segmentSentence.getEnd(); 343 | } 344 | 345 | seg.setBegin(beginOffset); 346 | seg.setEnd(endOffset); 347 | seg.addToIndexes(); 348 | 349 | if (printSegments) { 350 | System.out.println(sBreak + "\t" + sentenceCount + "\t" 351 | + beginOffset + "\t" + endOffset); 352 | } 353 | prevBreak = sBreak; 354 | } 355 | } 356 | } 357 | --------------------------------------------------------------------------------
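
TopicTilingTopicDocument and TextTilingWindowOptimized above share the same TextTiling-style post-processing. A coherence score (the cosine similarity between the LDA topic representations of adjacent sentence windows: topic distributions in TopicTilingTopicDocument, topic assignment counts in TextTilingWindowOptimized) is computed for every gap between windows, the local minima of that score curve become boundary candidates, each candidate receives a depth score (how far the curve rises to its left plus how far it rises to its right), and candidates whose depth falls below mean minus half the variance of all depth scores are discarded. The sketch below condenses the getMinima/getDepthScores/getSegments logic into one self-contained method. It is not part of the repository, the class name DepthScoreSegmenter is illustrative, and it assumes the coherence scores have already been computed.

import java.util.ArrayList;
import java.util.List;

/**
 * Illustrative sketch, not a class from this repository: condensed
 * minima/depth-score selection as used by the segmenter classes above.
 */
public class DepthScoreSegmenter {

    /**
     * @param similarityScores one coherence score per gap between adjacent sentence windows
     * @return indices of the gaps selected as segment boundaries
     */
    public static List<Integer> segment(List<Double> similarityScores) {
        // Boundary candidates: local minima of the coherence curve
        // (simplified test; the classes above also carry the previous value across plateaus).
        List<Integer> minima = new ArrayList<Integer>();
        for (int i = 1; i < similarityScores.size() - 1; i++) {
            double prev = similarityScores.get(i - 1);
            double curr = similarityScores.get(i);
            double next = similarityScores.get(i + 1);
            if (curr < prev && curr < next) {
                minima.add(i);
            }
        }
        List<Integer> boundaries = new ArrayList<Integer>();
        if (minima.isEmpty()) {
            return boundaries;
        }

        // Depth of a minimum: rise to the left plus rise to the right.
        List<Double> depths = new ArrayList<Double>();
        for (int i : minima) {
            depths.add(similarityScores.get(i - 1) - similarityScores.get(i)
                    + similarityScores.get(i + 1) - similarityScores.get(i));
        }

        // Threshold used by getSegments(): mean minus half the variance of the depth scores.
        double mean = 0.0;
        for (double d : depths) {
            mean += d;
        }
        mean /= depths.size();
        double variance = 0.0;
        for (double d : depths) {
            variance += (d - mean) * (d - mean);
        }
        variance /= depths.size();
        double threshold = mean - variance / 2.0;

        // Keep only candidates whose depth reaches the threshold.
        for (int i = 0; i < minima.size(); i++) {
            if (depths.get(i) >= threshold) {
                boundaries.add(minima.get(i));
            }
        }
        return boundaries;
    }
}

For example, for coherence scores [0.9, 0.4, 0.8, 0.7, 0.3, 0.9] the candidates are gaps 1 and 4 with depths 0.9 and 1.0; the mean is 0.95, the variance 0.0025, the threshold roughly 0.949, so only gap 4 is kept as a boundary.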
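
The readDataSet(...) overloads in jgibbslda/LDADataset.java (shown further above) accept either JGibbLDA's plain-text corpus format, where the first line gives the number of documents and each following line holds one document with tokens separated by whitespace, read as UTF-8, or an in-memory String[] with one document per entry. The following sketch exercises both entry points; the demo class and the file name example-corpus.dat are illustrative and not part of the repository.

import java.io.PrintWriter;

import jgibbslda.LDADataset;

// Illustrative demo class, not part of the repository.
public class ReadDataSetExample {
    public static void main(String[] args) throws Exception {
        String[] docs = {
                "topic segmentation splits a text into topically coherent parts",
                "latent dirichlet allocation assigns a topic to every word of a document"
        };

        // Variant 1: the file format expected by readDataSet(String filename):
        // first line = number of documents, then one document per line, UTF-8.
        PrintWriter out = new PrintWriter("example-corpus.dat", "UTF-8");
        out.println(docs.length);
        for (String doc : docs) {
            out.println(doc);
        }
        out.close();
        LDADataset fromFile = LDADataset.readDataSet("example-corpus.dat");
        System.out.println(fromFile != null ? "corpus file loaded" : "corpus file could not be read");

        // Variant 2: directly from memory via readDataSet(String[] strs).
        LDADataset fromMemory = LDADataset.readDataSet(docs);
        System.out.println("in-memory corpus created: " + (fromMemory != null));
    }
}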