├── topictiling.png
├── de.tudarmstadt.langtech.semantics.segmentation.topictiling
│   ├── scripts
│   │   ├── topictiling.sh
│   │   ├── package.sh
│   │   └── topictiling.bat
│   ├── src
│   │   ├── main
│   │   │   ├── resources
│   │   │   │   └── desc
│   │   │   │       └── type
│   │   │   │           ├── Segment.xml
│   │   │   │           ├── SegmentQuantity.xml
│   │   │   │           ├── CohesionIndicator.xml
│   │   │   │           ├── SegmentScore.xml
│   │   │   │           └── GibbsLdaDescriptor.xml
│   │   │   └── java
│   │   │       └── de
│   │   │           └── tudarmstadt
│   │   │               └── langtech
│   │   │                   ├── semantics
│   │   │                   │   ├── type
│   │   │                   │   │   ├── Segment.java
│   │   │                   │   │   ├── Segment_Type.java
│   │   │                   │   │   ├── SegmentQuantity.java
│   │   │                   │   │   ├── SegmentQuantity_Type.java
│   │   │                   │   │   ├── SegmentScore.java
│   │   │                   │   │   └── SegmentScore_Type.java
│   │   │                   │   └── segmentation
│   │   │                   │       └── segmenter
│   │   │                   │           ├── annotator
│   │   │                   │           │   ├── SimpleSegmenter.java
│   │   │                   │           │   ├── OutputSegments.java
│   │   │                   │           │   ├── TopicTilingDocumentSegmenterAnnotator.java
│   │   │                   │           │   └── TopicTilingSegmenterAnnotator.java
│   │   │                   │           ├── RunTopicTilingOnFile.java
│   │   │                   │           ├── TopicTilingTopicDocument.java
│   │   │                   │           └── TextTilingWindowOptimized.java
│   │   │                   └── ml
│   │   │                       └── lda
│   │   │                           └── type
│   │   │                               ├── GibbsLdaTopic.java
│   │   │                               └── GibbsLdaTopic_Type.java
│   │   └── test
│   │       └── java
│   │           ├── TestSimpleReader.java
│   │           └── RunTopicTilingOnFile.java
│   ├── .project
│   ├── .classpath
│   ├── pom.xml
│   └── README.txt
├── split_output.py
├── de.tudarmstadt.langtech.lda
│   ├── .project
│   ├── pom.xml
│   ├── src
│   │   ├── main
│   │   │   ├── java
│   │   │   │   ├── jgibbslda
│   │   │   │   │   ├── Constants.java
│   │   │   │   │   ├── Conversion.java
│   │   │   │   │   ├── LDACmdOption.java
│   │   │   │   │   ├── Pair.java
│   │   │   │   │   ├── LDA.java
│   │   │   │   │   ├── Document.java
│   │   │   │   │   ├── Estimator.java
│   │   │   │   │   ├── Dictionary.java
│   │   │   │   │   ├── LogSaveEstimator.java
│   │   │   │   │   ├── Inferencer.java
│   │   │   │   │   └── LDADataset.java
│   │   │   │   └── de
│   │   │   │       └── tudarmstadt
│   │   │   │           └── langtech
│   │   │   │               └── lda
│   │   │   │                   ├── consumer
│   │   │   │                   │   └── GibbsLdaModelGeneratorConsumer.java
│   │   │   │                   ├── annotator
│   │   │   │                   │   ├── GibbsLdaDocumentBasedTopicIdAnnotator.java
│   │   │   │                   │   ├── GibbsLdaSentenceBasedTopicIdAnnotator.java
│   │   │   │                   │   ├── GibbsLdaTopicModelAnnotator.java
│   │   │   │                   │   └── GibbsLdaTopicIdAnnotator.java
│   │   │   │                   └── type
│   │   │   │                       ├── Topic.java
│   │   │   │                       ├── Topic_Type.java
│   │   │   │                       ├── TopicDistribution.java
│   │   │   │                       ├── WordTopicDistribution.java
│   │   │   │                       ├── TopicDistribution_Type.java
│   │   │   │                       └── WordTopicDistribution_Type.java
│   │   │   └── resources
│   │   │       └── desc
│   │   │           └── type
│   │   │               └── gibbsldatypes.xml
│   │   └── test
│   │       └── java
│   │           └── de
│   │               └── tudarmstadt
│   │                   └── langtech
│   │                       └── lda
│   │                           └── TestLdaTopicModelAnnotator.java
│   └── .classpath
└── README.md
/topictiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/riedlma/topictiling/HEAD/topictiling.png
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.sh:
--------------------------------------------------------------------------------
1 | java -Xmx1G -cp $(echo dependency/*jar| tr ' ' ':'):de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile $@
2 |
--------------------------------------------------------------------------------
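The classpath in topictiling.sh is assembled by listing every jar below dependency/ and turning the spaces into ':' separators; $@ forwards all script arguments unchanged to RunTopicTilingOnFile. The expansion can be checked in isolation (the jar names below are just examples):

    echo dependency/*jar | tr ' ' ':'
    # prints something like: dependency/args4j-2.0.16.jar:dependency/uimafit-1.4.0.jar:...
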
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/package.sh:
--------------------------------------------------------------------------------
1 | outp=topictiling_0.0.2
2 |
3 | cd ../de.tudarmstadt.langtech.lda
4 | mvn package
5 | mvn install
6 | cd ../de.tudarmstadt.langtech.semantics.segmentation.topictiling
7 | mvn package
8 | mvn dependency:copy-dependencies
9 |
10 | mkdir $outp
11 | cp target/*jar $outp
12 | cp -r target/dependency $outp
13 | cp scripts/top*sh $outp
14 | cp scripts/top*bat $outp
15 |
16 | cp README.txt $outp
--------------------------------------------------------------------------------
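Taken together, the two scripts are meant to be used roughly as follows. This is a minimal sketch assuming both Maven projects sit side by side as in the tree above and that package.sh is started from the topictiling project root; the concrete options accepted by RunTopicTilingOnFile are defined in that class and are not repeated here:

    cd de.tudarmstadt.langtech.semantics.segmentation.topictiling
    sh scripts/package.sh                   # builds both projects and collects jars into topictiling_0.0.2/
    cd topictiling_0.0.2
    sh topictiling.sh <options for RunTopicTilingOnFile>   # topictiling.sh expects dependency/ next to it
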
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentDescriptor</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.Segment</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/split_output.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | topic_output = sys.argv[1]
5 | output_folder = sys.argv[2]
6 |
7 | if not os.path.exists(output_folder):
8 |     os.makedirs(output_folder)
9 |
10 | out = ""
11 | out_filename = ""
12 | for l in open(topic_output):
13 |     out+=l
14 |     if l.strip()=="":
15 |         out=""
16 |     if l.strip().startswith(""):
17 |         docname = l.strip().replace("","").replace("","")
18 |     if l.strip().startswith(""):
19 |         fw = open(os.path.join(output_folder,docname),"w")
20 |         fw.write(out)
21 |         fw.close()
22 |
--------------------------------------------------------------------------------
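split_output.py takes the combined TopicTiling output file as its first argument and an output folder (created if it does not exist) as its second, and writes one file per document into that folder. A hypothetical invocation (file and folder names are placeholders):

    python split_output.py topictiling_output.txt segmented/
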
/de.tudarmstadt.langtech.lda/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
  <name>de.tudarmstadt.langtech.lda</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
  <name>de.tudarmstadt.langtech.semantics.segmentation.topictiling</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentQuantity.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentQuantity</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.SegmentQuantity</name>
      <description>Saves the number of segments a document should consist of according to a given gold-standard.</description>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>segmentCount</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/CohesionIndicator.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>CohesionIndicator</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.semantics.type.CohesionIndicator</name>
      <description>Marks a range that is relevant for cohesion. This may be, for instance, a Lemma.</description>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>stringRepresentation</name>
          <description/>
          <rangeTypeName>uima.cas.String</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentScore</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.SegmentScore</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>score</name>
          <description/>
          <rangeTypeName>uima.cas.Double</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>similarityScores</name>
          <description/>
          <rangeTypeName>uima.cas.String</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>GibbsLdaDescriptor</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.ml.lda.type.GibbsLdaTopic</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topic</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>termId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.tudarmstadt.ukp.dkpro</groupId>
  <artifactId>de.tudarmstadt.ukp.dkpro.lda</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>args4j</groupId>
      <artifactId>args4j</artifactId>
      <version>2.0.16</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/TestSimpleReader.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 |
4 | import org.apache.commons.io.FileUtils;
5 | import org.apache.uima.UIMAException;
6 | import org.apache.uima.analysis_engine.AnalysisEngine;
7 | import org.apache.uima.jcas.JCas;
8 | import org.uimafit.factory.AnalysisEngineFactory;
9 | import org.uimafit.factory.JCasFactory;
10 | import org.uimafit.pipeline.SimplePipeline;
11 | import org.uimafit.util.JCasUtil;
12 |
13 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
14 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
15 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
16 |
17 | public class TestSimpleReader {
18 | public static void main(String[] args) throws UIMAException, IOException {
19 | // String f = "test.txt";
20 | // JCas jcas = JCasFactory.createJCas();
21 | // jcas.setDocumentText(FileUtils.readFileToString(new File(f)));
22 | // AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
23 | // SimplePipeline.runPipeline(jcas, segmenter);
24 | // for(Sentence s:JCasUtil.select(jcas, Sentence.class)){
25 | // System.out.println(s.getCoveredText());
26 | // for (Token t: JCasUtil.selectCovered( Token.class,s)){
27 | // System.out.println(t.getCoveredText());
28 | // }
29 | // }
30 |
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Constants.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | public class Constants {
32 | public static final long BUFFER_SIZE_LONG = 1000000;
33 | public static final short BUFFER_SIZE_SHORT = 512;
34 |
35 | public static final int MODEL_STATUS_UNKNOWN = 0;
36 | public static final int MODEL_STATUS_EST = 1;
37 | public static final int MODEL_STATUS_ESTC = 2;
38 | public static final int MODEL_STATUS_INF = 3;
39 | }
40 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Conversion.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | public class Conversion {
32 | public static String ZeroPad( int number, int width )
33 | {
34 | StringBuffer result = new StringBuffer("");
35 | for( int i = 0; i < width-Integer.toString(number).length(); i++ )
36 | result.append( "0" );
37 | result.append( Integer.toString(number) );
38 |
39 | return result.toString();
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
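Conversion.ZeroPad left-pads the decimal representation of a number with zeros up to the requested width; numbers that are already wider are returned unchanged. A minimal usage sketch:

    public class ZeroPadDemo {
        public static void main(String[] args) {
            // width 5 -> "00042"
            System.out.println(jgibbslda.Conversion.ZeroPad(42, 5));
        }
    }
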
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/consumer/GibbsLdaModelGeneratorConsumer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.consumer;
25 |
26 | import jgibbslda.Estimator;
27 | import jgibbslda.LDACmdOption;
28 |
29 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
30 | import org.apache.uima.jcas.JCas;
31 | import org.uimafit.component.JCasConsumer_ImplBase;
32 |
33 | public class GibbsLdaModelGeneratorConsumer extends JCasConsumer_ImplBase {
34 |
35 | @Override
36 | public void process(JCas aJCas)
37 | throws AnalysisEngineProcessException {
38 | LDACmdOption options = new LDACmdOption();
39 | Estimator es = new Estimator();
40 | es.init(options);
41 |
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.tudarmstadt.langtech.semantics.segmentation</groupId>
  <artifactId>de.tudarmstadt.langtech.semantics.segmentation.topictiling</artifactId>
  <version>0.0.2</version>
  <dependencies>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>args4j</groupId>
      <artifactId>args4j</artifactId>
      <version>2.0.16</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.uimafit</groupId>
      <artifactId>uimafit</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.lda</artifactId>
      <version>0.0.1-SNAPSHOT</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDACmdOption.java:
--------------------------------------------------------------------------------
1 | package jgibbslda;
2 |
3 | import org.kohsuke.args4j.*;
4 |
5 | public class LDACmdOption {
6 |
7 | @Option(name="-est", usage="Specify whether we want to estimate model from scratch")
8 | public boolean est = false;
9 |
10 | @Option(name="-estc", usage="Specify whether we want to continue the last estimation")
11 | public boolean estc = false;
12 |
13 | @Option(name="-inf", usage="Specify whether we want to do inference")
14 | public boolean inf = true;
15 |
16 | @Option(name="-dir", usage="Specify directory")
17 | public String dir = "";
18 |
19 | @Option(name="-dfile", usage="Specify data file")
20 | public String dfile = "";
21 |
22 | @Option(name="-model", usage="Specify the model name")
23 | public String modelName = "";
24 |
25 | @Option(name="-alpha", usage="Specify alpha")
26 | public double alpha = -1.0;
27 |
28 | @Option(name="-beta", usage="Specify beta")
29 | public double beta = -1.0;
30 |
31 | @Option(name="-ntopics", usage="Specify the number of topics")
32 | public int K = 100;
33 |
34 | @Option(name="-niters", usage="Specify the number of iterations")
35 | public int niters = 1000;
36 |
37 | @Option(name="-savestep", usage="Specify the number of steps to save the model since the last save")
38 | public int savestep = 100;
39 |
40 | @Option(name="-twords", usage="Specify the number of most likely words to be printed for each topic")
41 | public int twords = 100;
42 |
43 | @Option(name="-withrawdata", usage="Specify whether we include raw data in the input")
44 | public boolean withrawdata = false;
45 |
46 | @Option(name="-wordmap", usage="Specify the wordmap file")
47 | public String wordMapFileName = "wordmap.txt";
48 | }
49 |
--------------------------------------------------------------------------------
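These options map directly onto the command line of jgibbslda.LDA (shown further below): -est/-estc select estimation, -inf selects inference against an existing model. A hypothetical estimation run followed by an inference run might look like this; the classpath, directory, and file names are placeholders:

    java -cp <lda and args4j jars> jgibbslda.LDA -est -dir models/ -dfile corpus.dat -ntopics 100 -niters 1000 -savestep 100 -twords 20
    java -cp <lda and args4j jars> jgibbslda.LDA -inf -dir models/ -model model-final -dfile newdocs.dat
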
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Pair.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.util.Comparator;
32 |
33 | public class Pair implements Comparable<Pair> {
34 | public Object first;
35 | public Comparable second;
36 | public static boolean naturalOrder = false;
37 |
38 | public Pair(Object k, Comparable v){
39 | first = k;
40 | second = v;
41 | }
42 |
43 | public Pair(Object k, Comparable v, boolean naturalOrder){
44 | first = k;
45 | second = v;
46 | Pair.naturalOrder = naturalOrder;
47 | }
48 |
49 | public int compareTo(Pair p){
50 | if (naturalOrder)
51 | return this.second.compareTo(p.second);
52 | else return -this.second.compareTo(p.second);
53 | }
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/resources/desc/type/gibbsldatypes.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>gibbsldatypes</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.Topic</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>topicModeId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicDistribution</name>
          <description/>
          <rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicDistribution</name>
          <description/>
          <rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
          <multipleReferencesAllowed>true</multipleReferencesAllowed>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml
16 | * @generated */
17 | public class Segment extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(Segment.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected Segment() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public Segment(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public Segment(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public Segment(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 | }
64 |
65 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.bat:
--------------------------------------------------------------------------------
1 | java -cp dependency/ant-1.8.1.jar:dependency/ant-launcher-1.8.1.jar:dependency/aopalliance-1.0.jar:dependency/args4j-2.0.16.jar:dependency/commons-compress-1.4.1.jar:dependency/commons-io-2.0.1.jar:dependency/commons-lang-2.6.jar:dependency/commons-logging-1.1.0.jboss.jar:dependency/commons-logging-1.1.1.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.coref-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.io-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.metadata-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.ner-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.parameter-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.resources-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.syntax-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.io.text-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl-1.4.0.jar:dependency/icu4j-4.0.1.jar:dependency/jVinci-2.4.0.jar:dependency/joda-time-2.0.jar:dependency/lda.jar:dependency/org.apache.log4j-1.2.13.v200706111418.jar:dependency/serializer-2.7.1.jar:dependency/spring-aop-3.1.0.RELEASE.jar:dependency/spring-asm-3.1.0.RELEASE.jar:dependency/spring-beans-3.1.0.RELEASE.jar:dependency/spring-context-3.1.0.RELEASE.jar:dependency/spring-core-3.1.0.RELEASE.jar:dependency/spring-expression-3.1.0.RELEASE.jar:dependency/stanford-corenlp-1.3.3.jar:dependency/uimafit-1.4.0.jar:dependency/uimaj-adapter-vinci-2.4.0.jar:dependency/uimaj-core-2.4.0.jar:dependency/uimaj-cpe-2.4.0.jar:dependency/uimaj-document-annotation-2.4.0.jar:dependency/uimaj-tools-2.4.0.jar:dependency/xalan-2.7.1.jar:dependency/xercesImpl-2.9.1.jar:dependency/xml-apis-1.3.03.jar:dependency/xom-1.2.5.jar:dependency/xz-1.0.jar:de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile
2 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/SimpleSegmenter.java:
--------------------------------------------------------------------------------
1 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
2 |
3 | import java.text.BreakIterator;
4 |
5 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
6 | import org.apache.uima.jcas.JCas;
7 | import org.uimafit.component.JCasAnnotator_ImplBase;
8 | import org.uimafit.descriptor.ConfigurationParameter;
9 |
10 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
11 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
12 |
13 | public class SimpleSegmenter extends JCasAnnotator_ImplBase{
14 | public static final String PARAM_TOKEN_BOUNDARY="TokenBoundary";
15 | public static final String PARAM_SENTENCE_BOUNDARY="SentenceBoundary";
16 | @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY,mandatory=false)
17 | private char sentenceBoundary = '\n';
18 | @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY,mandatory=false)
19 | private char tokenBoundary = ' ';
20 |
21 |
22 | @Override
23 | public void process(JCas aJCas) throws AnalysisEngineProcessException {
24 | String txt = aJCas.getDocumentText();
25 | int prevToken = 0;
26 | int prevSentence = 0;
27 | System.out.println(txt);
28 | int i =0;
29 | for (i=0;i<txt.length();i++){
30 |
31 | if (txt.charAt(i)==sentenceBoundary && i-prevSentence>0){
32 | Sentence s = new Sentence(aJCas,prevSentence,i);
33 | s.addToIndexes();
34 | prevSentence=i+1;
35 | Token t = new Token(aJCas,prevToken,i);
36 | t.addToIndexes();
37 | prevToken=i+1;
38 | }
39 | if (txt.charAt(i)==tokenBoundary && i-prevToken>0){
40 | Token t = new Token(aJCas,prevToken,i);
41 | t.addToIndexes();
42 | prevToken=i+1;
43 | }
44 |
45 | }
46 | if (i-prevSentence>0){
47 | Sentence s = new Sentence(aJCas,prevSentence,i);
48 | s.addToIndexes();
49 | Token t = new Token(aJCas,prevToken,i);
50 | t.addToIndexes();
51 | }
52 | }
53 |
54 |
55 | }
56 |
--------------------------------------------------------------------------------
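SimpleSegmenter assumes pre-segmented input: it creates a Sentence annotation at every sentence boundary character (newline by default) and a Token annotation at every token boundary character (space by default). A minimal uimaFIT sketch, essentially the commented-out code in TestSimpleReader (shown earlier) made runnable; "test.txt" is a placeholder:

    import java.io.File;

    import org.apache.commons.io.FileUtils;
    import org.apache.uima.analysis_engine.AnalysisEngine;
    import org.apache.uima.jcas.JCas;
    import org.uimafit.factory.AnalysisEngineFactory;
    import org.uimafit.factory.JCasFactory;
    import org.uimafit.pipeline.SimplePipeline;
    import org.uimafit.util.JCasUtil;

    import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
    import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

    public class SimpleSegmenterDemo {
        public static void main(String[] args) throws Exception {
            // one sentence per line, tokens separated by single spaces
            JCas jcas = JCasFactory.createJCas();
            jcas.setDocumentText(FileUtils.readFileToString(new File("test.txt")));
            AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
            SimplePipeline.runPipeline(jcas, segmenter);
            for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
                System.out.println(s.getCoveredText());
            }
        }
    }
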
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaDocumentBasedTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 | import static org.uimafit.util.JCasUtil.selectCovered;
28 |
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.List;
32 |
33 | import org.apache.uima.jcas.JCas;
34 |
35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 |
39 |
40 | public class GibbsLdaDocumentBasedTopicIdAnnotator
41 | extends GibbsLdaTopicIdAnnotator {
42 |
43 | @Override
44 | public List<String>[] getDocuments(JCas jcas) {
45 | Collection<Sentence> sentences = select(jcas, Sentence.class);
46 | @SuppressWarnings("unchecked")
47 | List<String>[] arr = new ArrayList[1];
48 | arr[0]= new ArrayList<String>();
49 | for (Sentence s : sentences) {
50 | for (Token t : selectCovered(Token.class, s)) {
51 | arr[0].add(t.getCoveredText());
52 | }
53 | }
54 |
55 | return arr;
56 | }
57 |
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.jcas.tcas.Annotation_Type;
13 |
14 | /**
15 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013
16 | * @generated */
17 | public class Segment_Type extends Annotation_Type {
18 | /** @generated */
19 | @Override
20 | protected FSGenerator getFSGenerator() {return fsGenerator;}
21 | /** @generated */
22 | private final FSGenerator fsGenerator =
23 | new FSGenerator() {
24 | public FeatureStructure createFS(int addr, CASImpl cas) {
25 | if (Segment_Type.this.useExistingInstance) {
26 | // Return eq fs instance if already created
27 | FeatureStructure fs = Segment_Type.this.jcas.getJfsFromCaddr(addr);
28 | if (null == fs) {
29 | fs = new Segment(addr, Segment_Type.this);
30 | Segment_Type.this.jcas.putJfsFromCaddr(addr, fs);
31 | return fs;
32 | }
33 | return fs;
34 | } else return new Segment(addr, Segment_Type.this);
35 | }
36 | };
37 | /** @generated */
38 | @SuppressWarnings ("hiding")
39 | public final static int typeIndexID = Segment.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | @SuppressWarnings ("hiding")
43 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.Segment");
44 |
45 |
46 |
47 | /** initialize variables to correspond with Cas Type and Features
48 | * @generated */
49 | public Segment_Type(JCas jcas, Type casType) {
50 | super(jcas, casType);
51 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
52 |
53 | }
54 | }
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaSentenceBasedTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 | import static org.uimafit.util.JCasUtil.selectCovered;
28 |
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.List;
32 |
33 | import org.apache.uima.jcas.JCas;
34 |
35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 | public class GibbsLdaSentenceBasedTopicIdAnnotator extends
39 | GibbsLdaTopicIdAnnotator {
40 |
41 |
42 | public List<String>[] getDocuments(JCas jcas) {
43 | Collection<Sentence> sentences = select(jcas, Sentence.class);
44 | @SuppressWarnings("unchecked")
45 | List<String>[] arr = new ArrayList[sentences.size()];
46 | int i = 0;
47 | for (Sentence s : select(jcas, Sentence.class)) {
48 | System.out.println(s.getCoveredText());
49 | }
50 | for (Sentence s : sentences) {
51 | StringBuffer line = new StringBuffer();
52 | arr[i] = new ArrayList<String>();
53 | for (Token t : selectCovered(Token.class, s)) {
54 | line.append(t.getCoveredText());
55 | line.append(" ");
56 | arr[i].add(t.getCoveredText());
57 | }
58 | i++;
59 | }
60 |
61 | return arr;
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDA.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import org.kohsuke.args4j.*;
32 |
33 | public class LDA {
34 |
35 | public static void main(String args[]){
36 | LDACmdOption option = new LDACmdOption();
37 | CmdLineParser parser = new CmdLineParser(option);
38 |
39 | try {
40 | if (args.length == 0){
41 | showHelp(parser);
42 | return;
43 | }
44 |
45 | parser.parseArgument(args);
46 |
47 | if (option.est || option.estc){
48 | Estimator estimator = new Estimator();
49 | estimator.init(option);
50 | estimator.estimate();
51 | }
52 | else if (option.inf){
53 | Inferencer inferencer = new Inferencer();
54 | inferencer.init(option);
55 |
56 | Model newModel = inferencer.inference();
57 |
58 | for (int i = 0; i < newModel.phi.length; ++i){
59 | //phi: K * V
60 | System.out.println("-----------------------\ntopic" + i + " : ");
61 | for (int j = 0; j < 10; ++j){
62 | System.out.println(inferencer.globalDict.id2word.get(j) + "\t" + newModel.phi[i][j]);
63 | }
64 | }
65 | }
66 | }
67 | catch (CmdLineException cle){
68 | System.out.println("Command line error: " + cle.getMessage());
69 | showHelp(parser);
70 | return;
71 | }
72 | catch (Exception e){
73 | System.out.println("Error in main: " + e.getMessage());
74 | e.printStackTrace();
75 | return;
76 | }
77 | }
78 |
79 | public static void showHelp(CmdLineParser parser){
80 | System.out.println("LDA [options ...] [arguments...]");
81 | parser.printUsage(System.out);
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Document.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.util.Vector;
32 |
33 | public class Document {
34 |
35 | //----------------------------------------------------
36 | //Instance Variables
37 | //----------------------------------------------------
38 | public int [] words;
39 | public String rawStr;
40 | public int length;
41 |
42 | //----------------------------------------------------
43 | //Constructors
44 | //----------------------------------------------------
45 | public Document(){
46 | words = null;
47 | rawStr = "";
48 | length = 0;
49 | }
50 |
51 | public Document(int length){
52 | this.length = length;
53 | rawStr = "";
54 | words = new int[length];
55 | }
56 |
57 | public Document(int length, int [] words){
58 | this.length = length;
59 | rawStr = "";
60 |
61 | this.words = new int[length];
62 | for (int i =0 ; i < length; ++i){
63 | this.words[i] = words[i];
64 | }
65 | }
66 |
67 | public Document(int length, int [] words, String rawStr){
68 | this.length = length;
69 | this.rawStr = rawStr;
70 |
71 | this.words = new int[length];
72 | for (int i =0 ; i < length; ++i){
73 | this.words[i] = words[i];
74 | }
75 | }
76 |
77 | public Document(Vector<Integer> doc){
78 | this.length = doc.size();
79 | rawStr = "";
80 | this.words = new int[length];
81 | for (int i = 0; i < length; i++){
82 | this.words[i] = doc.get(i);
83 | }
84 | }
85 |
86 | public Document(Vector<Integer> doc, String rawStr){
87 | this.length = doc.size();
88 | this.rawStr = rawStr;
89 | this.words = new int[length];
90 | for (int i = 0; i < length; ++i){
91 | this.words[i] = doc.get(i);
92 | }
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
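A Document is simply a fixed-length array of integer word ids plus an optional raw string. A minimal construction sketch (the ids and the text are made up):

    import java.util.Vector;
    import jgibbslda.Document;

    public class DocumentDemo {
        public static void main(String[] args) {
            Vector<Integer> ids = new Vector<Integer>();     // word ids as produced by the dataset's word map
            ids.add(0); ids.add(3); ids.add(3); ids.add(7);
            Document doc = new Document(ids, "some raw text"); // second argument keeps the raw string
            System.out.println(doc.length);                  // 4
        }
    }
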
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /** Saves the number of segments a document should consist of according to a given gold-standard.
14 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml
16 | * @generated */
17 | public class SegmentQuantity extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(SegmentQuantity.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected SegmentQuantity() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public SegmentQuantity(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public SegmentQuantity(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public SegmentQuantity(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: segmentCount
67 |
68 | /** getter for segmentCount - gets
69 | * @generated */
70 | public int getSegmentCount() {
71 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null)
72 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
73 | return jcasType.ll_cas.ll_getIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount);}
74 |
75 | /** setter for segmentCount - sets
76 | * @generated */
77 | public void setSegmentCount(int v) {
78 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null)
79 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
80 | jcasType.ll_cas.ll_setIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount, v);}
81 | }
82 |
83 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Thu Apr 12 12:36:02 CEST 2012
15 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
16 | * @generated */
17 | public class Topic extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | public final static int typeIndexID = JCasRegistry.register(Topic.class);
22 | /** @generated
23 | * @ordered
24 | */
25 | public final static int type = typeIndexID;
26 | /** @generated */
27 | public int getTypeIndexID() {return typeIndexID;}
28 |
29 | /** Never called. Disable default constructor
30 | * @generated */
31 | protected Topic() {}
32 |
33 | /** Internal - constructor used by generator
34 | * @generated */
35 | public Topic(int addr, TOP_Type type) {
36 | super(addr, type);
37 | readObject();
38 | }
39 |
40 | /** @generated */
41 | public Topic(JCas jcas) {
42 | super(jcas);
43 | readObject();
44 | }
45 |
46 | /** @generated */
47 | public Topic(JCas jcas, int begin, int end) {
48 | super(jcas);
49 | setBegin(begin);
50 | setEnd(end);
51 | readObject();
52 | }
53 |
54 | /**
55 | * Write your own initialization here
56 | *
57 | @generated modifiable */
58 | private void readObject() {}
59 |
60 | //*--------------*
61 | //* Feature: topicId
62 |
63 | /** getter for topicId - gets
64 | * @generated */
65 | public int getTopicId() {
66 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null)
67 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
68 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId);}
69 |
70 | /** setter for topicId - sets
71 | * @generated */
72 | public void setTopicId(int v) {
73 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null)
74 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
75 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId, v);}
76 |
77 |
78 | //*--------------*
79 | //* Feature: topicModeId
80 |
81 | /** getter for topicModeId - gets
82 | * @generated */
83 | public int getTopicModeId() {
84 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null)
85 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
86 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId);}
87 |
88 | /** setter for topicModeId - sets
89 | * @generated */
90 | public void setTopicModeId(int v) {
91 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null)
92 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
93 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId, v);}
94 | }
95 |
96 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /** Saves the number of segments a document should consist of according to a given gold-standard.
17 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013
18 | * @generated */
19 | public class SegmentQuantity_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (SegmentQuantity_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = SegmentQuantity_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new SegmentQuantity(addr, SegmentQuantity_Type.this);
32 | SegmentQuantity_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new SegmentQuantity(addr, SegmentQuantity_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = SegmentQuantity.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
46 |
47 | /** @generated */
48 | final Feature casFeat_segmentCount;
49 | /** @generated */
50 | final int casFeatCode_segmentCount;
51 | /** @generated */
52 | public int getSegmentCount(int addr) {
53 | if (featOkTst && casFeat_segmentCount == null)
54 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
55 | return ll_cas.ll_getIntValue(addr, casFeatCode_segmentCount);
56 | }
57 | /** @generated */
58 | public void setSegmentCount(int addr, int v) {
59 | if (featOkTst && casFeat_segmentCount == null)
60 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
61 | ll_cas.ll_setIntValue(addr, casFeatCode_segmentCount, v);}
62 |
63 |
64 |
65 |
66 |
67 | /** initialize variables to correspond with Cas Type and Features
68 | * @generated */
69 | public SegmentQuantity_Type(JCas jcas, Type casType) {
70 | super(jcas, casType);
71 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
72 |
73 |
74 | casFeat_segmentCount = jcas.getRequiredFeatureDE(casType, "segmentCount", "uima.cas.Integer", featOkTst);
75 | casFeatCode_segmentCount = (null == casFeat_segmentCount) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_segmentCount).getCode();
76 |
77 | }
78 | }
79 |
80 |
81 |
82 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/test/java/de/tudarmstadt/langtech/lda/TestLdaTopicModelAnnotator.java:
--------------------------------------------------------------------------------
1 | package de.tudarmstadt.langtech.lda;
2 |
3 | import static org.uimafit.factory.AnalysisEngineFactory.createPrimitive;
4 |
5 | import java.io.IOException;
6 | import java.text.BreakIterator;
7 |
8 | import org.apache.uima.UIMAException;
9 | import org.apache.uima.analysis_engine.AnalysisEngine;
10 | import org.apache.uima.jcas.JCas;
11 | import org.uimafit.component.xwriter.CASDumpWriter;
12 | import org.uimafit.factory.AnalysisEngineFactory;
13 | import org.uimafit.factory.JCasFactory;
14 | import org.uimafit.pipeline.SimplePipeline;
15 |
16 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaDocumentBasedTopicIdAnnotator;
17 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaSentenceBasedTopicIdAnnotator;
18 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaTopicIdAnnotator;
19 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
20 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
21 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
22 |
23 | public class TestLdaTopicModelAnnotator {
24 | public static void main(String[] args) throws UIMAException, IOException {
25 | JCas jcas = getJCas();
26 |
27 | //sentence wise
28 | AnalysisEngine ae = AnalysisEngineFactory.createPrimitive(GibbsLdaSentenceBasedTopicIdAnnotator.class,
29 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
30 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model",
31 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true,
32 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true,
33 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100
34 | );
35 |
36 | //document wise
37 | AnalysisEngine ae2 = AnalysisEngineFactory.createPrimitive(GibbsLdaDocumentBasedTopicIdAnnotator.class,
38 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
39 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model",
40 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true,
41 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true,
42 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100
43 | );
44 |
45 |
46 | AnalysisEngine out = createPrimitive(CASDumpWriter.class);
47 | SimplePipeline.runPipeline(jcas, ae,out);
48 | }
49 |
50 | private static JCas getJCas() throws UIMAException {
51 | JCas jcas = JCasFactory.createJCas();
52 | jcas.setDocumentLanguage("en");
53 | String text = "This is some example document. And there is more text";
54 | jcas.setDocumentText(text);
55 | DocumentMetaData metaData = new DocumentMetaData(jcas);
56 | metaData.setDocumentTitle("Titel");
57 | metaData.addToIndexes();
58 | BreakIterator boundary = BreakIterator.getWordInstance();
59 |
60 |
61 | // print each sentence in reverse order
62 | boundary.setText(text);
63 | int start = boundary.first();
64 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
65 | Token t = new Token(jcas, start, end);
66 | t.addToIndexes();
67 | }
68 | boundary = BreakIterator.getSentenceInstance();
69 | boundary.setText(text);
70 |
71 | start = boundary.first();
72 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
73 | Sentence t = new Sentence(jcas, start, end);
74 | t.addToIndexes();
75 | }
76 | return jcas;
77 |
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */
4 | package de.tudarmstadt.langtech.ml.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml
16 | * @generated */
17 | public class GibbsLdaTopic extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(GibbsLdaTopic.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected GibbsLdaTopic() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public GibbsLdaTopic(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public GibbsLdaTopic(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public GibbsLdaTopic(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: topic
67 |
68 | /** getter for topic - gets
69 | * @generated */
70 | public int getTopic() {
71 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null)
72 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
73 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic);}
74 |
75 | /** setter for topic - sets
76 | * @generated */
77 | public void setTopic(int v) {
78 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null)
79 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
80 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic, v);}
81 |
82 |
83 | //*--------------*
84 | //* Feature: termId
85 |
86 | /** getter for termId - gets
87 | * @generated */
88 | public int getTermId() {
89 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null)
90 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
91 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId);}
92 |
93 | /** setter for termId - sets
94 | * @generated */
95 | public void setTermId(int v) {
96 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null)
97 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
98 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId, v);}
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml
16 | * @generated */
17 | public class SegmentScore extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(SegmentScore.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected SegmentScore() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public SegmentScore(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public SegmentScore(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public SegmentScore(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: score
67 |
68 | /** getter for score - gets
69 | * @generated */
70 | public double getScore() {
71 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null)
72 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
73 | return jcasType.ll_cas.ll_getDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score);}
74 |
75 | /** setter for score - sets
76 | * @generated */
77 | public void setScore(double v) {
78 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null)
79 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
80 | jcasType.ll_cas.ll_setDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score, v);}
81 |
82 |
83 | //*--------------*
84 | //* Feature: similarityScores
85 |
86 | /** getter for similarityScores - gets
87 | * @generated */
88 | public String getSimilarityScores() {
89 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null)
90 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
91 | return jcasType.ll_cas.ll_getStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores);}
92 |
93 | /** setter for similarityScores - sets
94 | * @generated */
95 | public void setSimilarityScores(String v) {
96 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null)
97 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
98 | jcasType.ll_cas.ll_setStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores, v);}
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/OutputSegments.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see .
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
25 |
26 | import java.io.FileNotFoundException;
27 | import java.io.PrintStream;
28 | import java.util.Collection;
29 |
30 | import org.apache.commons.lang.StringEscapeUtils;
31 | import org.apache.uima.UimaContext;
32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
33 | import org.apache.uima.jcas.JCas;
34 | import org.apache.uima.resource.ResourceInitializationException;
35 | import org.uimafit.component.JCasAnnotator_ImplBase;
36 | import org.uimafit.descriptor.ConfigurationParameter;
37 | import org.uimafit.util.JCasUtil;
38 |
39 | import de.tudarmstadt.langtech.semantics.type.SegmentScore;
40 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
41 |
42 | public class OutputSegments extends JCasAnnotator_ImplBase {
43 | public static final String PARAM_OUTPUT = "Output";
44 | @ConfigurationParameter(name = PARAM_OUTPUT, mandatory = false)
45 | private String output;
46 | private PrintStream ps;
47 | @Override
48 | public void initialize(UimaContext context)
49 | throws ResourceInitializationException {
50 | super.initialize(context);
51 | if(output==null){
52 | ps = System.out;
53 | }else{
54 | try {
55 | ps = new PrintStream(output);
56 | } catch (FileNotFoundException e) {
57 | e.printStackTrace();
58 | }
59 | }
60 | 		ps.println("<documents>");
61 | }
62 |
63 | @Override
64 | public void process(JCas aJCas) throws AnalysisEngineProcessException {
65 | 		ps.println("<document>");
66 | 		ps.println("<documentName>"+DocumentMetaData.get(aJCas).getDocumentTitle()+"</documentName>");
67 | ps.println("");
68 | 		Collection<SegmentScore> ss = JCasUtil.select(aJCas, SegmentScore.class);
69 | int i = 0;
70 | for (SegmentScore s : ss) {
71 | if(i==0){
72 | if(s.getBegin()!=0){
73 | 				ps.println("<segment>");
74 | 				ps.println("<depthScore></depthScore>");
75 | 				ps.println("<text>");
76 | 				ps.println(StringEscapeUtils.escapeXml(aJCas.getDocumentText().substring(0,s.getBegin())));
77 | 				ps.println("</text>");
78 | 				ps.println("</segment>");
79 | }
80 | }
81 | 			ps.println("<segment>");
82 | //			ps.println("<similarityScores>"+s.getSimilarityScores()+"</similarityScores>");
83 | 			ps.println("<depthScore>"+s.getScore()+"</depthScore>");
84 | 			ps.println("<text>");
85 | 			ps.println(StringEscapeUtils.escapeXml(s.getCoveredText()));
86 | 			ps.println("</text>");
87 | 			ps.println("</segment>");
88 | i+=1;
89 | }
90 | ps.println("");
91 | 		ps.println("</document>");
92 | }
93 | @Override
94 | public void collectionProcessComplete()
95 | throws AnalysisEngineProcessException {
96 | 		ps.println("</documents>");
97 | ps.close();
98 | super.collectionProcessComplete();
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class Topic_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (Topic_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = Topic_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new Topic(addr, Topic_Type.this);
31 | Topic_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new Topic(addr, Topic_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = Topic.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.Topic");
43 |
44 |
45 |
46 | /** @generated */
47 | final Feature casFeat_topicId;
48 | /** @generated */
49 | final int casFeatCode_topicId;
50 | /** @generated */
51 | public int getTopicId(int addr) {
52 | if (featOkTst && casFeat_topicId == null)
53 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
54 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicId);
55 | }
56 | /** @generated */
57 | public void setTopicId(int addr, int v) {
58 | if (featOkTst && casFeat_topicId == null)
59 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
60 | ll_cas.ll_setIntValue(addr, casFeatCode_topicId, v);}
61 |
62 |
63 |
64 | /** @generated */
65 | final Feature casFeat_topicModeId;
66 | /** @generated */
67 | final int casFeatCode_topicModeId;
68 | /** @generated */
69 | public int getTopicModeId(int addr) {
70 | if (featOkTst && casFeat_topicModeId == null)
71 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
72 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicModeId);
73 | }
74 | /** @generated */
75 | public void setTopicModeId(int addr, int v) {
76 | if (featOkTst && casFeat_topicModeId == null)
77 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
78 | ll_cas.ll_setIntValue(addr, casFeatCode_topicModeId, v);}
79 |
80 |
81 |
82 |
83 |
84 | /** initialize variables to correspond with Cas Type and Features
85 | * @generated */
86 | public Topic_Type(JCas jcas, Type casType) {
87 | super(jcas, casType);
88 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
89 |
90 |
91 | casFeat_topicId = jcas.getRequiredFeatureDE(casType, "topicId", "uima.cas.Integer", featOkTst);
92 | casFeatCode_topicId = (null == casFeat_topicId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicId).getCode();
93 |
94 |
95 | casFeat_topicModeId = jcas.getRequiredFeatureDE(casType, "topicModeId", "uima.cas.Integer", featOkTst);
96 | casFeatCode_topicModeId = (null == casFeat_topicModeId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicModeId).getCode();
97 |
98 | }
99 | }
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */
3 | package de.tudarmstadt.langtech.ml.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013
18 | * @generated */
19 | public class GibbsLdaTopic_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (GibbsLdaTopic_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = GibbsLdaTopic_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this);
32 | GibbsLdaTopic_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = GibbsLdaTopic.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
46 |
47 | /** @generated */
48 | final Feature casFeat_topic;
49 | /** @generated */
50 | final int casFeatCode_topic;
51 | /** @generated */
52 | public int getTopic(int addr) {
53 | if (featOkTst && casFeat_topic == null)
54 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
55 | return ll_cas.ll_getIntValue(addr, casFeatCode_topic);
56 | }
57 | /** @generated */
58 | public void setTopic(int addr, int v) {
59 | if (featOkTst && casFeat_topic == null)
60 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
61 | ll_cas.ll_setIntValue(addr, casFeatCode_topic, v);}
62 |
63 |
64 |
65 | /** @generated */
66 | final Feature casFeat_termId;
67 | /** @generated */
68 | final int casFeatCode_termId;
69 | /** @generated */
70 | public int getTermId(int addr) {
71 | if (featOkTst && casFeat_termId == null)
72 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
73 | return ll_cas.ll_getIntValue(addr, casFeatCode_termId);
74 | }
75 | /** @generated */
76 | public void setTermId(int addr, int v) {
77 | if (featOkTst && casFeat_termId == null)
78 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
79 | ll_cas.ll_setIntValue(addr, casFeatCode_termId, v);}
80 |
81 |
82 |
83 |
84 |
85 | /** initialize variables to correspond with Cas Type and Features
86 | * @generated */
87 | public GibbsLdaTopic_Type(JCas jcas, Type casType) {
88 | super(jcas, casType);
89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
90 |
91 |
92 | casFeat_topic = jcas.getRequiredFeatureDE(casType, "topic", "uima.cas.Integer", featOkTst);
93 | casFeatCode_topic = (null == casFeat_topic) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topic).getCode();
94 |
95 |
96 | casFeat_termId = jcas.getRequiredFeatureDE(casType, "termId", "uima.cas.Integer", featOkTst);
97 | casFeatCode_termId = (null == casFeat_termId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_termId).getCode();
98 |
99 | }
100 | }
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicModelAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.lda.annotator;
26 |
27 | import java.util.ArrayList;
28 | import java.util.List;
29 |
30 | import jgibbslda.Dictionary;
31 | import jgibbslda.Inferencer;
32 | import jgibbslda.LDACmdOption;
33 | import jgibbslda.Model;
34 |
35 | import org.apache.uima.UimaContext;
36 | import org.apache.uima.resource.ResourceInitializationException;
37 | import org.uimafit.component.JCasAnnotator_ImplBase;
38 | import org.uimafit.descriptor.ConfigurationParameter;
39 |
40 | /**
41 | * @author Martin Riedl
42 | */
43 | public abstract class GibbsLdaTopicModelAnnotator extends JCasAnnotator_ImplBase{
44 | public static final String PARAM_LDA_MODEL_DIR = "LdaModelDir";
45 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
46 | public static final String PARAM_LDA_INFERENCE_ITERATIONS = "LdaInferenceIterations";
47 | public static final String PARAM_LDA_INFERENCE_SAVE_PATH = "LdaInferenceSavePath";
48 |
49 |
50 |
51 | @ConfigurationParameter(name = PARAM_LDA_INFERENCE_SAVE_PATH, mandatory = false)
52 | private String ldaInferenceSavePath;
53 | private String ldaInferenceSaveName;
54 |
55 | public String getLdaInferenceSaveName() {
56 | return ldaInferenceSaveName;
57 | }
58 | public void setLdaInferenceSaveName(String ldaInferenceSaveName) {
59 | this.ldaInferenceSaveName = ldaInferenceSaveName;
60 | }
61 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIR, mandatory = true)
62 | private String ldaModelDir;
63 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
64 | private String ldaModelName;
65 | 	@ConfigurationParameter(name = PARAM_LDA_INFERENCE_ITERATIONS, mandatory = false, description = "Inference iterations used to build the topic distribution for a new model", defaultValue = "100")
66 | private int ldaInferenceIteration;
67 |
68 | private Inferencer inferencer;
69 |
70 | // public Model inference(String[] documents) {
71 | // Model m = inferencer.inference(documents);
72 | // if(ldaInferenceSavePath!=null){
73 | // m.dir = ldaInferenceSavePath;
74 | // m.saveModel("inference_"+ldaInferenceSaveName);
75 | // }
76 | // return m;
77 | // }
78 |
79 | public Model inference(List[] documents) {
80 | Model m = inferencer.inference(documents);
81 | if(ldaInferenceSavePath!=null){
82 | m.dir = ldaInferenceSavePath;
83 | m.saveModel("inference_"+ldaInferenceSaveName);
84 | }
85 | return m;
86 | }
87 | public int getInferenceNiters() {
88 | return inferencer.niters;
89 | }
90 |
91 | public ArrayList getInferenceModeValues() {
92 | return inferencer.values;
93 | }
94 |
95 | public Dictionary getInferencerGlobalDict(){
96 | return inferencer.globalDict;
97 | }
98 |
99 |
100 | @Override
101 | public void initialize(UimaContext context)
102 | throws ResourceInitializationException {
103 | super.initialize(context);
104 | LDACmdOption options = new LDACmdOption();
105 | options.dir = ldaModelDir;
106 | options.modelName = ldaModelName;
107 | options.niters = ldaInferenceIteration;
108 | 		// Initialize inferencer
109 | inferencer = new Inferencer();
110 | inferencer.init(options);
111 | }
112 |
113 |
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/README.txt:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------
2 | | TopicTiling |
3 | ----------------------------------------------------
4 |
5 | TopicTiling is an LDA-based text segmentation algorithm.
6 | This algorithm is based on the well-known TextTiling
7 | algorithm, and segments documents using the Latent
8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs
9 | the segmentation in linear time and thus is computationally
10 | less expensive than other LDA-based segmentation methods.
11 |
12 | USE:
13 |
14 | The tool has been developed and tested on Unix-based systems.
15 | As TopicTiling is written in Java, it should also run on Windows
16 | machines. To run TopicTiling, uncompress the zip file and execute
17 | topictiling.sh (Unix-based systems) or topictiling.bat
18 | (Windows-based systems). The output is given in an XML format
19 | with suggested topical boundaries.
20 |
21 | HINT FOR NON-LATIN LANGUAGES:
22 | If you want to process languages such as Chinese or Arabic with TopicTiling,
23 | you have to provide tokenized text (both for TopicTiling and GibbsLDA)
24 | and in addition use the flag -s, which disables the Stanford tokenization
25 | and instead uses a simple whitespace tokenizer that expects one sentence
26 | per line.
27 |
28 |
29 | The parameters of the script are shown when no parameters are given:
30 |
31 | [java] Option "-fd" is required
32 | [java] java -jar myprogram.jar [options...] arguments...
33 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used
34 | [java] (default false)
35 | [java]  -fd VAL   : Directory of the test files
36 | [java] -fp VAL : File pattern for the test files
37 | [java] -i N : Number of inference iterations used to annotate words with topic
38 | [java] IDs (default 100)
39 | [java] -m : Use mode counting (true/false) (default=true)
40 | [java] -out VAL : File the content is written to (otherwise stdout will be used)
41 | [java] -ri N : Use the repeated inference method
42 | [java] -rs N : Use the repeated segmentation
43 | [java] -s : Use simple segmentation (default=false)
44 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used)
45 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used)
46 | [java] -w N : Window size used to calculate the sentence similarity
47 |
48 | The parameters -fp, -fd, -tmd, and -tmn are the ones that have to be specified,
49 | and -ri should typically be set to about 5 repeated inferences.
50 |
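For example, a call could look as follows (the model directory and the input
directory are placeholders that have to be adapted to your setup; "model-final"
is the model name that GibbsLDA writes for its final iteration):

  sh topictiling.sh -tmd /path/to/lda_model -tmn model-final -fd /path/to/documents \
     -fp "*.txt" -ri 5 -out segmentation.xml
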
51 | For the algorithm it is important to have a trained LDA model. The model should
52 | be from a domain similar to the data you apply the algorithm to. You have to train
53 | it yourself using GibbsLDA++ or JGibbsLDA (http://gibbslda.sourceforge.net/). They
54 | both have the same output format. The output of the algorithm is given in XML
55 | and looks like:
56 | <documents>
57 | <document>
58 | <documentName>…</documentName>
59 | <segment>
60 | <depthScore>score</depthScore>
61 | <text>…</text>
62 | </segment>
63 | …
64 | </document>
65 | </documents>
66 | 
67 | The code returns all possible boundary positions (all maxima). If the number of
68 | segments is known, select the N highest depthScore values as boundary positions.
69 |
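A small sketch of such a selection is given below. It is only an illustration and not
part of TopicTiling (the file name result.xml and the class name are placeholders), and
it assumes that the depth scores appear in depthScore elements as in the example above;
it collects all depthScore values from the XML output and prints the N highest-scoring
boundary positions:

  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.Comparator;
  import java.util.List;
  import javax.xml.parsers.DocumentBuilderFactory;
  import org.w3c.dom.Document;
  import org.w3c.dom.NodeList;

  public class SelectTopBoundaries {
      // usage: java SelectTopBoundaries result.xml 5
      public static void main(String[] args) throws Exception {
          int n = Integer.parseInt(args[1]);
          Document doc = DocumentBuilderFactory.newInstance()
                  .newDocumentBuilder().parse(new java.io.File(args[0]));
          NodeList scores = doc.getElementsByTagName("depthScore");
          // collect (score, position) pairs; skip empty score elements
          List<double[]> candidates = new ArrayList<double[]>();
          for (int i = 0; i < scores.getLength(); i++) {
              String text = scores.item(i).getTextContent().trim();
              if (text.isEmpty()) continue;
              candidates.add(new double[] { Double.parseDouble(text), i });
          }
          // sort by depth score, highest first
          Collections.sort(candidates, new Comparator<double[]>() {
              public int compare(double[] a, double[] b) {
                  return Double.compare(b[0], a[0]);
              }
          });
          // the first n entries are the suggested boundary positions
          for (int i = 0; i < n && i < candidates.size(); i++) {
              System.out.println("boundary after segment " + (int) candidates.get(i)[1]
                      + " (depthScore " + candidates.get(i)[0] + ")");
          }
      }
  }
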
70 |
71 | LICENSE:
72 |
73 | The software is released under GPL 3.0
74 |
75 | PAPERS:
76 |
77 |
78 | Riedl, M., Biemann, C. (2012): Text Segmentation with Topic Models. Journal for Language Technology and Computational Linguistics (JLCL), Vol. 27, No. 1, pp. 47--70, August 2012 (pdf)
79 | Riedl M., Biemann C. (2012): How Text Segmentation Algorithms Gain from Topic Models, Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2012), Montreal, Canada. (pdf)
80 | Riedl M., Biemann C. (2012): TopicTiling: A Text Segmentation Algorithm based on LDA, Proceedings of the Student Research Workshop of the 50th Meeting of the Association for Computational Linguistics, Jeju, Republic of Korea. (pdf)
81 | Riedl M., Biemann C. (2012): Sweeping through the Topic Space: Bad luck? Roll again! In Proceedings of the Joint Workshop on Unsupervised and Semi-Supervised Learning in NLP held in conjunction with EACL 2012, Avignon, France (pdf)
82 |
83 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Wed Apr 11 15:17:37 CEST 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 | import org.apache.uima.jcas.cas.DoubleArray;
12 |
13 |
14 | /**
15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
17 | * @generated */
18 | public class TopicDistribution extends Annotation {
19 | /** @generated
20 | * @ordered
21 | */
22 | public final static int typeIndexID = JCasRegistry.register(TopicDistribution.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | public final static int type = typeIndexID;
27 | /** @generated */
28 | public int getTypeIndexID() {return typeIndexID;}
29 |
30 | /** Never called. Disable default constructor
31 | * @generated */
32 | protected TopicDistribution() {}
33 |
34 | /** Internal - constructor used by generator
35 | * @generated */
36 | public TopicDistribution(int addr, TOP_Type type) {
37 | super(addr, type);
38 | readObject();
39 | }
40 |
41 | /** @generated */
42 | public TopicDistribution(JCas jcas) {
43 | super(jcas);
44 | readObject();
45 | }
46 |
47 | /** @generated */
48 | public TopicDistribution(JCas jcas, int begin, int end) {
49 | super(jcas);
50 | setBegin(begin);
51 | setEnd(end);
52 | readObject();
53 | }
54 |
55 | /**
56 | * Write your own initialization here
57 | *
58 | @generated modifiable */
59 | private void readObject() {}
60 |
61 |
62 |
63 | //*--------------*
64 | //* Feature: topicDistribution
65 |
66 | /** getter for topicDistribution - gets
67 | * @generated */
68 | public DoubleArray getTopicDistribution() {
69 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));}
72 |
73 | /** setter for topicDistribution - sets
74 | * @generated */
75 | public void setTopicDistribution(DoubleArray v) {
76 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
78 | jcasType.ll_cas.ll_setRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));}
79 |
80 | /** indexed getter for topicDistribution - gets an indexed value -
81 | * @generated */
82 | public double getTopicDistribution(int i) {
83 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);}
87 |
88 | /** indexed setter for topicDistribution - sets an indexed value -
89 | * @generated */
90 | public void setTopicDistribution(int i, double v) {
91 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);}
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015
18 | * @generated */
19 | public class SegmentScore_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (SegmentScore_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = SegmentScore_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new SegmentScore(addr, SegmentScore_Type.this);
32 | SegmentScore_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new SegmentScore(addr, SegmentScore_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = SegmentScore.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.langtech.semantics.type.SegmentScore");
46 |
47 | /** @generated */
48 | final Feature casFeat_score;
49 | /** @generated */
50 | final int casFeatCode_score;
51 | /** @generated */
52 | public double getScore(int addr) {
53 | if (featOkTst && casFeat_score == null)
54 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
55 | return ll_cas.ll_getDoubleValue(addr, casFeatCode_score);
56 | }
57 | /** @generated */
58 | public void setScore(int addr, double v) {
59 | if (featOkTst && casFeat_score == null)
60 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
61 | ll_cas.ll_setDoubleValue(addr, casFeatCode_score, v);}
62 |
63 |
64 |
65 | /** @generated */
66 | final Feature casFeat_similarityScores;
67 | /** @generated */
68 | final int casFeatCode_similarityScores;
69 | /** @generated */
70 | public String getSimilarityScores(int addr) {
71 | if (featOkTst && casFeat_similarityScores == null)
72 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
73 | return ll_cas.ll_getStringValue(addr, casFeatCode_similarityScores);
74 | }
75 | /** @generated */
76 | public void setSimilarityScores(int addr, String v) {
77 | if (featOkTst && casFeat_similarityScores == null)
78 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
79 | ll_cas.ll_setStringValue(addr, casFeatCode_similarityScores, v);}
80 |
81 |
82 |
83 |
84 |
85 | /** initialize variables to correspond with Cas Type and Features
86 | * @generated */
87 | public SegmentScore_Type(JCas jcas, Type casType) {
88 | super(jcas, casType);
89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
90 |
91 |
92 | casFeat_score = jcas.getRequiredFeatureDE(casType, "score", "uima.cas.Double", featOkTst);
93 | casFeatCode_score = (null == casFeat_score) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_score).getCode();
94 |
95 |
96 | casFeat_similarityScores = jcas.getRequiredFeatureDE(casType, "similarityScores", "uima.cas.String", featOkTst);
97 | casFeatCode_similarityScores = (null == casFeat_similarityScores) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_similarityScores).getCode();
98 |
99 | }
100 | }
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 | import org.apache.uima.jcas.cas.DoubleArray;
12 |
13 |
14 | /**
15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
17 | * @generated */
18 | public class WordTopicDistribution extends Annotation {
19 | /** @generated
20 | * @ordered
21 | */
22 | public final static int typeIndexID = JCasRegistry.register(WordTopicDistribution.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | public final static int type = typeIndexID;
27 | /** @generated */
28 | public int getTypeIndexID() {return typeIndexID;}
29 |
30 | /** Never called. Disable default constructor
31 | * @generated */
32 | protected WordTopicDistribution() {}
33 |
34 | /** Internal - constructor used by generator
35 | * @generated */
36 | public WordTopicDistribution(int addr, TOP_Type type) {
37 | super(addr, type);
38 | readObject();
39 | }
40 |
41 | /** @generated */
42 | public WordTopicDistribution(JCas jcas) {
43 | super(jcas);
44 | readObject();
45 | }
46 |
47 | /** @generated */
48 | public WordTopicDistribution(JCas jcas, int begin, int end) {
49 | super(jcas);
50 | setBegin(begin);
51 | setEnd(end);
52 | readObject();
53 | }
54 |
55 | /**
56 | * Write your own initialization here
57 | *
58 | @generated modifiable */
59 | private void readObject() {}
60 |
61 |
62 |
63 | //*--------------*
64 | //* Feature: topicDistribution
65 |
66 | /** getter for topicDistribution - gets
67 | * @generated */
68 | public DoubleArray getTopicDistribution() {
69 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));}
72 |
73 | /** setter for topicDistribution - sets
74 | * @generated */
75 | public void setTopicDistribution(DoubleArray v) {
76 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
78 | jcasType.ll_cas.ll_setRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));}
79 |
80 | /** indexed getter for topicDistribution - gets an indexed value -
81 | * @generated */
82 | public double getTopicDistribution(int i) {
83 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);}
87 |
88 | /** indexed setter for topicDistribution - sets an indexed value -
89 | * @generated */
90 | public void setTopicDistribution(int i, double v) {
91 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);}
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Wed Apr 11 15:17:37 CEST 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class TopicDistribution_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (TopicDistribution_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = TopicDistribution_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new TopicDistribution(addr, TopicDistribution_Type.this);
31 | TopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new TopicDistribution(addr, TopicDistribution_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = TopicDistribution.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
43 |
44 | /** @generated */
45 | final Feature casFeat_topicDistribution;
46 | /** @generated */
47 | final int casFeatCode_topicDistribution;
48 | /** @generated */
49 | public int getTopicDistribution(int addr) {
50 | if (featOkTst && casFeat_topicDistribution == null)
51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution);
53 | }
54 | /** @generated */
55 | public void setTopicDistribution(int addr, int v) {
56 | if (featOkTst && casFeat_topicDistribution == null)
57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);}
59 |
60 | /** @generated */
61 | public double getTopicDistribution(int addr, int i) {
62 | if (featOkTst && casFeat_topicDistribution == null)
63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
64 | if (lowLevelTypeChecks)
65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true);
66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
68 | }
69 |
70 | /** @generated */
71 | public void setTopicDistribution(int addr, int i, double v) {
72 | if (featOkTst && casFeat_topicDistribution == null)
73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
74 | if (lowLevelTypeChecks)
75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true);
76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
77 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v);
78 | }
79 |
80 |
81 |
82 |
83 | /** initialize variables to correspond with Cas Type and Features
84 | * @generated */
85 | public TopicDistribution_Type(JCas jcas, Type casType) {
86 | super(jcas, casType);
87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
88 |
89 |
90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst);
91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode();
92 |
93 | }
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class WordTopicDistribution_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (WordTopicDistribution_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = WordTopicDistribution_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new WordTopicDistribution(addr, WordTopicDistribution_Type.this);
31 | WordTopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new WordTopicDistribution(addr, WordTopicDistribution_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = WordTopicDistribution.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
43 |
44 | /** @generated */
45 | final Feature casFeat_topicDistribution;
46 | /** @generated */
47 | final int casFeatCode_topicDistribution;
48 | /** @generated */
49 | public int getTopicDistribution(int addr) {
50 | if (featOkTst && casFeat_topicDistribution == null)
51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution);
53 | }
54 | /** @generated */
55 | public void setTopicDistribution(int addr, int v) {
56 | if (featOkTst && casFeat_topicDistribution == null)
57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);}
59 |
60 | /** @generated */
61 | public double getTopicDistribution(int addr, int i) {
62 | if (featOkTst && casFeat_topicDistribution == null)
63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
64 | if (lowLevelTypeChecks)
65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true);
66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
68 | }
69 |
70 | /** @generated */
71 | public void setTopicDistribution(int addr, int i, double v) {
72 | if (featOkTst && casFeat_topicDistribution == null)
73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
74 | if (lowLevelTypeChecks)
75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true);
76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
77 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v);
78 | }
79 |
80 |
81 |
82 |
83 | /** initialize variables to correspond with Cas Type and Features
84 | * @generated */
85 | public WordTopicDistribution_Type(JCas jcas, Type casType) {
86 | super(jcas, casType);
87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
88 |
89 |
90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst);
91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode();
92 |
93 | }
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/RunTopicTilingOnFile.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.uima.UIMAException;
6 | import org.apache.uima.analysis_engine.AnalysisEngine;
7 | import org.apache.uima.collection.CollectionReader;
8 | import org.apache.uima.resource.ResourceInitializationException;
9 | import org.kohsuke.args4j.CmdLineException;
10 | import org.kohsuke.args4j.CmdLineParser;
11 | import org.kohsuke.args4j.Option;
12 | import org.uimafit.factory.AnalysisEngineFactory;
13 | import org.uimafit.factory.CollectionReaderFactory;
14 | import org.uimafit.pipeline.SimplePipeline;
15 |
16 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments;
17 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;
18 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
19 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;
20 |
21 | public class RunTopicTilingOnFile {
22 |
23 | private static class Options {
24 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true)
25 | String topicModelDirectory;
26 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true)
27 | String topicModelName;
28 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false)
29 | boolean useDirectNeighbor=false;
30 | @Option(name="-i",usage="Number of inference iterations used to annotate words with topic IDs (default 100)",required=false)
31 | int inferenceIterations=100;
32 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false)
33 | boolean modeCounting=true;
34 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false)
35 | int windowSize=1;
36 | @Option(name="-ri",usage="Use the repeated inference method",required = false)
37 | int repeatedInference=1;
38 | @Option(name="-rs",usage="Use the repeated segmentation",required = false)
39 | int repeatedSegmentation=1;
40 | 		@Option(name="-fd",usage="Directory of the test files",required = true)
41 | public String fileDirectory;
42 | @Option(name="-fp",usage="File pattern for the test files",required = true)
43 | public String filePattern;
44 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false)
45 | public String output=null;
46 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true)
47 | // public String segmentNumber;
48 | }
49 |
50 | public static void main(final String[] args)
51 | throws ResourceInitializationException, UIMAException, IOException {
52 | Options options = new Options();
53 | CmdLineParser parser = new CmdLineParser(options);
54 | try {
55 | parser.parseArgument(args);
56 | } catch( CmdLineException e ) {
57 | System.err.println(e.getMessage());
58 | System.err.println("java -jar myprogram.jar [options...] arguments...");
59 | parser.printUsage(System.err);
60 | return;
61 | }
62 |
63 | new RunTopicTilingOnFile(options);
64 |
65 | }
66 |
67 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException {
68 | String neighbor = "HIGHEST_NEIGHBOR";
69 | if (opt.useDirectNeighbor)
70 | neighbor = "DIRECT_NEIGHBOR";
71 | 		final CollectionReader reader = CollectionReaderFactory.createCollectionReader(
72 | 				TextReader.class,
73 | 				TextReader.PARAM_PATH, opt.fileDirectory,
74 | 				TextReader.PARAM_PATTERNS,
75 | 				new String[] { "[+]" + opt.filePattern });
76 | 
77 | 
78 | 
79 |
80 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class);
81 | AnalysisEngine topicTiling = AnalysisEngineFactory
82 | .createPrimitive(
83 | TopicTilingSegmenterAnnotator.class,
84 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY,
85 | opt.topicModelDirectory,
86 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME,
87 | opt.topicModelName,
88 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION,
89 | opt.inferenceIterations,
90 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE,
91 | opt.repeatedInference,
92 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION,
93 | opt.repeatedSegmentation,
94 | TopicTilingSegmenterAnnotator.PARAM_WINDOW,
95 | opt.windowSize,
96 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE,
97 | neighbor,
98 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING,
99 | opt.modeCounting);
100 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output);
101 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments);
102 |
103 | }
104 |
105 | }
106 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Estimator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.io.File;
32 |
33 | public class Estimator {
34 |
35 | // output model
36 | protected Model trnModel;
37 | LDACmdOption option;
38 |
39 | public boolean init(LDACmdOption option){
40 | this.option = option;
41 | trnModel = new Model();
42 |
43 | if (option.est){
44 | if (!trnModel.initNewModel(option))
45 | return false;
46 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName);
47 | }
48 | else if (option.estc){
49 | if (!trnModel.initEstimatedModel(option))
50 | return false;
51 | }
52 |
53 | return true;
54 | }
55 |
56 | public void estimate(){
57 | 		System.out.println("Sampling " + trnModel.niters + " iterations!");
58 |
59 | int lastIter = trnModel.liter;
60 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){
61 | System.out.println("Iteration " + trnModel.liter + " ...");
62 |
63 | // for all z_i
64 | for (int m = 0; m < trnModel.M; m++){
65 | for (int n = 0; n < trnModel.data.docs[m].length; n++){
66 | // z_i = z[m][n]
67 | // sample from p(z_i|z_-i, w)
68 | int topic = sampling(m, n);
69 | trnModel.z[m].set(n, topic);
70 | }// end for each word
71 | }// end for each document
72 |
73 | if (option.savestep > 0){
74 | if (trnModel.liter % option.savestep == 0){
75 | System.out.println("Saving the model at iteration " + trnModel.liter + " ...");
76 | computeTheta();
77 | computePhi();
78 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5));
79 | }
80 | }
81 | }// end iterations
82 |
83 | System.out.println("Gibbs sampling completed!\n");
84 | System.out.println("Saving the final model!\n");
85 | computeTheta();
86 | computePhi();
87 | trnModel.liter--;
88 | trnModel.saveModel("model-final");
89 | }
90 |
91 | /**
92 | * Do sampling
93 | * @param m document number
94 | * @param n word number
95 | * @return topic id
96 | */
97 | public int sampling(int m, int n){
98 | // remove z_i from the count variable
99 | int topic = trnModel.z[m].get(n);
100 | int w = trnModel.data.docs[m].words[n];
101 |
102 | trnModel.nw[w][topic] -= 1;
103 | trnModel.nd[m][topic] -= 1;
104 | trnModel.nwsum[topic] -= 1;
105 | trnModel.ndsum[m] -= 1;
106 |
107 | double Vbeta = trnModel.V * trnModel.beta;
108 | double Kalpha = trnModel.K * trnModel.alpha;
109 |
110 | 			// do multinomial sampling via cumulative method
111 | for (int k = 0; k < trnModel.K; k++){
112 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + Vbeta) *
113 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha);
114 | }
115 |
116 | // cumulate multinomial parameters
117 | for (int k = 1; k < trnModel.K; k++){
118 | trnModel.p[k] += trnModel.p[k - 1];
119 | }
120 |
121 | // scaled sample because of unnormalized p[]
122 | double u = Math.random() * trnModel.p[trnModel.K - 1];
123 |
124 | for (topic = 0; topic < trnModel.K; topic++){
125 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p
126 | break;
127 | }
128 |
129 | // add newly estimated z_i to count variables
130 | trnModel.nw[w][topic] += 1;
131 | trnModel.nd[m][topic] += 1;
132 | trnModel.nwsum[topic] += 1;
133 | trnModel.ndsum[m] += 1;
134 |
135 | return topic;
136 | }
137 |
138 | public void computeTheta(){
139 | for (int m = 0; m < trnModel.M; m++){
140 | for (int k = 0; k < trnModel.K; k++){
141 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
142 | }
143 | }
144 | }
145 |
146 | public void computePhi(){
147 | for (int k = 0; k < trnModel.K; k++){
148 | for (int w = 0; w < trnModel.V; w++){
149 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
150 | }
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Dictionary.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 | package jgibbslda;
29 |
30 | import java.io.BufferedReader;
31 | import java.io.BufferedWriter;
32 | import java.io.FileInputStream;
33 | import java.io.FileOutputStream;
34 | import java.io.InputStreamReader;
35 | import java.io.OutputStreamWriter;
36 | import java.util.HashMap;
37 | import java.util.Iterator;
38 | import java.util.Map;
39 | import java.util.StringTokenizer;
40 |
41 | public class Dictionary {
42 | 	public Map<String, Integer> word2id;
43 | 	public Map<Integer, String> id2word;
44 |
45 | //--------------------------------------------------
46 | // constructors
47 | //--------------------------------------------------
48 |
49 | public Dictionary(){
50 | 		word2id = new HashMap<String, Integer>();
51 | 		id2word = new HashMap<Integer, String>();
52 | }
53 |
54 | //---------------------------------------------------
55 | // get/set methods
56 | //---------------------------------------------------
57 |
58 | public String getWord(int id){
59 | return id2word.get(id);
60 | }
61 |
62 | public Integer getID (String word){
63 | return word2id.get(word);
64 | }
65 |
66 | //----------------------------------------------------
67 | // checking methods
68 | //----------------------------------------------------
69 | /**
70 | * check if this dictionary contains a specified word
71 | */
72 | public boolean contains(String word){
73 | return word2id.containsKey(word);
74 | }
75 |
76 | public boolean contains(int id){
77 | return id2word.containsKey(id);
78 | }
79 | //---------------------------------------------------
80 | 	// manipulating methods
81 | //---------------------------------------------------
82 | /**
83 | * add a word into this dictionary
84 | * return the corresponding id
85 | */
86 | public int addWord(String word){
87 | if (!contains(word)){
88 | int id = word2id.size();
89 |
90 | word2id.put(word, id);
91 | id2word.put(id,word);
92 |
93 | return id;
94 | }
95 | else return getID(word);
96 | }
97 |
98 | //---------------------------------------------------
99 | // I/O methods
100 | //---------------------------------------------------
101 | /**
102 | * read dictionary from file
103 | */
104 | public boolean readWordMap(String wordMapFile){
105 | try{
106 | BufferedReader reader = new BufferedReader(new InputStreamReader(
107 | new FileInputStream(wordMapFile), "UTF-8"));
108 | String line;
109 |
110 | //read the number of words
111 | line = reader.readLine();
112 | int nwords = Integer.parseInt(line);
113 |
114 | //read map
115 | for (int i = 0; i < nwords; ++i){
116 | line = reader.readLine();
117 | StringTokenizer tknr = new StringTokenizer(line, " \t\n\r");
118 |
119 | if (tknr.countTokens() != 2) continue;
120 |
121 | String word = tknr.nextToken();
122 | String id = tknr.nextToken();
123 | int intID = Integer.parseInt(id);
124 |
125 | id2word.put(intID, word);
126 | word2id.put(word, intID);
127 | }
128 |
129 | reader.close();
130 | return true;
131 | }
132 | catch (Exception e){
133 | System.out.println("Error while reading dictionary:" + e.getMessage());
134 | e.printStackTrace();
135 | return false;
136 | }
137 | }
138 |
139 | public boolean writeWordMap(String wordMapFile){
140 | try{
141 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
142 | new FileOutputStream(wordMapFile), "UTF-8"));
143 |
144 | //write number of words
145 | writer.write(word2id.size() + "\n");
146 |
147 | //write word to id
148 | 			Iterator<String> it = word2id.keySet().iterator();
149 | while (it.hasNext()){
150 | String key = it.next();
151 | Integer value = word2id.get(key);
152 |
153 | writer.write(key + " " + value + "\n");
154 | }
155 |
156 | writer.close();
157 | return true;
158 | }
159 | catch (Exception e){
160 | System.out.println("Error while writing word map " + e.getMessage());
161 | e.printStackTrace();
162 | return false;
163 | }
164 |
165 |
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LogSaveEstimator.java:
--------------------------------------------------------------------------------
1 | package jgibbslda;
2 |
3 | /*
4 | * Copyright (C) 2007 by
5 | *
6 | * Xuan-Hieu Phan
7 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
8 | * Graduate School of Information Sciences
9 | * Tohoku University
10 | *
11 | * Cam-Tu Nguyen
12 | * ncamtu@gmail.com
13 | * College of Technology
14 | * Vietnam National University, Hanoi
15 | *
16 | * Martin Riedl
17 | * riedl@cs.tu-darmstadt.de
18 | * FG Language Technology
19 | * Technische Universität Darmstadt, Germany
20 | *
21 | * JGibbsLDA is a free software; you can redistribute it and/or modify
22 | * it under the terms of the GNU General Public License as published
23 | * by the Free Software Foundation; either version 2 of the License,
24 | * or (at your option) any later version.
25 | *
26 | * JGibbsLDA is distributed in the hope that it will be useful, but
27 | * WITHOUT ANY WARRANTY; without even the implied warranty of
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | * GNU General Public License for more details.
30 | *
31 | * You should have received a copy of the GNU General Public License
32 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
33 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
34 | */
35 |
36 |
37 | import java.io.File;
38 | import java.util.Arrays;
39 |
40 | public class LogSaveEstimator {
41 |
42 | // output model
43 | protected Model trnModel;
44 | LDACmdOption option;
45 | public int[] savesteps;
46 |
47 | public boolean init(LDACmdOption option){
48 | this.option = option;
49 | trnModel = new Model();
50 |
51 | if (option.est){
52 | if (!trnModel.initNewModel(option))
53 | return false;
54 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName);
55 | }
56 | else if (option.estc){
57 | if (!trnModel.initEstimatedModel(option))
58 | return false;
59 | }
60 |
61 | return true;
62 | }
63 |
64 | public void estimate(){
65 | System.out.println("Sampling " + trnModel.niters + " iteration!");
66 |
67 |
68 | int lastIter = trnModel.liter;
69 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){
70 | System.out.println("Iteration " + trnModel.liter + " ...");
71 |
72 | // for all z_i
73 | for (int m = 0; m < trnModel.M; m++){
74 |
75 | for (int n = 0; n < trnModel.data.docs[m].length; n++){
76 | // z_i = z[m][n]
77 | // sample from p(z_i|z_-i, w)
78 | int topic = sampling(m, n);
79 | trnModel.z[m].set(n, topic);
80 | }// end for each word
81 | }// end for each document
82 |
83 | // if (option.savestep > 0){
84 | if (Arrays.binarySearch(savesteps,trnModel.liter)>=0){
85 | System.out.println("Saving the model at iteration " + trnModel.liter + " ...");
86 | computeTheta();
87 | computePhi();
88 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5));
89 | }
90 | // }
91 | }// end iterations
92 |
93 | System.out.println("Gibbs sampling completed!\n");
94 | System.out.println("Saving the final model!\n");
95 | computeTheta();
96 | computePhi();
97 | trnModel.liter--;
98 | trnModel.saveModel("model-final");
99 | }
100 |
101 | /**
102 | * Do sampling
103 | * @param m document number
104 | * @param n word number
105 | * @return topic id
106 | */
107 | public int sampling(int m, int n){
108 | // remove z_i from the count variable
109 | int topic = trnModel.z[m].get(n);
110 | int w = trnModel.data.docs[m].words[n];
111 | trnModel.nw[w][topic] -= 1;
112 | trnModel.nd[m][topic] -= 1;
113 | trnModel.nwsum[topic] -= 1;
114 | trnModel.ndsum[m] -= 1;
115 |
116 | double Vbeta = trnModel.V * trnModel.beta;
117 | double Kalpha = trnModel.K * trnModel.alpha;
118 |
119 | 		//do multinomial sampling via cumulative method
120 | for (int k = 0; k < trnModel.K; k++){
121 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + Vbeta) *
122 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha);
123 | }
124 |
125 | // cumulate multinomial parameters
126 | for (int k = 1; k < trnModel.K; k++){
127 | trnModel.p[k] += trnModel.p[k - 1];
128 | }
129 |
130 | // scaled sample because of unnormalized p[]
131 | double u = Math.random() * trnModel.p[trnModel.K - 1];
132 |
133 | for (topic = 0; topic < trnModel.K; topic++){
134 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p
135 | break;
136 | }
137 |
138 | // add newly estimated z_i to count variables
139 | trnModel.nw[w][topic] += 1;
140 | trnModel.nd[m][topic] += 1;
141 | trnModel.nwsum[topic] += 1;
142 | trnModel.ndsum[m] += 1;
143 |
144 | return topic;
145 | }
146 |
147 | public void computeTheta(){
148 | for (int m = 0; m < trnModel.M; m++){
149 | for (int k = 0; k < trnModel.K; k++){
150 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
151 | }
152 | }
153 | }
154 |
155 | public void computePhi(){
156 | for (int k = 0; k < trnModel.K; k++){
157 | for (int w = 0; w < trnModel.V; w++){
158 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
159 | }
160 | }
161 | }
162 | }
163 |
164 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/RunTopicTilingOnFile.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
26 |
27 |
28 | import java.io.IOException;
29 |
30 | import org.apache.uima.UIMAException;
31 | import org.apache.uima.analysis_engine.AnalysisEngine;
32 | import org.apache.uima.collection.CollectionReader;
33 | import org.apache.uima.resource.ResourceInitializationException;
34 | import org.kohsuke.args4j.CmdLineException;
35 | import org.kohsuke.args4j.CmdLineParser;
36 | import org.kohsuke.args4j.Option;
37 | import org.uimafit.factory.AnalysisEngineFactory;
38 | import org.uimafit.factory.CollectionReaderFactory;
39 | import org.uimafit.pipeline.SimplePipeline;
40 |
41 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments;
42 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;
44 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
45 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;
46 |
47 | public class RunTopicTilingOnFile {
48 |
49 | private static class Options {
50 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true)
51 | String topicModelDirectory;
52 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true)
53 | String topicModelName;
54 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false)
55 | boolean useDirectNeighbor=false;
56 | @Option(name="-d",usage="Print debugging output (default false)",required=false)
57 | boolean debug=false;
58 | @Option(name="-i",usage="Number of inference iterations used to annotate words with topic IDs (default 100)",required=false)
59 | int inferenceIterations=100;
60 | @Option(name="-s",usage="Use simple segmentation (default=false)",required=false)
61 | boolean useSimpleSegmentation=false;
62 |
63 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false)
64 | boolean modeCounting=true;
65 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false)
66 | int windowSize=1;
67 | @Option(name="-ri",usage="Use the repeated inference method",required = false)
68 | int repeatedInference=1;
69 | @Option(name="-rs",usage="Use the repeated segmentation",required = false)
70 | int repeatedSegmentation=1;
71 | 		@Option(name="-fd",usage="Directory of the test files",required = true)
72 | public String fileDirectory;
73 | @Option(name="-fp",usage="File pattern for the test files",required = true)
74 | public String filePattern;
75 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false)
76 | public String output=null;
77 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true)
78 | // public String segmentNumber;
79 | }
80 |
81 | public static void main(final String[] args)
82 | throws ResourceInitializationException, UIMAException, IOException {
83 | Options options = new Options();
84 | CmdLineParser parser = new CmdLineParser(options);
85 | try {
86 | parser.parseArgument(args);
87 | } catch( CmdLineException e ) {
88 | System.err.println(e.getMessage());
89 | System.err.println("java -jar myprogram.jar [options...] arguments...");
90 | parser.printUsage(System.err);
91 | return;
92 | }
93 |
94 | new RunTopicTilingOnFile(options);
95 |
96 | }
97 |
98 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException {
99 | String neighbor = "HIGHEST_NEIGHBOR";
100 | if (opt.useDirectNeighbor)
101 | neighbor = "DIRECT_NEIGHBOR";
102 | final CollectionReader reader = CollectionReaderFactory.createCollectionReader(
103 | TextReader.class,
104 | TextReader.PARAM_PATH, opt.fileDirectory,
105 | TextReader.PARAM_PATTERNS, new String[] { "[+]" + opt.filePattern });
106 |
107 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class);
108 | if(opt.useSimpleSegmentation){
109 | segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
110 | }
111 | AnalysisEngine topicTiling = AnalysisEngineFactory
112 | .createPrimitive(
113 | TopicTilingSegmenterAnnotator.class,
114 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY,
115 | opt.topicModelDirectory,
116 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME,
117 | opt.topicModelName,
118 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION,
119 | opt.inferenceIterations,
120 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE,
121 | opt.repeatedInference,
122 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION,
123 | opt.repeatedSegmentation,
124 | TopicTilingSegmenterAnnotator.PARAM_WINDOW,
125 | opt.windowSize,
126 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE,
127 | neighbor,
128 | TopicTilingSegmenterAnnotator.PARAM_DEBUG,
129 | opt.debug,
130 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING,
131 | opt.modeCounting);
132 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output);
133 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments);
134 |
135 | }
136 |
137 | }
138 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Inferencer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * Martin Riedl
15 | * riedl@cs.tu-darmstadt.de
16 | * FG Language Technology
17 | * Technische Universität Darmstadt, Germany
18 | *
19 | * JGibbsLDA is a free software; you can redistribute it and/or modify
20 | * it under the terms of the GNU General Public License as published
21 | * by the Free Software Foundation; either version 2 of the License,
22 | * or (at your option) any later version.
23 | *
24 | * JGibbsLDA is distributed in the hope that it will be useful, but
25 | * WITHOUT ANY WARRANTY; without even the implied warranty of
26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 | * GNU General Public License for more details.
28 | *
29 | * You should have received a copy of the GNU General Public License
30 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
31 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
32 | */
33 |
34 | package jgibbslda;
35 |
36 | import java.util.ArrayList;
37 | import java.util.List;
38 |
39 | import org.apache.uima.UIMAFramework;
40 | import org.apache.uima.util.Level;
41 | import org.apache.uima.util.Logger;
42 |
43 |
44 |
45 |
46 | public class Inferencer {
47 | // Train model
48 | public Model trnModel;
49 | public Dictionary globalDict;
50 | private LDACmdOption option;
51 | 	public ArrayList<int[][]> values;
52 | private Model newModel;
53 | public int niters = 100;
54 | public static Logger logger = UIMAFramework.getLogger(Inferencer.class);
55 | //-----------------------------------------------------
56 | // Init method
57 | //-----------------------------------------------------
58 | public boolean init(LDACmdOption option){
59 | this.option = option;
60 | trnModel = new Model();
61 |
62 | if (!trnModel.initEstimatedModel(option))
63 | return false;
64 |
65 | globalDict = trnModel.data.localDict;
66 | computeTrnTheta();
67 | computeTrnPhi();
68 |
69 | return true;
70 | }
71 |
72 | //inference new model ~ getting data from a specified dataset
73 | public Model inference( LDADataset newData){
74 | logger.log(Level.FINE,"init new model");
75 | Model newModel = new Model();
76 |
77 | newModel.initNewModel(option, newData, trnModel);
78 | this.newModel = newModel;
79 |
80 | 		//initialize for repeated mode (RIEDL)
81 | 		values = new ArrayList<int[][]>();
82 | for (int doc = 0; doc < newModel.z.length; doc++) {
83 | values.add(new int[newModel.z[doc].size()][newModel.K]);
84 | }
85 |
86 | //-----------------------
87 | logger.log(Level.FINE,"Sampling " + niters + " iteration for inference!");
88 | // TopicTiling.printDim(newModel.z);
89 |
90 |
91 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){
92 | //System.out.println("Iteration " + newModel.liter + " ...");
93 |
94 | // for all newz_i
95 | for (int m = 0; m < newModel.M; ++m){//num of docs
96 | for (int n = 0; n < newModel.data.docs[m].length; n++){
97 | // (newz_i = newz[m][n]
98 | // sample from p(z_i|z_-1,w)
99 | int topic = infSampling(m, n);
100 | newModel.z[m].set(n, topic);
101 | //MR
102 | values.get(m)[n][topic]++;
103 | //END MR
104 | }
105 | }//end foreach new doc
106 |
107 | }// end iterations
108 |
109 |
110 | logger.log(Level.FINE,"Gibbs sampling for inference completed!");
111 |
112 | computeNewTheta();
113 | computeNewPhi();
114 | newModel.liter--;
115 |
116 | return this.newModel;
117 | }
118 |
119 | public Model inference(String [] strs){
120 | //System.out.println("inference");
121 | // Model newModel = new Model();
122 |
123 | //System.out.println("read dataset");
124 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict);
125 |
126 | return inference(dataset);
127 | }
128 |
129 | 	public Model inference(List<String>[] strs){
130 | //System.out.println("inference");
131 | // Model newModel = new Model();
132 |
133 | //System.out.println("read dataset");
134 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict);
135 |
136 | return inference(dataset);
137 | }
138 |
139 | //inference new model ~ getting dataset from file specified in option
140 | public Model inference(){
141 | //System.out.println("inference");
142 |
143 | newModel = new Model();
144 | if (!newModel.initNewModel(option, trnModel)) return null;
145 |
146 | logger.log(Level.INFO,"Sampling " + niters + " iteration for inference!");
147 |
148 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){
149 | //System.out.println("Iteration " + newModel.liter + " ...");
150 |
151 | // for all newz_i
152 | for (int m = 0; m < newModel.M; ++m){
153 | for (int n = 0; n < newModel.data.docs[m].length; n++){
154 | // (newz_i = newz[m][n]
155 | // sample from p(z_i|z_-1,w)
156 | int topic = infSampling(m, n);
157 | newModel.z[m].set(n, topic);
158 |
159 | }
160 | }//end foreach new doc
161 |
162 | }// end iterations
163 |
164 | logger.log(Level.FINE,"Gibbs sampling for inference completed!");
165 | logger.log(Level.FINE,"Saving the inference outputs!");
166 |
167 | computeNewTheta();
168 | computeNewPhi();
169 | newModel.liter--;
170 | newModel.saveModel(newModel.dfile + "." + newModel.modelName);
171 |
172 | return newModel;
173 | }
174 |
175 | /**
176 | * do sampling for inference
177 | * m: document number
178 | * n: word number?
179 | */
180 | protected int infSampling(int m, int n){
181 | // remove z_i from the count variables
182 | int topic = newModel.z[m].get(n);
183 | int _w = newModel.data.docs[m].words[n];
184 | int w = newModel.data.lid2gid.get(_w);
185 | newModel.nw[_w][topic] -= 1;
186 | newModel.nd[m][topic] -= 1;
187 | newModel.nwsum[topic] -= 1;
188 | newModel.ndsum[m] -= 1;
189 |
190 | double Vbeta = trnModel.V * newModel.beta;
191 | double Kalpha = trnModel.K * newModel.alpha;
192 |
193 | 		// do multinomial sampling via cumulative method
194 | for (int k = 0; k < newModel.K; k++){
195 | newModel.p[k] = (trnModel.nw[w][k] + newModel.nw[_w][k] + newModel.beta)/(trnModel.nwsum[k] + newModel.nwsum[k] + Vbeta) *
196 | (newModel.nd[m][k] + newModel.alpha)/(newModel.ndsum[m] + Kalpha);
197 | }
198 |
199 | 		// accumulate multinomial parameters
200 | for (int k = 1; k < newModel.K; k++){
201 | newModel.p[k] += newModel.p[k - 1];
202 | }
203 |
204 | // scaled sample because of unnormalized p[]
205 | double u = Math.random() * newModel.p[newModel.K - 1];
206 |
207 | for (topic = 0; topic < newModel.K; topic++){
208 | if (newModel.p[topic] > u)
209 | break;
210 | }
211 |
212 | // add newly estimated z_i to count variables
213 | newModel.nw[_w][topic] += 1;
214 | newModel.nd[m][topic] += 1;
215 | newModel.nwsum[topic] += 1;
216 | newModel.ndsum[m] += 1;
217 |
218 | return topic;
219 | }
220 |
221 | protected void computeNewTheta(){
222 | for (int m = 0; m < newModel.M; m++){
223 | for (int k = 0; k < newModel.K; k++){
224 | newModel.theta[m][k] = (newModel.nd[m][k] + newModel.alpha) / (newModel.ndsum[m] + newModel.K * newModel.alpha);
225 | }//end foreach topic
226 | }//end foreach new document
227 | }
228 |
229 | protected void computeNewPhi(){
230 | for (int k = 0; k < newModel.K; k++){
231 | for (int _w = 0; _w < newModel.V; _w++){
232 | Integer id = newModel.data.lid2gid.get(_w);
233 |
234 | if (id != null){
235 | newModel.phi[k][_w] = (trnModel.nw[id][k] + newModel.nw[_w][k] + newModel.beta) / (newModel.nwsum[k] + newModel.nwsum[k] + trnModel.V * newModel.beta);
236 | }
237 | }//end foreach word
238 | }// end foreach topic
239 | }
240 |
241 | protected void computeTrnTheta(){
242 | for (int m = 0; m < trnModel.M; m++){
243 | for (int k = 0; k < trnModel.K; k++){
244 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
245 | }
246 | }
247 | }
248 |
249 | protected void computeTrnPhi(){
250 | for (int k = 0; k < trnModel.K; k++){
251 | for (int w = 0; w < trnModel.V; w++){
252 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
253 | }
254 | }
255 | }
256 | }
257 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TopicTiling
2 |
3 |
4 |
5 | TopicTiling is an LDA-based text segmentation algorithm.
6 | The algorithm is inspired by the well-known [TextTiling](http://www.aclweb.org/anthology/J97-1003)
7 | algorithm developed by [Marti Hearst](http://people.ischool.berkeley.edu/~hearst/), and segments documents using the Latent
8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs
9 | the segmentation in linear time and thus is computationally
10 | less expensive than other LDA-based segmentation methods.
11 |
12 | I have moved the project from SourceForge to GitHub. While the code is still the same, I have updated the documentation on this page.
13 |
14 | For the LDA computation we use [JGibbLDA](http://jgibblda.sourceforge.net/) in a slightly modified version, which requires this project to be licensed under the GPL.
15 |
16 |
17 | Table of Contents
18 | ================
19 |
20 |
21 | * [Usage of the binaries](#usage-of-the-binaries)
22 | * [Usage for non-Latin languages](#usage-for-non-latin-languages)
23 | * [Usage of the source code](#usage-of-the-source-code)
24 | * [Compute a topic model](#compute-a-topic-model)
25 | * [Split output file by documents](#split-output-file-by-documents)
26 | * [Citation](#citation)
27 | * [License](#license)
28 |
29 |
30 |
31 |
32 | Usage of the binaries
33 | ===============
34 |
35 | The tool has been developed and tested on Unix-based systems.
36 | As TopicTiling is written in Java, it should also run on Windows
37 | machines.
38 |
39 | To start TopicTiling, you have to download the binary ([zip](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.zip)|[tar.gz](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.tar.gz)) and decompress the archive. To execute the segmentation method, open the command line and navigate to the decompressed folder:
40 |
41 | ```
42 | cd topictiling_v1.0
43 | ```
44 |
45 | We provide a batch script to start the segmentation on Windows:
46 | ```
47 | bash topictiling.bat
48 | ```
49 | and a shell script to start the segmentation on Unix-based operating systems:
50 | ```
51 | sh topictiling.sh
52 | ```
53 |
54 | These commands will output all parameters of TopicTiling:
55 |
56 |
57 | ```
58 | [java] Option "-fd" is required
59 | [java] java -jar myprogram.jar [options...] arguments...
60 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used
61 | [java] (default false)
62 |      [java]  -fd VAL  : Directory of the test files
63 | [java] -fp VAL : File pattern for the test files
64 | [java] -i N : Number of inference iterations used to annotate words with topic
65 | [java] IDs (default 100)
66 | [java] -m : Use mode counting (true/false) (default=true)
67 | [java] -out VAL : File the content is written to (otherwise stdout will be used)
68 | [java] -ri N : Use the repeated inference method
69 | [java] -rs N : Use the repeated segmentation
70 | [java] -s : Use simple segmentation (default=false)
71 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used)
72 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used)
73 | [java] -w N : Window size used to calculate the sentence similarity
74 | ```
75 |
76 | We recommend using mode counting (-m). In each inference iteration of LDA, a topicId is assigned to each word. In the default implementation this assignment is done via sampling, so a word may receive a different topicId in every inference step. To stabilize the topicId assignment, we store the topicId assigned in each inference iteration and, at the end, use the one that has been sampled most often, as sketched below.
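
The following sketch illustrates the idea only; it is not the exact code of the annotators, and all names are made up for the example. It tallies how often each topicId was sampled for a word position and keeps the most frequent one.

```
// Illustrative sketch of mode counting (not the project's exact code).
// counts[n][k]: how often topic k was sampled for word position n
// over all inference iterations.
class ModeCounting {
    static int[] modeTopicIds(int[][] counts) {
        int[] mode = new int[counts.length];
        for (int n = 0; n < counts.length; n++) {
            int best = 0;
            for (int k = 1; k < counts[n].length; k++) {
                if (counts[n][k] > counts[n][best]) {
                    best = k; // topic sampled more often so far
                }
            }
            mode[n] = best; // topicId sampled most often for this word position
        }
        return mode;
    }
}
```
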
77 |
78 | In order to test TopicTiling, you also need a topic model that has been computed with either [JGibbLDA](http://jgibblda.sourceforge.net/) or [GibbsLda++](http://gibbslda.sourceforge.net/). Instructions for computing one are given [here](#compute-a-topic-model).
79 |
80 | Once you have computed a topic model, you might have a folder called *topicmodel* with the following files:
81 | ```
82 | topicmodel/model-final.others
83 | topicmodel/model-final.phi
84 | topicmodel/model-final.tassign
85 | topicmodel/model-final.theta
86 | topicmodel/model-final.twords
87 | topicmodel/wordmap.txt
88 | ```
89 |
90 |
91 | For the segmentation, we advise repeating the inference five times (*-ri 5*) (see [paper](http://www.aclweb.org/anthology/W12-0703)). To start the segmentation, you can then use the following command, assuming that the files you want to segment are stored in the folder *files_to_segment* and have the file ending "txt":
92 |
93 | ```
94 | sh topictiling.sh -ri 5 -tmd topicmodel -tmn model-final -fp "*txt" -fd files_to_segment
95 | ```
96 |
97 | The output of the algorithm is in XML format:
98 |
99 | ```
100 |
101 | …
102 |
103 | score
104 | …
105 |
106 | …
107 |
108 |
109 | ```
110 |
111 | The code returns all maxima where a boundary might be set. If you know the number of segments, you can simply select the N segments with the highest depthScore values and ignore the remaining ones, e.g. with a small helper like the sketch below.
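
The following sketch is illustrative only; it assumes that the candidate boundary positions and their depthScore values have already been parsed from the XML output, and the class and method names are made up for the example.

```
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Illustrative sketch: keep only the n candidate boundaries with the highest depth scores.
class TopBoundaries {
    static List<Integer> select(List<Integer> positions, List<Double> scores, int n) {
        List<Integer> remainingPos = new ArrayList<Integer>(positions);
        List<Double> remainingScores = new ArrayList<Double>(scores);
        List<Integer> selected = new ArrayList<Integer>();
        while (selected.size() < n && !remainingPos.isEmpty()) {
            int best = 0; // index of the highest remaining depth score
            for (int j = 1; j < remainingScores.size(); j++) {
                if (remainingScores.get(j) > remainingScores.get(best)) {
                    best = j;
                }
            }
            selected.add(remainingPos.remove(best)); // take the highest-scoring boundary
            remainingScores.remove(best);
        }
        Collections.sort(selected); // restore document order
        return selected;
    }
}
```
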
112 |
113 |
114 | Usage for non-Latin languages
115 | ===============
116 | The current version uses the Stanford segmenter for tokenization. However, this tokenizer does not work well for languages that do not use Latin characters (e.g. Chinese, Arabic, Hebrew, Japanese). To segment texts in such languages, tokenize them beforehand and use the parameter *-s*, which disables the built-in tokenization and expects all words to be separated by white space, as in the example below.
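
For example, assuming the pre-tokenized files are again stored in *files_to_segment* and end in "txt", the call only differs from the one above by the additional *-s* flag:

```
sh topictiling.sh -s -ri 5 -tmd topicmodel -tmn model-final -fp "*txt" -fd files_to_segment
```
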
117 |
118 | Usage of the source code
119 | ===============
120 | Import both projects into Eclipse. The LDA project contains JGibbLDA with slight modifications so that the topic mode can be computed. Additionally, it contains UIMA annotators, so it can be used within a UIMA pipeline. The project also has dependencies on DKPro and uimafit. To run the TopicTiling algorithm, execute the class TopicTilingTopicDocument.
121 |
122 | Compute a topic model
123 | ===============
124 |
125 | To compute the topic model with LDA, documents are required that represent the domain of the texts the segmentation method will be applied to. For the computation you can use either [JGibbLDA](http://jgibblda.sourceforge.net/) (written in Java) or the faster C++ version [GibbsLda++](http://gibbslda.sourceforge.net/). To get an impression of how the different LDA parameters behave, have a look at our paper: [Sweeping through the Topic Space: Bad luck? Roll again!](http://www.aclweb.org/anthology/W12-0703). In general, we would advise training a topic model with 100 topics, alpha set to 50/(number of topics), and beta set to 0.01; an example call is sketched below.
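
A hedged example call with the original JGibbLDA distribution could look as follows. It assumes the standard JGibbLDA options (*-est*, *-dir*, *-dfile*, *-ntopics*, *-alpha*, *-beta*, *-niters*) and an input file *corpus.gibbslda* in the folder *topicmodel*, whose first line contains the number of documents followed by one whitespace-tokenized document per line; the jar, folder, and file names are placeholders:

```
java -cp jgibblda.jar jgibblda.LDA -est -dir topicmodel -dfile corpus.gibbslda -ntopics 100 -alpha 0.5 -beta 0.01 -niters 1000
```

The resulting model files (model-final.*, wordmap.txt) in *topicmodel* can then be passed to TopicTiling via *-tmd* and *-tmn* as shown above.
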
126 |
127 |
128 | Split output file by documents
129 | ===============
130 |
131 | The standard TopicTiling method returns one file containing the segments of all documents. If you want one file with segments per document, you can use the Python script *split_output.py* in the repository. For this, the output of TopicTiling should be redirected to a file (e.g. *output_file*). The script expects two parameters: the output file of TopicTiling (*output_file*) and a folder that will be created and in which all single-document files are stored (*output_folder*):
132 | ```
133 | python split_output.py output_file output_folder
134 | ```
135 |
136 |
137 | Citation
138 | ===============
139 | If you use TopicTiling, please cite one of the following publications:
140 |
141 | ```
142 |
143 | @article{Riedl:jlcl,
144 | author = {Martin Riedl and Chris Biemann},
145 | title = {{Text Segmentation with Topic Models }},
146 | journal = {Journal for Language Technology and Computational Linguistics (JLCL)},
147 | year={2012},
148 | volume={27},
149 | number={47-69},
150 | pages={13-24},
151 | url={http://www.jlcl.org/2012_Heft1/jlcl2012-1-3.pdf}
152 | }
153 |
154 | @inproceedings{riedl12_acl,
155 | author = {Martin Riedl and Chris Biemann},
156 | title = {TopicTiling: A Text Segmentation Algorithm based on LDA},
157 | year = {2012},
158 | address = {Jeju, Republic of Korea},
159 | booktitle = {Proceedings of the Student Research Workshop of the 50th Meeting of the Association for
160 | Computational Linguistics},
161 | pages = {37--42},
162 | url={http://www.aclweb.org/anthology/W12-3307},
163 | }
164 |
165 | ```
166 |
167 |
168 |
169 | License
170 | ===============
171 | As JGibbLDA is published under the GPL 2.0 license and its code is contained in the current repository, I had to license TopicTiling under the GPL as well.
172 |
173 | TopicTiling is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation.
174 |
175 | TopicTiling is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
176 |
177 |
178 |
179 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingDocumentSegmenterAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
25 |
26 | import java.text.DecimalFormat;
27 | import java.util.ArrayList;
28 | import java.util.Iterator;
29 | import java.util.List;
30 |
31 | import org.apache.uima.UimaContext;
32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
33 | import org.apache.uima.jcas.JCas;
34 | import org.apache.uima.resource.ResourceInitializationException;
35 | import org.uimafit.component.JCasAnnotator_ImplBase;
36 | import org.uimafit.descriptor.ConfigurationParameter;
37 | import org.uimafit.util.JCasUtil;
38 |
39 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized;
40 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTilingTopicDocument;
41 | import de.tudarmstadt.langtech.semantics.type.Segment;
42 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity;
43 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
44 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
45 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
46 |
47 | public class TopicTilingDocumentSegmenterAnnotator
48 | extends JCasAnnotator_ImplBase {
49 | private boolean printSegments = false;
50 |
51 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory";
52 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
53 | public static final String PARAM_WINDOW = "TopicTilingWindow";
54 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation";
55 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration";
56 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference";
57 |
58 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true)
59 | private String ldaModelDirectory;
60 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
61 | private String ldaModelName;
62 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true)
63 | private int window;
64 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true)
65 | private int repeatInferences;
66 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true)
67 | private int repeatSegmentation;
68 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true)
69 | private int inferenceIteration;
70 |
71 |
72 |
73 | @Override
74 | public void initialize(UimaContext context)
75 | throws ResourceInitializationException {
76 | super.initialize(context);
77 |
78 | }
79 |
80 | @Override
81 | public void process(JCas jcas)
82 | throws AnalysisEngineProcessException {
83 |
84 | 		List<List<Token>> s = new ArrayList<List<Token>>();
85 |
86 | // int i = 0;
87 | 		Iterator<Segment> segments = JCasUtil.select(jcas, Segment.class)
88 | .iterator();
89 | Segment seg = null;
90 | if (segments.hasNext())
91 | seg = segments.next();
92 |
93 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) {
94 |
95 | s.add(JCasUtil.selectCovered(Token.class, ss));
96 |
97 | }
98 |
99 | TopicTilingTopicDocument tttd ;
100 |
101 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) {
102 |
103 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInferences, inferenceIteration);
104 | } else {
105 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class)
106 | .iterator().next().getSegmentCount();
107 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInferences, inferenceIteration,segNum);
108 | }
109 |
110 |
111 | 		List<Integer> segmentPositions = tttd.segment(s);
112 | // print(jcas,segmentPositions);
113 | // printRcode(jcas, segmentCounts, wtt2, segmentPositionsWnew);
114 | annotateSegments(jcas, segmentPositions);
115 | }
116 |
117 | private void printRcode(JCas jcas, int segmentCount,
118 | 			TextTilingWindowOptimized tt, List<Integer> segments) {
119 | // if (!printRcode)
120 | // return;
121 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
122 |
123 | String main = metaData.getDocumentTitle()
124 | + ": Cosine Similarity between sentences ";
125 | if (segmentCount < 0)
126 | main = main + " (segments given: " + segmentCount + ")";
127 | StringBuffer buffer = new StringBuffer();
128 | buffer.append("#Cosine Similarity\n");
129 | buffer.append("pdf(file='" + metaData.getDocumentTitle()
130 | + ".pdf',20,7);\n");
131 | buffer.append(toListInR(tt.similarityScores, "cos"));
132 | buffer.append(toListInR(segments, "estSeg"));
133 | buffer.append(toListInR(getGoldSegments(jcas), "seg"));
134 | buffer.append(toListInR(tt.minimaPosition, "canSeg"));
135 | buffer.append(toListInR(tt.depthScores, "depth"));
136 | buffer.append("plot(0:"
137 | + (tt.similarityScores.size() - 1)
138 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='"
139 | + main + "');\n");
140 | buffer.append("abline(v=seg,col='red',lty=5);\n");
141 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n");
142 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n");
143 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size()
144 | + "),col='green',pch=22);\n");
145 | buffer.append("points(canSeg,rep(max(cos)*0.9,"
146 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n");
147 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length="
148 | + tt.depthScores.size() + "),labels=depth);\n");
149 | buffer.append("dev.off();dev.off()");
150 | System.out.println(buffer.toString());
151 |
152 | }
153 |
154 | 	private List<Integer> getGoldSegments(JCas jcas) {
155 |
156 | 		List<Integer> ret = new ArrayList<Integer>();
157 | 		Iterator<Segment> segIt = JCasUtil.iterator(jcas, Segment.class);
158 | int sentenceCount = -1;
159 | while (segIt.hasNext()) {
160 | Segment seg = segIt.next();
161 | for (Sentence s : JCasUtil.selectCovered(jcas, Sentence.class, seg)) {
162 | sentenceCount++;
163 | }
164 | ret.add(sentenceCount);
165 | }
166 | return ret;
167 | }
168 |
169 | 	private <T> StringBuffer toListInR(List<T> list, String name) {
170 | StringBuffer buffer = new StringBuffer();
171 | buffer.append(name);
172 | buffer.append("=c(");
173 | for (T sc : list) {
174 | if (sc instanceof Double) {
175 | DecimalFormat df = new DecimalFormat("#.##");
176 | buffer.append(df.format(sc).replace(",", "."));
177 | } else {
178 | buffer.append(sc);
179 | }
180 | buffer.append(",");
181 | }
182 | if (list.size() > 0)
183 | buffer.deleteCharAt(buffer.length() - 1);
184 | buffer.append(");\n");
185 | return buffer;
186 | }
187 |
188 |
189 | /**
190 | 	 * expects a list with the sentence numbers that mark the last sentence of each segment
191 | *
192 | * @param jcas
193 | * @param sentenceBreaks
194 | */
195 | 	private void annotateSegments(JCas jcas, List<Integer> sentenceBreaks) {
196 | 		Iterator<Sentence> sentenceItr = JCasUtil
197 | 				.iterator(jcas, Sentence.class);
198 | int sentenceCount = -1;
199 | int prevBreak = 0;
200 | if (printSegments) {
201 | System.out.println("Annotated Segments");
202 | System.out.println(sentenceBreaks.toString());
203 | }
204 |
205 | for (final int sBreak : sentenceBreaks) {
206 | final Segment seg = new Segment(jcas);
207 |
208 | Sentence segmentSentence = null;
209 |
210 | int beginOffset = 0;
211 | int endOffset = 0;
212 |
213 | // move sentenceItr to last sentence in segment
214 | for (; sentenceCount < sBreak; sentenceCount++) {
215 | segmentSentence = sentenceItr.next();
216 |
217 | if (sentenceCount == prevBreak) {
218 | beginOffset = segmentSentence.getBegin();
219 | }
220 | }
221 |
222 | if (segmentSentence != null) {
223 | endOffset = segmentSentence.getEnd();
224 | }
225 |
226 | seg.setBegin(beginOffset);
227 | seg.setEnd(endOffset);
228 | seg.addToIndexes();
229 |
230 | if (printSegments) {
231 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
232 | + beginOffset + "\t" + endOffset);
233 | }
234 | prevBreak = sBreak;
235 | }
236 | }
237 | }
238 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 |
28 | import java.util.ArrayList;
29 | import java.util.HashMap;
30 | import java.util.List;
31 | import java.util.Random;
32 |
33 | import jgibbslda.Model;
34 |
35 | import org.apache.uima.UIMAFramework;
36 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
37 | import org.apache.uima.jcas.JCas;
38 | import org.apache.uima.jcas.cas.DoubleArray;
39 | import org.apache.uima.util.Level;
40 | import org.apache.uima.util.Logger;
41 | import org.uimafit.descriptor.ConfigurationParameter;
42 |
43 | import de.tudarmstadt.langtech.lda.type.Topic;
44 | import de.tudarmstadt.langtech.lda.type.TopicDistribution;
45 | import de.tudarmstadt.langtech.lda.type.WordTopicDistribution;
46 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
47 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
48 |
49 | public abstract class GibbsLdaTopicIdAnnotator extends
50 | GibbsLdaTopicModelAnnotator {
51 | public static final String PARAM_LDA_REPEAT_INFERENCE = "LdaRepeatInference";
52 | public static final String PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION = "LdaAnnotateDocumentTopicDistribution";
53 | public static final String PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION = "LdaAnnotateWordTopicDistribution";
54 |
55 | private static final Logger log = UIMAFramework
56 | .getLogger(GibbsLdaTopicIdAnnotator.class);
57 | @ConfigurationParameter(name = PARAM_LDA_REPEAT_INFERENCE, mandatory = false, defaultValue = "1")
58 | private int ldaRepeatInference;
59 |
60 | @ConfigurationParameter(name = PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false")
61 | private boolean ldaAnnotateDocumentTopicDistribution = false;
62 |
63 | @ConfigurationParameter(name = PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false")
64 | private boolean ldaAnnotateWordTopicDistribution = false;
65 |
66 | /**
67 | * Function iterates over all tokens and assigns a topic ID. This can only
68 | * be performed, when the token is within the model.
69 | *
70 | * @param jcas
71 | * @param z
72 | */
73 |
74 | 	private void annotateTokenWithTopicId(JCas jcas, List<Integer>[] modelZ,
75 | 			List<Integer>[] modelModeZ, List<String>[] documents) {
76 |
77 | int si = 0;
78 | int ti = 0;
79 | int zti = 0;
80 | int actDocumentSize = 0;
81 | 		List<String> wordTokens = null;
82 | StringBuffer output = new StringBuffer();
83 |
84 | if (documents.length > 0) {
85 | wordTokens = documents[0];
86 | actDocumentSize = wordTokens.size();
87 | }
88 |
89 | for (Token t : select(jcas, Token.class)) {
90 | if (zti >= actDocumentSize) {
91 | ti = 0;
92 | zti = 0;
93 | si++;
94 | wordTokens = documents[si];
95 | actDocumentSize = wordTokens.size();
96 | }
97 | String token = t.getCoveredText();
98 | assert token.equals(wordTokens.get(zti));
99 | // System.out.print("indices: " + si + "\t" + ti + "\tsize: "
100 | // + modelZ[si].size() + " " + modelModeZ[si].size());
101 | // System.out.println("\t" + token + " "+ wordTokens.get(zti));
102 | if (getInferencerGlobalDict().word2id.containsKey(token)) {
103 | int topicId = modelZ[si].get(ti);
104 | int topicModeId = modelModeZ[si].get(ti);
105 | Topic topic = new Topic(jcas, t.getBegin(), t.getEnd());
106 | topic.setTopicId(topicId);
107 | topic.setTopicModeId(topicModeId);
108 | topic.addToIndexes();
109 |
110 | ti++;
111 |
112 | output.append(token).append(":").append(topicId).append(":")
113 | .append(topicModeId);
114 |
115 | } else {
116 | output.append(token).append(":NA");
117 | }
118 | output.append(" ");
119 | zti++;
120 |
121 | }
122 | log.log(Level.FINE, output.toString());
123 | }
124 |
125 | @Override
126 | public void process(JCas jcas) throws AnalysisEngineProcessException {
127 | 		final List<String>[] documents = getDocuments(jcas);
128 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
129 | super.setLdaInferenceSaveName(metaData.getDocumentTitle());
130 | Model m = inference(documents);
131 | // if no inference is repeated z contains the topic IDs that are used
132 | 		List<Integer>[] modelZ = m.z;
133 | 		List<Integer>[] modelModeZ;
134 |
135 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(),
136 | documents, getInferenceNiters(), 1);
137 | if (ldaRepeatInference > 1) {
138 | // initialize save structure for word wise topic stabilization
139 | 			ArrayList<int[][]> values = new ArrayList<int[][]>();
140 | for (int k = 0; k < documents.length; k++) {
141 | values.add(new int[modelZ[k].size()][m.K]);
142 | }
143 | for (int k = 1; k < ldaRepeatInference; k++) {
144 | for (int p = 0; p < documents.length; p++) {
145 | for (int t = 0; t < modelZ[p].size(); t++) {
146 | int topic = modelZ[p].get(t);
147 | values.get(p)[t][topic]++;
148 | }
149 | }
150 | m = inference(documents);
151 | modelZ = m.z;
152 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(),
153 | documents, getInferenceNiters(), 1);
154 | }
155 | }
156 | annotateTokenWithTopicId(jcas, modelZ, modelModeZ, documents);
157 | if (ldaAnnotateDocumentTopicDistribution)
158 | annotateDocumentsWithTopicDistribution(jcas, documents, m);
159 | if(ldaAnnotateWordTopicDistribution)
160 | annotateWordsWithTopicDistribution(jcas,m);
161 | }
162 |
163 | private void annotateWordsWithTopicDistribution(JCas jcas, Model m) {
164 |
165 | HashMap map = new HashMap();
166 | for(int wi =0;wi< m.phi.length;wi++){
167 | double[] topics=m.phi[wi];
168 | String word = getInferencerGlobalDict().id2word.get(wi);
169 | DoubleArray arr = new DoubleArray(jcas, topics.length);
170 | for(int ti=0;ti[] documents, Model m) {
189 | int si = 0;
190 | int ti = 0;
191 | int start = -1;
192 | int docSize = documents[si].size();
193 | for (Token t : select(jcas, Token.class)) {
194 | if (start < 0) {
195 | docSize = documents[si].size();
196 | start = t.getBegin();
197 | }
198 | ti++;
199 | if (ti == docSize) {
200 | TopicDistribution td = new TopicDistribution(jcas, start,
201 | t.getEnd());
202 | start = -1;
203 | DoubleArray arr = new DoubleArray(jcas, m.K);
204 | for (int i = 0; i < m.theta[si].length; i++) {
205 | arr.set(i, m.theta[si][i]);
206 | }
207 | td.setTopicDistribution(arr);
208 | td.addToIndexes();
209 |
210 | si++;
211 |
212 | ti = 0;
213 | }
214 |
215 | }
216 | }
217 |
218 | 	private List<Integer>[] getTopicListFromRepeated(ArrayList<int[][]> values,
219 | 			List<String>[] partsArray, int max, int min) {
220 | 		@SuppressWarnings("unchecked")
221 | 		List<Integer>[] newZ = new ArrayList[values.size()];
222 | Random r = new Random();
223 | for (int s = 0; s < values.size(); s++) {
224 | int[][] sentence = values.get(s);
225 | 			newZ[s] = new ArrayList<Integer>();
226 | for (int t = 0; t < sentence.length; t++) {
227 | 				List<Integer> candidates = getTopicCandidates(sentence[t], max,
228 | min);
229 | if (candidates.size() > 0) {
230 | int topic = candidates.get(r.nextInt(candidates.size()));
231 | newZ[s].add(topic);
232 | } else {
233 | System.out.println("No Candidates found");
234 |
235 | System.out.println();
236 | }
237 |
238 | }
239 |
240 | }
241 | return newZ;
242 |
243 | }
244 |
245 | 	private List<Integer> getTopicCandidates(int[] topics, int max, int min) {
246 | 		ArrayList<Integer> candidates = new ArrayList<Integer>();
247 | for (int m = max; m >= min; m--) {
248 |
249 | for (int t = 0; t < topics.length; t++) {
250 | if (topics[t] == m) {
251 | candidates.add(t);
252 | }
253 | }
254 | if (candidates.size() > 0) {
255 | return candidates;
256 | }
257 | }
258 | 		return new ArrayList<Integer>();
259 | }
260 |
261 | 	public abstract List<String>[] getDocuments(JCas jcas);
262 | }
263 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDADataset.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 | package jgibbslda;
29 |
30 | import java.io.BufferedReader;
31 | import java.io.FileInputStream;
32 | import java.io.InputStreamReader;
33 | import java.util.HashMap;
34 | import java.util.List;
35 | import java.util.Map;
36 | import java.util.Vector;
37 |
38 | public class LDADataset {
39 | //---------------------------------------------------------------
40 | // Instance Variables
41 | //---------------------------------------------------------------
42 |
43 | public Dictionary localDict; // local dictionary
44 | public Document [] docs; // a list of documents
45 | public int M; // number of documents
46 | public int V; // number of words
47 |
48 | // map from local coordinates (id) to global ones
49 | // null if the global dictionary is not set
50 | 	public Map<Integer, Integer> lid2gid;
51 |
52 | //link to a global dictionary (optional), null for train data, not null for test data
53 | public Dictionary globalDict;
54 |
55 | //--------------------------------------------------------------
56 | // Constructor
57 | //--------------------------------------------------------------
58 | public LDADataset(){
59 | localDict = new Dictionary();
60 | M = 0;
61 | V = 0;
62 | docs = null;
63 |
64 | globalDict = null;
65 | lid2gid = null;
66 | }
67 |
68 | public LDADataset(int M){
69 | localDict = new Dictionary();
70 | this.M = M;
71 | this.V = 0;
72 | docs = new Document[M];
73 |
74 | globalDict = null;
75 | lid2gid = null;
76 | }
77 |
78 | public LDADataset(int M, Dictionary globalDict){
79 | localDict = new Dictionary();
80 | this.M = M;
81 | this.V = 0;
82 | docs = new Document[M];
83 |
84 | this.globalDict = globalDict;
85 | 		lid2gid = new HashMap<Integer, Integer>();
86 | }
87 |
88 | //-------------------------------------------------------------
89 | //Public Instance Methods
90 | //-------------------------------------------------------------
91 | /**
92 | * set the document at the index idx if idx is greater than 0 and less than M
93 | * @param doc document to be set
94 | * @param idx index in the document array
95 | */
96 | public void setDoc(Document doc, int idx){
97 | if (0 <= idx && idx < M){
98 | docs[idx] = doc;
99 | }
100 | }
101 | /**
102 | * set the document at index idx if idx is greater than or equal to 0 and less than M
103 | * @param str string contains doc
104 | * @param idx index in the document array
105 | */
106 | public void setDoc(String str, int idx){
107 | if (0 <= idx && idx < M){
108 | String [] words = str.split("[ \\t\\n]");
109 |
110 | Vector<Integer> ids = new Vector<Integer>();
111 |
112 | for (String word : words){
113 | int _id = localDict.word2id.size();
114 |
115 | if (localDict.contains(word))
116 | _id = localDict.getID(word);
117 |
118 | if (globalDict != null){
119 | //get the global id
120 | Integer id = globalDict.getID(word);
121 | //System.out.println(id);
122 |
123 | if (id != null){
124 | localDict.addWord(word);
125 |
126 | lid2gid.put(_id, id);
127 | ids.add(_id);
128 | }
129 | else { //not in global dictionary
130 | //do nothing currently
131 | }
132 | }
133 | else {
134 | localDict.addWord(word);
135 | ids.add(_id);
136 | }
137 | }
138 |
139 | Document doc = new Document(ids, str);
140 | docs[idx] = doc;
141 | V = localDict.word2id.size();
142 | }
143 | }
144 |
145 |
146 | public void setDoc(List<String> words, int idx){
147 | String str = "";
148 | if (0 <= idx && idx < M){
149 |
150 | Vector<Integer> ids = new Vector<Integer>();
151 |
152 | for (String word : words){
153 | str+=word+" ";
154 | int _id = localDict.word2id.size();
155 |
156 | if (localDict.contains(word))
157 | _id = localDict.getID(word);
158 |
159 | if (globalDict != null){
160 | //get the global id
161 | Integer id = globalDict.getID(word);
162 | //System.out.println(id);
163 |
164 | if (id != null){
165 | localDict.addWord(word);
166 |
167 | lid2gid.put(_id, id);
168 | ids.add(_id);
169 | }
170 | else { //not in global dictionary
171 | //do nothing currently
172 | }
173 | }
174 | else {
175 | localDict.addWord(word);
176 | ids.add(_id);
177 | }
178 | }
179 |
180 | Document doc = new Document(ids, str);
181 | docs[idx] = doc;
182 | V = localDict.word2id.size();
183 | }
184 | }
185 | //---------------------------------------------------------------
186 | // I/O methods
187 | //---------------------------------------------------------------
188 |
189 | /**
190 | * read a dataset from a file, create new dictionary
191 | * @return dataset if success and null otherwise
192 | */
193 | public static LDADataset readDataSet(String filename){
194 | try {
195 | BufferedReader reader = new BufferedReader(new InputStreamReader(
196 | new FileInputStream(filename), "UTF-8"));
197 |
198 | LDADataset data = readDataSet(reader);
199 |
200 | reader.close();
201 | return data;
202 | }
203 | catch (Exception e){
204 | System.out.println("Read Dataset Error: " + e.getMessage());
205 | e.printStackTrace();
206 | return null;
207 | }
208 | }
209 |
210 | /**
211 | * read a dataset from a file with a preknown vocabulary
212 | * @param filename file from which we read dataset
213 | * @param dict the dictionary
214 | * @return dataset if success and null otherwise
215 | */
216 | public static LDADataset readDataSet(String filename, Dictionary dict){
217 | try {
218 | BufferedReader reader = new BufferedReader(new InputStreamReader(
219 | new FileInputStream(filename), "UTF-8"));
220 | LDADataset data = readDataSet(reader, dict);
221 |
222 | reader.close();
223 | return data;
224 | }
225 | catch (Exception e){
226 | System.out.println("Read Dataset Error: " + e.getMessage());
227 | e.printStackTrace();
228 | return null;
229 | }
230 | }
231 |
232 | /**
233 | * read a dataset from a stream, create new dictionary
234 | * @return dataset if success and null otherwise
235 | */
236 | public static LDADataset readDataSet(BufferedReader reader){
237 | try {
238 | // read number of documents
239 | String line;
240 | line = reader.readLine();
241 | int M = Integer.parseInt(line);
242 |
243 | LDADataset data = new LDADataset(M);
244 | for (int i = 0; i < M; ++i){
245 | line = reader.readLine();
246 |
247 | data.setDoc(line, i);
248 | }
249 |
250 | return data;
251 | }
252 | catch (Exception e){
253 | System.out.println("Read Dataset Error: " + e.getMessage());
254 | e.printStackTrace();
255 | return null;
256 | }
257 | }
258 |
259 | /**
260 | * read a dataset from a stream with respect to a specified dictionary
261 | * @param reader stream from which we read dataset
262 | * @param dict the dictionary
263 | * @return dataset if success and null otherwise
264 | */
265 | public static LDADataset readDataSet(BufferedReader reader, Dictionary dict){
266 | try {
267 | // read number of documents
268 | String line;
269 | line = reader.readLine();
270 | int M = Integer.parseInt(line);
271 | System.out.println("NewM:" + M);
272 |
273 | LDADataset data = new LDADataset(M, dict);
274 | for (int i = 0; i < M; ++i){
275 | line = reader.readLine();
276 |
277 | data.setDoc(line, i);
278 | }
279 |
280 | return data;
281 | }
282 | catch (Exception e){
283 | System.out.println("Read Dataset Error: " + e.getMessage());
284 | e.printStackTrace();
285 | return null;
286 | }
287 | }
288 |
289 | /**
290 | * read a dataset from an array of strings, create new dictionary
291 | * @param strs array of documents, one whitespace-separated document per string
292 | * @return dataset if success and null otherwise
293 | */
294 | public static LDADataset readDataSet(String [] strs){
295 | LDADataset data = new LDADataset(strs.length);
296 |
297 | for (int i = 0 ; i < strs.length; ++i){
298 | data.setDoc(strs[i], i);
299 | }
300 | return data;
301 | }
302 |
303 | /**
304 | * read a dataset from an array of strings with respect to a specified dictionary
305 | * @param strs array of documents, one whitespace-separated document per string
306 | * @param dict the dictionary
307 | * @return dataset if success and null otherwise
308 | */
309 | public static LDADataset readDataSet(String [] strs, Dictionary dict){
310 | //System.out.println("readDataset...");
311 | LDADataset data = new LDADataset(strs.length, dict);
312 |
313 | for (int i = 0 ; i < strs.length; ++i){
314 | //System.out.println("set doc " + i);
315 | data.setDoc(strs[i], i);
316 | }
317 | return data;
318 | }
319 |
320 | public static LDADataset readDataSet(List<String> [] strs, Dictionary dict){
321 | //System.out.println("readDataset...");
322 | LDADataset data = new LDADataset(strs.length, dict);
323 |
324 | for (int i = 0 ; i < strs.length; ++i){
325 | //System.out.println("set doc " + i);
326 | data.setDoc(strs[i], i);
327 | }
328 | return data;
329 | }
330 | }
331 |
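
The readDataSet(String filename) overload above expects JGibbsLDA's plain corpus format: the first line gives the number of documents, and each following line holds one whitespace-separated document. A minimal sketch of writing such a file is shown below; the helper class is hypothetical and not part of this repository.

    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.List;

    // Hypothetical helper: writes a corpus in the format read by
    // LDADataset.readDataSet(String filename) - first line is the document
    // count, then one whitespace-separated document per line.
    public class LdaCorpusWriter {
        public static void write(List<String> documents, String filename) throws IOException {
            FileWriter out = new FileWriter(filename);
            try {
                out.write(documents.size() + "\n");
                for (String doc : documents) {
                    out.write(doc.trim() + "\n");
                }
            } finally {
                out.close();
            }
        }
    }
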
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TopicTilingTopicDocument.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
25 |
26 | import java.util.ArrayList;
27 | import java.util.Collection;
28 | import java.util.Collections;
29 | import java.util.HashMap;
30 | import java.util.List;
31 | import java.util.Map.Entry;
32 |
33 | import jgibbslda.Inferencer;
34 | import jgibbslda.LDACmdOption;
35 | import jgibbslda.Model;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 | public class TopicTilingTopicDocument {
39 | public List<Double> similarityScores;
40 | public List<Integer> minimaPosition;
41 | public List<Double> depthScores;
42 | private Inferencer inf;
43 | private LDACmdOption opt;
44 |
45 | private int segmentNumber = -1;
46 |
47 | private int window = 1;
48 | private String ldaModelDirectory;
49 | private String ldaModelName;
50 | private int repeatSegmentation = 1;
51 | private int inferenceIterations = 100;
52 | private int repeatInference = 1;
53 |
54 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration) {
55 | this(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInference, inferenceIteration, -1);
56 | }
57 |
58 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration, int segmentNumber) {
59 |
60 | super();
61 | this.ldaModelDirectory = ldaModelDirectory;
62 | this.ldaModelName = ldaModelName;
63 | this.window = window;
64 | this.repeatInference = repeatInference;
65 | this.repeatSegmentation = repeatSegmentation;
66 | this.inferenceIterations = inferenceIteration;
67 |
68 | opt = new LDACmdOption();
69 | opt.dir = this.ldaModelDirectory;
70 | opt.modelName = this.ldaModelName;
71 | this.segmentNumber = segmentNumber;
72 |
73 | }
74 |
75 | public List<Integer> segment(List<List<Token>> sentences) {
76 | HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
77 | if (segmentNumber < 0) {
78 | return segment2(sentences);
79 | }
80 | for (int i = 0; i < repeatSegmentation; i++) {
81 |
82 | List segments = segment2(sentences);
83 | System.out.println(segments);
84 | for (int value : segments) {
85 | int count = 0;
86 | if (map.containsKey(value)) {
87 | count = map.get(value);
88 | }
89 | map.put(value, count + 1);
90 |
91 | }
92 | }
93 | System.out.println(map);
94 | List<Integer> segments = new ArrayList<Integer>();
95 | for (int i = repeatSegmentation; i >= 0; i--) {
96 | for (Entry<Integer, Integer> e : map.entrySet()) {
97 | if (e.getValue() == i) {
98 | segments.add(e.getKey());
99 | if (segments.size() == segmentNumber) {
100 | Collections.sort(segments);
101 | return segments;
102 | }
103 | }
104 |
105 | }
106 | }
107 | Collections.sort(segments);
108 | return segments;
109 | }
110 |
111 | public List<Integer> segment2(List<List<Token>> sentences) {
112 |
113 | similarityScores = getSimilarityScores(sentences);
114 | System.out.println("SIM_TOPIC_TILING_DT: "+similarityScores);
115 | minimaPosition = getMinima();
116 | depthScores = getDepthScores();
117 | List<Integer> segments = new ArrayList<Integer>();
118 | if (segmentNumber < 0)
119 | segments = getSegments();
120 | else
121 | segments = getSegmentsNumberGiven();
122 | // add the last sentence as boundary if it is not set
123 |
124 | if (segments.size() > 1 && segments.get(segments.size() - 1) != sentences.size()) {
125 | segments.add(sentences.size() - 1);
126 | } else {
127 | System.err.println("segment size:" + segments.size());
128 | System.err.println("similarites: " + similarityScores);
129 | }
130 | return segments;
131 | }
132 |
133 | private List<Integer> getSegmentsNumberGiven() {
134 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
135 | List<Double> depths = new ArrayList<Double>(depthScores); // sort a copy so the depthScores field keeps its order
136 | List<Double> depths2 = depthScores; // original order, aligned with minimaPosition
137 | if (depths.size() > segmentNumber) {
138 |
139 | Collections.sort(depths);
140 | double min = depths.get(depths.size() - segmentNumber + 1);// save
141 |
142 | for (int i = segments.size() - 1; i >= 0; i--) {
143 | if (depths2.get(i) < min) {
144 | segments.remove(i);
145 | }
146 | }
147 | }
148 |
149 | return segments;
150 | }
151 |
152 | public List<Integer> getSegments() {
153 | // copy minima list
154 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
155 |
156 | double mean = calculateMean(depthScores);
157 | double variance = calculateVariance(depthScores, mean);
158 | double threshold = mean - variance / 2.0;
159 |
160 | for (int i = segments.size() - 1; i >= 0; i--) {
161 | if (depthScores.get(i) < threshold) {
162 | segments.remove(i);
163 | }
164 | }
165 | return segments;
166 | }
167 |
168 | private double calculateVariance(List<Double> vals, double mean) {
169 | double variance = 0.0;
170 | for (double d : vals) {
171 | variance += (d - mean) * (d - mean);
172 | }
173 | variance /= vals.size();
174 | return variance;
175 | }
176 |
177 | private double calculateMean(List<Double> vals) {
178 | double mean = 0.0;
179 | for (double d : vals) {
180 | mean += d;
181 | }
182 | mean /= vals.size();
183 | return mean;
184 | }
185 |
186 | private List<Double> getDepthScores() {
187 | List<Double> depths = new ArrayList<Double>();
188 | for (int i : minimaPosition) {
189 | depths.add(getDepths(i));
190 | }
191 | return depths;
192 | }
193 |
194 | // //left and right neighbor
195 | private double getDepths(int minimumPosition) {
196 | int i = minimumPosition;
197 | double depths = similarityScores.get(i - 1) - similarityScores.get(i)
198 | + similarityScores.get(i + 1) - similarityScores.get(i);
199 | return depths;
200 | }
201 |
202 |
203 | private List<Integer> getMinima() {
204 | List<Integer> minima = new ArrayList<Integer>();
205 | double prev = 0;
206 | double curr = 0;
207 | double next = 1;
208 | for (int i = 1; i < similarityScores.size() - 1; i++) {
209 | if (next != curr) {
210 | prev = similarityScores.get(i - 1);
211 | }
212 | curr = similarityScores.get(i);
213 | next = similarityScores.get(i + 1);
214 | if (curr < next && curr < prev) {
215 | minima.add(i);
216 | }
217 | }
218 | return minima;
219 |
220 | }
221 |
222 | private List<Double> getSimilarityScores(List<List<Token>> sentences) {
223 | List<Double> similarities = new ArrayList<Double>();
224 | List<String> parts = new ArrayList<String>();
225 | for (int i = 0; i < sentences.size(); i++) {
226 | parts.add(getPrev(sentences, i));
227 | }
228 | for (int i = window - 1; i > 0; i--) {
229 | parts.add(getPrev(sentences, sentences.size() - 1, i));
230 | }
231 | String[] partsArray = new String[parts.size()];
232 | int i = 0;
233 | for (String ss : parts) {
234 | partsArray[i++] = ss;
235 | }
236 | double[][] topicDocument = null;
237 | for (i = 0; i < repeatInference; i++) {
238 | Model m = inference(partsArray);
239 | if (topicDocument == null) {
240 | topicDocument = new double[partsArray.length][m.K];
241 | for (int j = 0; j < partsArray.length; j++) {
242 | for (int k = 0; k < m.K; k++) {
243 | topicDocument[j][k] = 1.0;
244 | }
245 | }
246 | }
247 | for (int j = 0; j < partsArray.length; j++) {
248 | for (int k = 0; k < m.K; k++) {
249 | topicDocument[j][k] *= m.theta[j][k];
250 | }
251 | }
252 | }
253 | for (i = 0; i < partsArray.length - window; i++) {
254 | double[] v1 = topicDocument[i];
255 | double[] v2 = topicDocument[i + window];
256 | double sim = calculateDotProduct(v1, v2);
257 | similarities.add(sim);
258 | }
259 | // System.out.println(similarities.size());
260 | return similarities;
261 | }
262 |
263 | private List<Integer> getTopicCandidates(int[] topics) {
264 | ArrayList<Integer> candidates = new ArrayList<Integer>();
265 | for (int m = repeatInference; m >= 0; m--) {
266 |
267 | for (int t = 0; t < topics.length; t++) {
268 | if (topics[t] == m) {
269 | candidates.add(t);
270 | }
271 | }
272 | if (candidates.size() > 0) {
273 | return candidates;
274 | }
275 | }
276 | return null;
277 | }
278 |
279 | private int[] getVector(int topicNumber, Collection<Integer> topicAssigment) {
280 | int[] vec = new int[topicNumber];
281 | for (int k : topicAssigment) {
282 | vec[k]++;
283 | }
284 | return vec;
285 | }
286 |
287 | private Model inference(String[] sentences) {
288 | inf = new Inferencer();
289 | inf.init(opt);
290 |
291 | inf.niters = inferenceIterations;
292 | // inf.niters = Integer.parseInt(prop.getProperty("infIteration"));
293 | Model m = inf.inference(sentences);
294 | return m;
295 | }
296 |
297 | private String getPrev(List<List<Token>> sentences, int i) {
298 |
299 | return getPrev(sentences, i, window);
300 | }
301 |
302 | private String getPrev(List<List<Token>> sentences, int i, int window) {
303 | String result = "";
304 | for (int k = i; k >= 0 && k > (i - window); k--) {
305 | for (Token t : sentences.get(k)) {
306 | result += t.getCoveredText() + " ";
307 | }
308 | }
309 | return result;
310 | }
311 |
312 | private double calculateDotProduct(int[] curr, int[] next) {
313 | int xy = 0;
314 | int sumX = 0;
315 | int sumY = 0;
316 | if (curr.length != next.length) {
317 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
318 | }
319 | for (int i = 0; i < curr.length; i++) {
320 | int xi = curr[i];
321 | int yi = next[i];
322 |
323 | xy += xi * yi;
324 | sumX += xi * xi;
325 | sumY += yi * yi;
326 | }
327 |
328 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
329 | }
330 |
331 | private double calculateDotProduct(double[] curr, double[] next) {
332 | double xy = 0;
333 | double sumX = 0;
334 | double sumY = 0;
335 | if (curr.length != next.length) {
336 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
337 | }
338 | for (int i = 0; i < curr.length; i++) {
339 | double xi = curr[i];
340 | double yi = next[i];
341 |
342 | xy += xi * yi;
343 | sumX += xi * xi;
344 | sumY += yi * yi;
345 | }
346 |
347 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
348 | }
349 |
350 | }
351 |
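
The boundary selection in getSegments() above keeps a similarity minimum as a segment boundary when its depth score is at least mean(depth) - variance(depth)/2. The standalone sketch below replays that thresholding on invented depth scores and minima positions, purely to illustrate the calculation.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // Standalone illustration of the thresholding used in getSegments():
    // a minimum is kept when its depth score is >= mean(depth) - variance(depth) / 2.
    // The depth scores and minima positions below are invented example values.
    public class DepthThresholdExample {
        public static void main(String[] args) {
            List<Double> depthScores = Arrays.asList(0.8, 0.2, 0.5, 0.9, 0.1);
            List<Integer> minimaPosition = Arrays.asList(3, 7, 12, 18, 25);

            double mean = 0.0;
            for (double d : depthScores) {
                mean += d;
            }
            mean /= depthScores.size();

            double variance = 0.0;
            for (double d : depthScores) {
                variance += (d - mean) * (d - mean);
            }
            variance /= depthScores.size();

            double threshold = mean - variance / 2.0;

            List<Integer> boundaries = new ArrayList<Integer>();
            for (int i = 0; i < minimaPosition.size(); i++) {
                if (depthScores.get(i) >= threshold) {
                    boundaries.add(minimaPosition.get(i));
                }
            }
            System.out.println("threshold = " + threshold + ", boundaries = " + boundaries);
        }
    }
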
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TextTilingWindowOptimized.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
26 |
27 | import java.io.FileNotFoundException;
28 | import java.io.FileReader;
29 | import java.io.IOException;
30 | import java.util.ArrayList;
31 | import java.util.Collection;
32 | import java.util.Collections;
33 | import java.util.HashMap;
34 | import java.util.List;
35 | import java.util.Map.Entry;
36 | import java.util.Properties;
37 | import java.util.Random;
38 |
39 | import jgibbslda.Inferencer;
40 | import jgibbslda.LDACmdOption;
41 | import jgibbslda.Model;
42 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
43 |
44 | public class TextTilingWindowOptimized {
45 | private int segmentNumber = -1;
46 | private int window = 1;
47 | private int additionalVectorSize = 1;
48 | public List<Double> similarityScores;
49 | public List<Integer> minimaPosition;
50 | public List<Double> depthScores;
51 | private Inferencer inf;
52 | private String ldaModel;
53 | private LDACmdOption opt;
54 | private Properties prop;
55 | private int segmentIteration = 5;
56 | private int inferenceIterationRepeating = 1;
57 | private int inferenceIteration;
58 |
59 | public TextTilingWindowOptimized(String ldaModel) {
60 | this(ldaModel, -1);
61 | }
62 |
63 | public TextTilingWindowOptimized(String ldaModel, int segmentNumber) {
64 | super();
65 | this.ldaModel = ldaModel;
66 | opt = new LDACmdOption();
67 | opt.dir = ldaModel;
68 | // opt.modelName = "model-final";
69 | this.segmentNumber = segmentNumber;
70 | prop = new Properties();
71 | try {
72 | prop.load(new FileReader("topictiling_config"));
73 | } catch (FileNotFoundException e) {
74 | // TODO Auto-generated catch block
75 | e.printStackTrace();
76 | } catch (IOException e) {
77 | // TODO Auto-generated catch block
78 | e.printStackTrace();
79 | }
80 | opt.modelName = prop.getProperty("model_name");
81 | window = 1;
82 | inferenceIteration = 100;
83 | inferenceIterationRepeating = 1;
84 | segmentIteration = 1;
85 | if (prop.containsKey("window"))
86 | window = Integer.parseInt(prop.getProperty("window"));
87 | if (prop.containsKey("infIteration"))
88 | inferenceIteration = Integer.parseInt(prop.getProperty("infIteration"));
89 | if (prop.containsKey("infIterationRepeating"))
90 | inferenceIterationRepeating = Integer.parseInt(prop.getProperty("infIterationRepeating"));
91 | if (prop.containsKey("segmentIteration"))
92 | segmentIteration = Integer.parseInt(prop.getProperty("segmentIteration"));
93 | System.err.println("window:"+window);
94 | System.err.println("inferenceIteration:"+inferenceIteration);
95 | System.err.println("inferenceIterationRepeating:"+inferenceIterationRepeating);
96 | System.err.println("segmentIteration:"+segmentIteration);
97 | }
98 |
99 | public List<Integer> segment(List<List<Token>> sentences) {
100 | HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
101 | if (segmentNumber < 0) {
102 | return segment2(sentences);
103 | }
104 | for (int i = 0; i < segmentIteration; i++) {
105 |
106 | List segments = segment2(sentences);
107 | System.out.println(segments);
108 | for (int value : segments) {
109 | int count = 0;
110 | if (map.containsKey(value)) {
111 | count = map.get(value);
112 | }
113 | map.put(value, count + 1);
114 |
115 | }
116 | }
117 | System.out.println(map);
118 | List<Integer> segments = new ArrayList<Integer>();
119 | for (int i = segmentIteration; i >= 0; i--) {
120 | for (Entry<Integer, Integer> e : map.entrySet()) {
121 | if (e.getValue() == i) {
122 | segments.add(e.getKey());
123 | if (segments.size() == segmentNumber) {
124 | Collections.sort(segments);
125 | return segments;
126 | }
127 | }
128 |
129 | }
130 | }
131 | Collections.sort(segments);
132 | return segments;
133 | }
134 |
135 | public List<Integer> segment2(List<List<Token>> sentences) {
136 |
137 | similarityScores = getSimilarityScores(sentences);
138 | minimaPosition = getMinima();
139 | depthScores = getDepthScores();
140 |
141 | List<Integer> segments = new ArrayList<Integer>();
142 | if (segmentNumber < 0)
143 | segments = getSegments();
144 | else
145 | segments = getSegmentsNumberGiven();
146 | // add the last sentence as boundary if it is not set
147 |
148 | if (segments.size()>1&&segments.get(segments.size() - 1) != sentences.size()) {
149 | segments.add(sentences.size() - 1);
150 | }else{
151 | System.err.println("segment size:"+segments.size());
152 | System.err.println("similarites: "+similarityScores);
153 | }
154 | // System.out.println(segments);
155 | return segments;
156 | }
157 |
158 | private List<Integer> getSegmentsNumberGiven() {
159 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
160 | List<Double> depths = new ArrayList<Double>(depthScores); // sort a copy so the depthScores field keeps its order
161 | List<Double> depths2 = depthScores; // original order, aligned with minimaPosition
162 | if (depths.size() > segmentNumber) {
163 |
164 | Collections.sort(depths);
165 | double min = depths.get(depths.size() - segmentNumber + 1);// save
166 |
167 | for (int i = segments.size() - 1; i >= 0; i--) {
168 | if (depths2.get(i) < min) {
169 | segments.remove(i);
170 | }
171 | }
172 | }
173 |
174 | return segments;
175 | }
176 |
177 | public List<Integer> getSegments() {
178 | // copy minima list
179 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
180 |
181 | double mean = calculateMean(depthScores);
182 | double variance = calculateVariance(depthScores, mean);
183 | double threshold = mean - variance / 2.0;
184 |
185 | for (int i = segments.size() - 1; i >= 0; i--) {
186 | if (depthScores.get(i) < threshold) {
187 | segments.remove(i);
188 | }
189 | }
190 | return segments;
191 | }
192 |
193 | private double calculateVariance(List<Double> vals, double mean) {
194 | double variance = 0.0;
195 | for (double d : vals) {
196 | variance += (d - mean) * (d - mean);
197 | }
198 | variance /= vals.size();
199 | return variance;
200 | }
201 |
202 | private double calculateMean(List<Double> vals) {
203 | double mean = 0.0;
204 | for (double d : vals) {
205 | mean += d;
206 | }
207 | mean /= vals.size();
208 | return mean;
209 | }
210 |
211 | private List<Double> getDepthScores() {
212 | List<Double> depths = new ArrayList<Double>();
213 |
214 | for (int i : minimaPosition) {
215 |
216 | depths.add(getDepths(i));
217 | }
218 | return depths;
219 | }
220 |
221 | // //left and right neighbor
222 | private double getDepths(int minimumPosition) {
223 | int i = minimumPosition;
224 | double depths = similarityScores.get(i - 1) - similarityScores.get(i)
225 | + similarityScores.get(i + 1) - similarityScores.get(i);
226 | return depths;
227 | }
228 |
229 | private List<Integer> getMinima() {
230 | List<Integer> minima = new ArrayList<Integer>();
231 | double prev = 0;
232 | double curr = 0;
233 | double next = 1;
234 | for (int i = 1; i < similarityScores.size() - 1; i++) {
235 | if (next != curr) {
236 | prev = similarityScores.get(i - 1);
237 | }
238 | curr = similarityScores.get(i);
239 | next = similarityScores.get(i + 1);
240 | if (curr < next && curr < prev) {
241 | minima.add(i);
242 | }
243 | }
244 | return minima;
245 |
246 | }
247 |
248 | private List<Double> getSimilarityScores(List<List<Token>> sentences) {
249 | List<Double> similarities = new ArrayList<Double>();
250 | List<String> parts = new ArrayList<String>();
251 | for (int i = 0; i < sentences.size(); i++) {
252 | parts.add(getPrev(sentences, i));
253 | }
254 | for (int i = window - 1; i > 0; i--) {
255 | parts.add(getPrev(sentences, sentences.size() - 1, i));
256 | }
257 | String[] partsArray = new String[parts.size()];
258 | int i = 0;
259 | for (String ss : parts) {
260 | partsArray[i++] = ss;
261 | }
262 |
263 | Model m = inference(partsArray);
264 | if (inferenceIterationRepeating == 1) {
265 | for (i = 0; i < partsArray.length - window; i++) {
266 | int[] v1 = getVector(m.K, m.z[i]);
267 | int[] v2 = getVector(m.K, m.z[i + window]);
268 | double sim = calculateDotProduct(v1, v2);
269 | similarities.add(sim);
270 | }
271 |
272 | } else {
273 | // initialize save structure for word wise topic stabilization
274 | ArrayList<int[][]> values = new ArrayList<int[][]>();
275 | for (int k = 0; k < partsArray.length; k++) {
276 | values.add(new int[m.z[k].size()][m.K]);
277 | }
278 | for (int k = 1; k < inferenceIterationRepeating; k++) {
279 | for (int p = 0; p < partsArray.length; p++) {
280 | for (int t = 0; t < m.z[p].size(); t++) {
281 | int topic = m.z[p].get(t);
282 | values.get(p)[t][topic]++;
283 | }
284 | }
285 | m = inference(partsArray);
286 | }
287 |
288 | List<Integer>[] newZ = new ArrayList[partsArray.length];
289 | Random r = new Random();
290 | for (int s = 0; s < values.size(); s++) {
291 | int[][] sentence = values.get(s);
292 | newZ[s] = new ArrayList<Integer>();
293 | for (int t = 0; t < sentence.length; t++) {
294 | List<Integer> candidates = getTopicCandidates(sentence[t]);
295 |
296 | int topic = candidates.get(r.nextInt(candidates.size()));
297 | newZ[s].add(topic);
298 | }
299 |
300 | }
301 | for (i = 0; i < newZ.length - window; i++) {
302 | int[] v1 = getVector(m.K, newZ[i]);
303 | int[] v2 = getVector(m.K, newZ[i + window]);
304 | double sim = calculateDotProduct(v1, v2);
305 | similarities.add(sim);
306 | }
307 |
308 | }
309 |
310 | return similarities;
311 | }
312 |
313 | private List<Integer> getTopicCandidates(int[] topics) {
314 | ArrayList<Integer> candidates = new ArrayList<Integer>();
315 | for (int m = inferenceIterationRepeating; m >= 0; m--) {
316 |
317 | for (int t = 0; t < topics.length; t++) {
318 | if (topics[t] == m) {
319 | candidates.add(t);
320 | }
321 | }
322 | if (candidates.size() > 0) {
323 | return candidates;
324 | }
325 | }
326 | return null;
327 | }
328 |
329 | private int[] getVector(int topicNumber, Collection<Integer> topicAssigment) {
330 | int[] vec = new int[topicNumber];
331 | for (int k : topicAssigment) {
332 | vec[k]++;
333 | }
334 | return vec;
335 | }
336 |
337 | private Model inference(String[] sentences) {
338 | inf = new Inferencer();
339 | inf.init(opt);
340 |
341 | inf.niters = inferenceIteration;
342 | Model m = inf.inference(sentences);
343 | return m;
344 | }
345 |
346 | private double[] norm(int[] v1) {
347 | double sum = 0.0;
348 | for (int v : v1) {
349 | sum += v;
350 | }
351 | double[] vd = new double[v1.length];
352 | for (int i = 0; i < v1.length; i++) {
353 | vd[i] = v1[i] / sum;
354 | }
355 | return vd;
356 | }
357 |
358 | private int[] getVector(int i, Model m) {
359 | int[] vec = new int[m.K];
360 | for (int k : m.z[i]) {
361 | vec[k]++;
362 | }
363 | return vec;
364 | }
365 |
366 | private String getPrev(List<List<Token>> sentences, int i) {
367 |
368 | return getPrev(sentences, i, window);
369 | }
370 |
371 | private String getPrev(List<List<Token>> sentences, int i, int window) {
372 | String result = "";
373 | for (int k = i; k >= 0 && k > (i - window); k--) {
374 | for (Token t : sentences.get(k)) {
375 | result += t.getCoveredText() + " ";
376 | }
377 | }
378 | return result;
379 | }
380 |
381 | private double calculateDotProduct(double[] vd1, double[] vd2) {
382 | double xy = 0;
383 | double sumX = 0;
384 | double sumY = 0;
385 | if (vd1.length != vd2.length) {
386 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
387 | }
388 | for (int i = 0; i < vd1.length; i++) {
389 | double xi = vd1[i];
390 | double yi = vd2[i];
391 |
392 | xy += xi * yi;
393 | sumX += xi * xi;
394 | sumY += yi * yi;
395 | }
396 |
397 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
398 | }
399 |
400 | private double calculateDotProduct(int[] curr, int[] next) {
401 | int xy = 0;
402 | int sumX = 0;
403 | int sumY = 0;
404 | if (curr.length != next.length) {
405 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
406 | }
407 | for (int i = 0; i < curr.length; i++) {
408 | int xi = curr[i];
409 | int yi = next[i];
410 |
411 | xy += xi * yi;
412 | sumX += xi * xi;
413 | sumY += yi * yi;
414 | }
415 |
416 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
417 | }
418 |
419 | }
420 |
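
TextTilingWindowOptimized loads its parameters from a file named topictiling_config in the working directory via java.util.Properties. The keys read above are model_name, window, infIteration, infIterationRepeating and segmentIteration; a possible configuration file could look like the sketch below, where the values are placeholders rather than recommended settings.

    model_name=model-final
    window=2
    infIteration=100
    infIterationRepeating=3
    segmentIteration=5
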
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingSegmenterAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
27 |
28 | import java.text.DecimalFormat;
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.Iterator;
32 | import java.util.List;
33 |
34 | import org.apache.uima.UimaContext;
35 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
36 | import org.apache.uima.cas.impl.ListUtils;
37 | import org.apache.uima.jcas.JCas;
38 | import org.apache.uima.resource.ResourceInitializationException;
39 | import org.uimafit.component.JCasAnnotator_ImplBase;
40 | import org.uimafit.descriptor.ConfigurationParameter;
41 | import org.uimafit.util.JCasUtil;
42 |
43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized;
44 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTiling;
45 | import de.tudarmstadt.langtech.semantics.type.Segment;
46 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity;
47 | import de.tudarmstadt.langtech.semantics.type.SegmentScore;
48 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
49 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
50 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
51 |
52 | public class TopicTilingSegmenterAnnotator extends JCasAnnotator_ImplBase {
53 | private boolean printSegments = true;
54 |
55 | public static final String PARAM_USE_ASSIGNED_TOPICS = "UseAssgnedTopics";
56 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory";
57 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
58 | public static final String PARAM_WINDOW = "TopicTilingWindow";
59 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation";
60 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration";
61 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference";
62 | public static final String PARAM_DEPTH_SCORE = "DepthScore";
63 | public static final String PARAM_MODE_COUNTING = "ModeCounting";
64 | public static final String PARAM_DEBUG="Debug";
65 | @ConfigurationParameter(name = PARAM_USE_ASSIGNED_TOPICS, mandatory = false)
66 | private boolean useAssignedTopics = false;
67 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true)
68 | private String ldaModelDirectory;
69 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
70 | private String ldaModelName;
71 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true)
72 | private int window;
73 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true)
74 | private int repeatInferences;
75 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true)
76 | private int repeatSegmentation;
77 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true)
78 | private int inferenceIteration;
79 | @ConfigurationParameter(name = PARAM_MODE_COUNTING, mandatory = true)
80 | private boolean modeCounting;
81 | @ConfigurationParameter(name = PARAM_DEBUG, mandatory = false)
82 | private boolean debug;
83 | @ConfigurationParameter(name = PARAM_DEPTH_SCORE, mandatory = true)
84 | private String depthScore;
85 |
86 | @Override
87 | public void initialize(UimaContext context)
88 | throws ResourceInitializationException {
89 | super.initialize(context);
90 |
91 | }
92 |
93 | @Override
94 | public void process(JCas jcas) throws AnalysisEngineProcessException {
95 |
96 | List<List<Token>> s = new ArrayList<List<Token>>();
97 |
98 | // int i = 0;
99 | Iterator<Segment> segments = JCasUtil.select(jcas, Segment.class)
100 | .iterator();
101 | Segment seg = null;
102 | if (segments.hasNext())
103 | seg = segments.next();
104 |
105 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) {
106 |
107 | s.add(JCasUtil.selectCovered(Token.class, ss));
108 |
109 | }
110 |
111 | DocumentMetaData meta = DocumentMetaData.get(jcas);
112 | StringBuffer buffer = new StringBuffer();
113 | buffer.append(meta.getDocumentTitle());
114 |
115 | buffer.append("\n");
116 | // TopicTilingTopicDocument tttd ;
117 |
118 | TopicTiling tt;
119 | tt = new TopicTiling(ldaModelDirectory, ldaModelName, window,
120 | repeatSegmentation, repeatInferences, inferenceIteration,
121 | modeCounting, depthScore, useAssignedTopics,debug);
122 | buffer.append("GOL: " + getGoldSegments(jcas) + "\n");
123 | List<Integer> segmentPositions;
124 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) {
125 | segmentPositions = tt.segment(s);
126 | } else {
127 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class)
128 | .iterator().next().getSegmentCount();
129 | segmentPositions = tt.segment(s, segNum);
130 | }
131 |
132 | int j = 0;
133 | for (List<Token> ss : s) {
134 | String l = "";
135 | for (Token t:ss){
136 | l+=t.getCoveredText()+" ";
137 | }
138 | if(debug)System.out.println(j+"\t"+l);
139 | j++;
140 | }
141 | if(debug)System.out.println(segmentPositions);
142 | annotateSegments(jcas, segmentPositions, tt.depthScores,
143 | tt.minimaPosition,tt.similarityScores);
144 | }
145 |
146 | private void printRcode(JCas jcas, int segmentCount,
147 | TextTilingWindowOptimized tt, List<Integer> segments) {
148 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
149 |
150 | String main = metaData.getDocumentTitle()
151 | + ": Cosine Similarity between sentences ";
152 | if (segmentCount < 0)
153 | main = main + " (segments given: " + segmentCount + ")";
154 | StringBuffer buffer = new StringBuffer();
155 | buffer.append("#Cosine Similarity\n");
156 | buffer.append("pdf(file='" + metaData.getDocumentTitle()
157 | + ".pdf',20,7);\n");
158 | buffer.append(toListInR(tt.similarityScores, "cos"));
159 | buffer.append(toListInR(segments, "estSeg"));
160 | buffer.append(toListInR(getGoldSegments(jcas), "seg"));
161 | buffer.append(toListInR(tt.minimaPosition, "canSeg"));
162 | buffer.append(toListInR(tt.depthScores, "depth"));
163 | buffer.append("plot(0:"
164 | + (tt.similarityScores.size() - 1)
165 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='"
166 | + main + "');\n");
167 | buffer.append("abline(v=seg,col='red',lty=5);\n");
168 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n");
169 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n");
170 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size()
171 | + "),col='green',pch=22);\n");
172 | buffer.append("points(canSeg,rep(max(cos)*0.9,"
173 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n");
174 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length="
175 | + tt.depthScores.size() + "),labels=depth);\n");
176 | buffer.append("dev.off();dev.off()");
177 | System.out.println(buffer.toString());
178 |
179 | }
180 |
181 | private List<Integer> getGoldSegments(JCas jcas) {
182 |
183 | List<Integer> ret = new ArrayList<Integer>();
184 | Iterator<Segment> segIt = JCasUtil.iterator(jcas, Segment.class);
185 | int sentenceCount = -1;
186 | while (segIt.hasNext()) {
187 | Segment seg = segIt.next();
188 | for (Sentence s : JCasUtil.selectCovered(jcas, Sentence.class, seg)) {
189 | sentenceCount++;
190 | }
191 | ret.add(sentenceCount);
192 | }
193 | return ret;
194 | }
195 |
196 | private <T> StringBuffer toListInR(List<T> list, String name) {
197 | StringBuffer buffer = new StringBuffer();
198 | buffer.append(name);
199 | buffer.append("=c(");
200 | for (T sc : list) {
201 | if (sc instanceof Double) {
202 | DecimalFormat df = new DecimalFormat("#.##");
203 | buffer.append(df.format(sc).replace(",", "."));
204 | } else {
205 | buffer.append(sc);
206 | }
207 | buffer.append(",");
208 | }
209 | if (list.size() > 0)
210 | buffer.deleteCharAt(buffer.length() - 1);
211 | buffer.append(");\n");
212 | return buffer;
213 | }
214 | private String getSimilarityScores(List<Double> similarityScores, int from, int to){
215 | String scores = "";
216 | int f = from-1;
217 | if (f<0)f=0;
218 | if(debug)System.out.println(f+"\t"+(to-1));
219 | for(int i =f;i<=to-1;i++){
220 | scores+=","+similarityScores.get(i);
221 | }
222 | if (scores.length()>0)scores=scores.substring(1);
223 | return scores;
224 | }
225 | private void annotateSegments(JCas jcas, List<Integer> segmentPositions,
226 | List<Double> depthScores, List<Integer> minimaPosition, List<Double> similarityScores) {
227 | List<Sentence> sentences = new ArrayList<Sentence>(JCasUtil.select(jcas, Sentence.class));
228 |
229 | //add first segment which has no score
230 | int endIdx;
231 | if (segmentPositions.get(segmentPositions.size()-1)!=(sentences.size()-1)){
232 | segmentPositions.add(sentences.size()-1);
233 | depthScores.add(0.0);
234 | }
235 | int endSentece;
236 | if (segmentPositions.size()>0){
237 | endIdx=sentences.get(segmentPositions.get(0)).getEnd();
238 | endSentece=segmentPositions.get(0);
239 | }else{
240 | endIdx=sentences.get(sentences.size()-1).getEnd();
241 | endSentece=sentences.size()-1;
242 | }
243 | addSegment(sentences.get(0).getBegin(),endIdx,0.0,getSimilarityScores(similarityScores, 0,endSentece),jcas);
244 | int segEnd;
245 | int segStart;
246 | for(int i=1;i segmentPositions,
263 | List depthScores, List minimaPosition) {
264 | Iterator<Sentence> sentenceItr = JCasUtil
265 | .iterator(jcas, Sentence.class);
266 | int sentenceCount = -1;
267 | int prevBreak = 0;
268 |
269 | for (final int sBreak : segmentPositions) {
270 | final SegmentScore score = new SegmentScore(jcas);
271 |
272 | Sentence segmentSentence = null;
273 |
274 | int beginOffset = 0;
275 | int endOffset = 0;
276 |
277 | // move sentenceItr to last sentence in segment
278 | for (; sentenceCount < sBreak; sentenceCount++) {
279 | segmentSentence = sentenceItr.next();
280 |
281 | if (sentenceCount == prevBreak) {
282 | beginOffset = segmentSentence.getBegin();
283 | System.out.println("BeginOffset: "+ beginOffset);
284 | }
285 | }
286 |
287 | if (segmentSentence != null) {
288 | endOffset = segmentSentence.getEnd();
289 | System.out.println("end offset "+endOffset);
290 | }
291 | score.setBegin(beginOffset);
292 | score.setEnd(endOffset);
293 | int idx = minimaPosition.indexOf(sBreak);
294 | if (idx < 0) {
295 | score.setScore(1.0);
296 | } else {
297 | score.setScore(depthScores.get(idx));
298 | }
299 | score.addToIndexes();
300 | if (printSegments) {
301 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
302 | + beginOffset + "\t" + endOffset);
303 | }
304 | prevBreak = sBreak;
305 | }
306 | }
307 |
308 | /**
309 | * expects a list with the sentence numbers at which segment boundaries are placed
310 | *
311 | * @param jcas
312 | * @param sentenceBreaks
313 | */
314 | private void annotateSegments(JCas jcas, List<Integer> sentenceBreaks) {
315 | Iterator<Sentence> sentenceItr = JCasUtil
316 | .iterator(jcas, Sentence.class);
317 | int sentenceCount = -1;
318 | int prevBreak = 0;
319 | if (printSegments) {
320 | System.out.println("Annotated Segments");
321 | System.out.println(sentenceBreaks.toString());
322 | }
323 |
324 | for (final int sBreak : sentenceBreaks) {
325 | final Segment seg = new Segment(jcas);
326 |
327 | Sentence segmentSentence = null;
328 |
329 | int beginOffset = 0;
330 | int endOffset = 0;
331 |
332 | // move sentenceItr to last sentence in segment
333 | for (; sentenceCount < sBreak; sentenceCount++) {
334 | segmentSentence = sentenceItr.next();
335 |
336 | if (sentenceCount == prevBreak) {
337 | beginOffset = segmentSentence.getBegin();
338 | }
339 | }
340 |
341 | if (segmentSentence != null) {
342 | endOffset = segmentSentence.getEnd();
343 | }
344 |
345 | seg.setBegin(beginOffset);
346 | seg.setEnd(endOffset);
347 | seg.addToIndexes();
348 |
349 | if (printSegments) {
350 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
351 | + beginOffset + "\t" + endOffset);
352 | }
353 | prevBreak = sBreak;
354 | }
355 | }
356 | }
357 |
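
The annotator exposes its settings through the PARAM_* configuration parameters declared above. A hedged uimaFIT 1.x (org.uimafit) sketch of building a descriptor for it is shown below; the model path and all parameter values are placeholders, and the surrounding pipeline (collection reader, tokenizer, sentence splitter and the LDA topic annotators) has to be assembled separately.

    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
    import org.uimafit.factory.AnalysisEngineFactory;

    import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;

    // Hedged sketch: all values below are placeholders, not recommended settings.
    public class TopicTilingDescriptorSketch {
        public static AnalysisEngineDescription createSegmenterDescription() throws Exception {
            return AnalysisEngineFactory.createPrimitiveDescription(
                    TopicTilingSegmenterAnnotator.class,
                    TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY, "/path/to/lda/model/",
                    TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
                    TopicTilingSegmenterAnnotator.PARAM_WINDOW, 2,
                    TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION, 1,
                    TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE, 3,
                    TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION, 100,
                    TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING, true,
                    // the accepted values for PARAM_DEPTH_SCORE are not documented in this file;
                    // the string below is only a guess
                    TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE, "minima");
        }
    }
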
--------------------------------------------------------------------------------