├── topictiling.png
├── de.tudarmstadt.langtech.semantics.segmentation.topictiling
│   ├── scripts
│   │   ├── topictiling.sh
│   │   ├── package.sh
│   │   └── topictiling.bat
│   ├── src
│   │   ├── main
│   │   │   ├── resources
│   │   │   │   └── desc
│   │   │   │       └── type
│   │   │   │           ├── Segment.xml
│   │   │   │           ├── SegmentQuantity.xml
│   │   │   │           ├── CohesionIndicator.xml
│   │   │   │           ├── SegmentScore.xml
│   │   │   │           └── GibbsLdaDescriptor.xml
│   │   │   └── java
│   │   │       └── de
│   │   │           └── tudarmstadt
│   │   │               └── langtech
│   │   │                   ├── semantics
│   │   │                   │   ├── type
│   │   │                   │   │   ├── Segment.java
│   │   │                   │   │   ├── Segment_Type.java
│   │   │                   │   │   ├── SegmentQuantity.java
│   │   │                   │   │   ├── SegmentQuantity_Type.java
│   │   │                   │   │   ├── SegmentScore.java
│   │   │                   │   │   └── SegmentScore_Type.java
│   │   │                   │   └── segmentation
│   │   │                   │       └── segmenter
│   │   │                   │           ├── annotator
│   │   │                   │           │   ├── SimpleSegmenter.java
│   │   │                   │           │   ├── OutputSegments.java
│   │   │                   │           │   ├── TopicTilingDocumentSegmenterAnnotator.java
│   │   │                   │           │   └── TopicTilingSegmenterAnnotator.java
│   │   │                   │           ├── RunTopicTilingOnFile.java
│   │   │                   │           ├── TopicTilingTopicDocument.java
│   │   │                   │           └── TextTilingWindowOptimized.java
│   │   │                   └── ml
│   │   │                       └── lda
│   │   │                           └── type
│   │   │                               ├── GibbsLdaTopic.java
│   │   │                               └── GibbsLdaTopic_Type.java
│   │   └── test
│   │       └── java
│   │           ├── TestSimpleReader.java
│   │           └── RunTopicTilingOnFile.java
│   ├── .project
│   ├── .classpath
│   ├── pom.xml
│   └── README.txt
├── split_output.py
├── de.tudarmstadt.langtech.lda
│   ├── .project
│   ├── pom.xml
│   ├── src
│   │   ├── main
│   │   │   ├── java
│   │   │   │   ├── jgibbslda
│   │   │   │   │   ├── Constants.java
│   │   │   │   │   ├── Conversion.java
│   │   │   │   │   ├── LDACmdOption.java
│   │   │   │   │   ├── Pair.java
│   │   │   │   │   ├── LDA.java
│   │   │   │   │   ├── Document.java
│   │   │   │   │   ├── Estimator.java
│   │   │   │   │   ├── Dictionary.java
│   │   │   │   │   ├── LogSaveEstimator.java
│   │   │   │   │   ├── Inferencer.java
│   │   │   │   │   └── LDADataset.java
│   │   │   │   └── de
│   │   │   │       └── tudarmstadt
│   │   │   │           └── langtech
│   │   │   │               └── lda
│   │   │   │                   ├── consumer
│   │   │   │                   │   └── GibbsLdaModelGeneratorConsumer.java
│   │   │   │                   ├── annotator
│   │   │   │                   │   ├── GibbsLdaDocumentBasedTopicIdAnnotator.java
│   │   │   │                   │   ├── GibbsLdaSentenceBasedTopicIdAnnotator.java
│   │   │   │                   │   ├── GibbsLdaTopicModelAnnotator.java
│   │   │   │                   │   └── GibbsLdaTopicIdAnnotator.java
│   │   │   │                   └── type
│   │   │   │                       ├── Topic.java
│   │   │   │                       ├── Topic_Type.java
│   │   │   │                       ├── TopicDistribution.java
│   │   │   │                       ├── WordTopicDistribution.java
│   │   │   │                       ├── TopicDistribution_Type.java
│   │   │   │                       └── WordTopicDistribution_Type.java
│   │   │   └── resources
│   │   │       └── desc
│   │   │           └── type
│   │   │               └── gibbsldatypes.xml
│   │   └── test
│   │       └── java
│   │           └── de
│   │               └── tudarmstadt
│   │                   └── langtech
│   │                       └── lda
│   │                           └── TestLdaTopicModelAnnotator.java
│   └── .classpath
└── README.md
/topictiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/riedlma/topictiling/HEAD/topictiling.png
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.sh:
--------------------------------------------------------------------------------
1 | java -Xmx1G -cp $(echo dependency/*jar| tr ' ' ':'):de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile $@
2 |
--------------------------------------------------------------------------------
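The classpath in topictiling.sh is assembled by listing every jar below dependency/ and turning the spaces into ':' separators; $@ forwards all script arguments unchanged to RunTopicTilingOnFile. The expansion can be checked in isolation (the jar names below are just examples):

    echo dependency/*jar | tr ' ' ':'
    # prints something like: dependency/args4j-2.0.16.jar:dependency/uimafit-1.4.0.jar:...
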
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/package.sh:
--------------------------------------------------------------------------------
1 | outp=topictiling_0.0.2
2 |
3 | cd ../de.tudarmstadt.langtech.lda
4 | mvn package
5 | mvn install
6 | cd ../de.tudarmstadt.langtech.semantics.segmentation.topictiling
7 | mvn package
8 | mvn dependency:copy-dependencies
9 |
10 | mkdir $outp
11 | cp target/*jar $outp
12 | cp -r target/dependency $outp
13 | cp scripts/top*sh $outp
14 | cp scripts/top*bat $outp
15 |
16 | cp README.txt $outp
--------------------------------------------------------------------------------
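Taken together, the two scripts are meant to be used roughly as follows. This is a minimal sketch assuming both Maven projects sit side by side as in the tree above and that package.sh is started from the topictiling project root; the concrete options accepted by RunTopicTilingOnFile are defined in that class and are not repeated here:

    cd de.tudarmstadt.langtech.semantics.segmentation.topictiling
    sh scripts/package.sh                   # builds both projects and collects jars into topictiling_0.0.2/
    cd topictiling_0.0.2
    sh topictiling.sh <options for RunTopicTilingOnFile>   # topictiling.sh expects dependency/ next to it
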
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentDescriptor</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.Segment</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/split_output.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | topic_output = sys.argv[1]
5 | output_folder = sys.argv[2]
6 |
7 | if not os.path.exists(output_folder):
8 |     os.makedirs(output_folder)
9 |
10 | out = ""
11 | out_filename = ""
12 | for l in open(topic_output):
13 |     out+=l
14 |     if l.strip()=="":
15 |         out=""
16 |     if l.strip().startswith(""):
17 |         docname = l.strip().replace("","").replace("","")
18 |     if l.strip().startswith(""):
19 |         fw = open(os.path.join(output_folder,docname),"w")
20 |         fw.write(out)
21 |         fw.close()
22 |
--------------------------------------------------------------------------------
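split_output.py takes the combined TopicTiling output file as its first argument and an output folder (created if it does not exist) as its second, and writes one file per document into that folder. A hypothetical invocation (file and folder names are placeholders):

    python split_output.py topictiling_output.txt segmented/
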
/de.tudarmstadt.langtech.lda/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
  <name>de.tudarmstadt.langtech.lda</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
  <name>de.tudarmstadt.langtech.semantics.segmentation.topictiling</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentQuantity.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentQuantity</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.SegmentQuantity</name>
      <description>Saves the number of segments a document should consist of according to a given gold-standard.</description>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>segmentCount</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/CohesionIndicator.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>CohesionIndicator</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.semantics.type.CohesionIndicator</name>
      <description>Marks a range that is relevant for cohesion. This may be, for instance, a Lemma.</description>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>stringRepresentation</name>
          <description/>
          <rangeTypeName>uima.cas.String</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>SegmentScore</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.semantics.type.SegmentScore</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>score</name>
          <description/>
          <rangeTypeName>uima.cas.Double</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>similarityScores</name>
          <description/>
          <rangeTypeName>uima.cas.String</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>GibbsLdaDescriptor</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.langtech.ml.lda.type.GibbsLdaTopic</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topic</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>termId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.tudarmstadt.ukp.dkpro</groupId>
  <artifactId>de.tudarmstadt.ukp.dkpro.lda</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>args4j</groupId>
      <artifactId>args4j</artifactId>
      <version>2.0.16</version>
      <type>jar</type>
      <scope>compile</scope>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.metadata-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/TestSimpleReader.java:
--------------------------------------------------------------------------------
1 | import java.io.File;
2 | import java.io.IOException;
3 |
4 | import org.apache.commons.io.FileUtils;
5 | import org.apache.uima.UIMAException;
6 | import org.apache.uima.analysis_engine.AnalysisEngine;
7 | import org.apache.uima.jcas.JCas;
8 | import org.uimafit.factory.AnalysisEngineFactory;
9 | import org.uimafit.factory.JCasFactory;
10 | import org.uimafit.pipeline.SimplePipeline;
11 | import org.uimafit.util.JCasUtil;
12 |
13 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
14 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
15 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
16 |
17 | public class TestSimpleReader {
18 | public static void main(String[] args) throws UIMAException, IOException {
19 | // String f = "test.txt";
20 | // JCas jcas = JCasFactory.createJCas();
21 | // jcas.setDocumentText(FileUtils.readFileToString(new File(f)));
22 | // AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
23 | // SimplePipeline.runPipeline(jcas, segmenter);
24 | // for(Sentence s:JCasUtil.select(jcas, Sentence.class)){
25 | // System.out.println(s.getCoveredText());
26 | // for (Token t: JCasUtil.selectCovered( Token.class,s)){
27 | // System.out.println(t.getCoveredText());
28 | // }
29 | // }
30 |
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Constants.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | public class Constants {
32 | public static final long BUFFER_SIZE_LONG = 1000000;
33 | public static final short BUFFER_SIZE_SHORT = 512;
34 |
35 | public static final int MODEL_STATUS_UNKNOWN = 0;
36 | public static final int MODEL_STATUS_EST = 1;
37 | public static final int MODEL_STATUS_ESTC = 2;
38 | public static final int MODEL_STATUS_INF = 3;
39 | }
40 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Conversion.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | public class Conversion {
32 | public static String ZeroPad( int number, int width )
33 | {
34 | StringBuffer result = new StringBuffer("");
35 | for( int i = 0; i < width-Integer.toString(number).length(); i++ )
36 | result.append( "0" );
37 | result.append( Integer.toString(number) );
38 |
39 | return result.toString();
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
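Conversion.ZeroPad left-pads the decimal representation of a number with zeros up to the requested width; numbers that are already wider are returned unchanged. A minimal usage sketch:

    public class ZeroPadDemo {
        public static void main(String[] args) {
            // width 5 -> "00042"
            System.out.println(jgibbslda.Conversion.ZeroPad(42, 5));
        }
    }
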
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/consumer/GibbsLdaModelGeneratorConsumer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.consumer;
25 |
26 | import jgibbslda.Estimator;
27 | import jgibbslda.LDACmdOption;
28 |
29 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
30 | import org.apache.uima.jcas.JCas;
31 | import org.uimafit.component.JCasConsumer_ImplBase;
32 |
33 | public class GibbsLdaModelGeneratorConsumer extends JCasConsumer_ImplBase {
34 |
35 | @Override
36 | public void process(JCas aJCas)
37 | throws AnalysisEngineProcessException {
38 | LDACmdOption options = new LDACmdOption();
39 | Estimator es = new Estimator();
40 | es.init(options);
41 |
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>de.tudarmstadt.langtech.semantics.segmentation</groupId>
  <artifactId>de.tudarmstadt.langtech.semantics.segmentation.topictiling</artifactId>
  <version>0.0.2</version>
  <dependencies>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>args4j</groupId>
      <artifactId>args4j</artifactId>
      <version>2.0.16</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.core.io.text-asl</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>org.uimafit</groupId>
      <artifactId>uimafit</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>de.tudarmstadt.ukp.dkpro</groupId>
      <artifactId>de.tudarmstadt.ukp.dkpro.lda</artifactId>
      <version>0.0.1-SNAPSHOT</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDACmdOption.java:
--------------------------------------------------------------------------------
1 | package jgibbslda;
2 |
3 | import org.kohsuke.args4j.*;
4 |
5 | public class LDACmdOption {
6 |
7 | @Option(name="-est", usage="Specify whether we want to estimate model from scratch")
8 | public boolean est = false;
9 |
10 | @Option(name="-estc", usage="Specify whether we want to continue the last estimation")
11 | public boolean estc = false;
12 |
13 | @Option(name="-inf", usage="Specify whether we want to do inference")
14 | public boolean inf = true;
15 |
16 | @Option(name="-dir", usage="Specify directory")
17 | public String dir = "";
18 |
19 | @Option(name="-dfile", usage="Specify data file")
20 | public String dfile = "";
21 |
22 | @Option(name="-model", usage="Specify the model name")
23 | public String modelName = "";
24 |
25 | @Option(name="-alpha", usage="Specify alpha")
26 | public double alpha = -1.0;
27 |
28 | @Option(name="-beta", usage="Specify beta")
29 | public double beta = -1.0;
30 |
31 | @Option(name="-ntopics", usage="Specify the number of topics")
32 | public int K = 100;
33 |
34 | @Option(name="-niters", usage="Specify the number of iterations")
35 | public int niters = 1000;
36 |
37 | @Option(name="-savestep", usage="Specify the number of steps to save the model since the last save")
38 | public int savestep = 100;
39 |
40 | @Option(name="-twords", usage="Specify the number of most likely words to be printed for each topic")
41 | public int twords = 100;
42 |
43 | @Option(name="-withrawdata", usage="Specify whether we include raw data in the input")
44 | public boolean withrawdata = false;
45 |
46 | @Option(name="-wordmap", usage="Specify the wordmap file")
47 | public String wordMapFileName = "wordmap.txt";
48 | }
49 |
--------------------------------------------------------------------------------
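These options map directly onto the command line of jgibbslda.LDA (shown further below): -est/-estc select estimation, -inf selects inference against an existing model. A hypothetical estimation run followed by an inference run might look like this; the classpath, directory, and file names are placeholders:

    java -cp <lda and args4j jars> jgibbslda.LDA -est -dir models/ -dfile corpus.dat -ntopics 100 -niters 1000 -savestep 100 -twords 20
    java -cp <lda and args4j jars> jgibbslda.LDA -inf -dir models/ -model model-final -dfile newdocs.dat
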
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Pair.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.util.Comparator;
32 |
33 | public class Pair implements Comparable<Pair> {
34 | public Object first;
35 | public Comparable second;
36 | public static boolean naturalOrder = false;
37 |
38 | public Pair(Object k, Comparable v){
39 | first = k;
40 | second = v;
41 | }
42 |
43 | public Pair(Object k, Comparable v, boolean naturalOrder){
44 | first = k;
45 | second = v;
46 | Pair.naturalOrder = naturalOrder;
47 | }
48 |
49 | public int compareTo(Pair p){
50 | if (naturalOrder)
51 | return this.second.compareTo(p.second);
52 | else return -this.second.compareTo(p.second);
53 | }
54 | }
55 |
56 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/resources/desc/type/gibbsldatypes.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <name>gibbsldatypes</name>
  <description/>
  <version>1.0</version>
  <vendor/>
  <types>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.Topic</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
        <featureDescription>
          <name>topicModeId</name>
          <description/>
          <rangeTypeName>uima.cas.Integer</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicDistribution</name>
          <description/>
          <rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
        </featureDescription>
      </features>
    </typeDescription>
    <typeDescription>
      <name>de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution</name>
      <description/>
      <supertypeName>uima.tcas.Annotation</supertypeName>
      <features>
        <featureDescription>
          <name>topicDistribution</name>
          <description/>
          <rangeTypeName>uima.cas.DoubleArray</rangeTypeName>
          <multipleReferencesAllowed>true</multipleReferencesAllowed>
        </featureDescription>
      </features>
    </typeDescription>
  </types>
</typeSystemDescription>
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml
16 | * @generated */
17 | public class Segment extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(Segment.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected Segment() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public Segment(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public Segment(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public Segment(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 | }
64 |
65 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/scripts/topictiling.bat:
--------------------------------------------------------------------------------
1 | java -cp dependency/ant-1.8.1.jar:dependency/ant-launcher-1.8.1.jar:dependency/aopalliance-1.0.jar:dependency/args4j-2.0.16.jar:dependency/commons-compress-1.4.1.jar:dependency/commons-io-2.0.1.jar:dependency/commons-lang-2.6.jar:dependency/commons-logging-1.1.0.jboss.jar:dependency/commons-logging-1.1.1.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.coref-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.io-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.lexmorph-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.metadata-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.ner-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.parameter-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.resources-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.api.syntax-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.io.text-asl-1.4.0.jar:dependency/de.tudarmstadt.ukp.dkpro.core.stanfordnlp-gpl-1.4.0.jar:dependency/icu4j-4.0.1.jar:dependency/jVinci-2.4.0.jar:dependency/joda-time-2.0.jar:dependency/lda.jar:dependency/org.apache.log4j-1.2.13.v200706111418.jar:dependency/serializer-2.7.1.jar:dependency/spring-aop-3.1.0.RELEASE.jar:dependency/spring-asm-3.1.0.RELEASE.jar:dependency/spring-beans-3.1.0.RELEASE.jar:dependency/spring-context-3.1.0.RELEASE.jar:dependency/spring-core-3.1.0.RELEASE.jar:dependency/spring-expression-3.1.0.RELEASE.jar:dependency/stanford-corenlp-1.3.3.jar:dependency/uimafit-1.4.0.jar:dependency/uimaj-adapter-vinci-2.4.0.jar:dependency/uimaj-core-2.4.0.jar:dependency/uimaj-cpe-2.4.0.jar:dependency/uimaj-document-annotation-2.4.0.jar:dependency/uimaj-tools-2.4.0.jar:dependency/xalan-2.7.1.jar:dependency/xercesImpl-2.9.1.jar:dependency/xml-apis-1.3.03.jar:dependency/xom-1.2.5.jar:dependency/xz-1.0.jar:de.tudarmstadt.langtech.semantics.segmentation.topictiling-0.0.2.jar de.tudarmstadt.langtech.semantics.segmentation.segmenter.RunTopicTilingOnFile
2 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/SimpleSegmenter.java:
--------------------------------------------------------------------------------
1 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
2 |
3 | import java.text.BreakIterator;
4 |
5 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
6 | import org.apache.uima.jcas.JCas;
7 | import org.uimafit.component.JCasAnnotator_ImplBase;
8 | import org.uimafit.descriptor.ConfigurationParameter;
9 |
10 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
11 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
12 |
13 | public class SimpleSegmenter extends JCasAnnotator_ImplBase{
14 | public static final String PARAM_TOKEN_BOUNDARY="TokenBoundary";
15 | public static final String PARAM_SENTENCE_BOUNDARY="SentenceBoundary";
16 | @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY,mandatory=false)
17 | private char sentenceBoundary = '\n';
18 | @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY,mandatory=false)
19 | private char tokenBoundary = ' ';
20 |
21 |
22 | @Override
23 | public void process(JCas aJCas) throws AnalysisEngineProcessException {
24 | String txt = aJCas.getDocumentText();
25 | int prevToken = 0;
26 | int prevSentence = 0;
27 | System.out.println(txt);
28 | int i =0;
29 | for (i=0;i<txt.length();i++){
30 |
31 | if (txt.charAt(i)==sentenceBoundary && i-prevSentence>0){
32 | Sentence s = new Sentence(aJCas,prevSentence,i);
33 | s.addToIndexes();
34 | prevSentence=i+1;
35 | Token t = new Token(aJCas,prevToken,i);
36 | t.addToIndexes();
37 | prevToken=i+1;
38 | }
39 | if (txt.charAt(i)==tokenBoundary && i-prevToken>0){
40 | Token t = new Token(aJCas,prevToken,i);
41 | t.addToIndexes();
42 | prevToken=i+1;
43 | }
44 |
45 | }
46 | if (i-prevSentence>0){
47 | Sentence s = new Sentence(aJCas,prevSentence,i);
48 | s.addToIndexes();
49 | Token t = new Token(aJCas,prevToken,i);
50 | t.addToIndexes();
51 | }
52 | }
53 |
54 |
55 | }
56 |
--------------------------------------------------------------------------------
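SimpleSegmenter assumes pre-segmented input: it creates a Sentence annotation at every sentence boundary character (newline by default) and a Token annotation at every token boundary character (space by default). A minimal uimaFIT sketch, essentially the commented-out code in TestSimpleReader (shown earlier) made runnable; "test.txt" is a placeholder:

    import java.io.File;

    import org.apache.commons.io.FileUtils;
    import org.apache.uima.analysis_engine.AnalysisEngine;
    import org.apache.uima.jcas.JCas;
    import org.uimafit.factory.AnalysisEngineFactory;
    import org.uimafit.factory.JCasFactory;
    import org.uimafit.pipeline.SimplePipeline;
    import org.uimafit.util.JCasUtil;

    import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
    import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

    public class SimpleSegmenterDemo {
        public static void main(String[] args) throws Exception {
            // one sentence per line, tokens separated by single spaces
            JCas jcas = JCasFactory.createJCas();
            jcas.setDocumentText(FileUtils.readFileToString(new File("test.txt")));
            AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
            SimplePipeline.runPipeline(jcas, segmenter);
            for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
                System.out.println(s.getCoveredText());
            }
        }
    }
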
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaDocumentBasedTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 | import static org.uimafit.util.JCasUtil.selectCovered;
28 |
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.List;
32 |
33 | import org.apache.uima.jcas.JCas;
34 |
35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 |
39 |
40 | public class GibbsLdaDocumentBasedTopicIdAnnotator
41 | extends GibbsLdaTopicIdAnnotator {
42 |
43 | @Override
44 | public List<String>[] getDocuments(JCas jcas) {
45 | Collection<Sentence> sentences = select(jcas, Sentence.class);
46 | @SuppressWarnings("unchecked")
47 | List<String>[] arr = new ArrayList[1];
48 | arr[0]= new ArrayList<String>();
49 | for (Sentence s : sentences) {
50 | for (Token t : selectCovered(Token.class, s)) {
51 | arr[0].add(t.getCoveredText());
52 | }
53 | }
54 |
55 | return arr;
56 | }
57 |
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/Segment_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 17:22:57 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.jcas.tcas.Annotation_Type;
13 |
14 | /**
15 | * Updated by JCasGen Fri Nov 08 17:22:57 CET 2013
16 | * @generated */
17 | public class Segment_Type extends Annotation_Type {
18 | /** @generated */
19 | @Override
20 | protected FSGenerator getFSGenerator() {return fsGenerator;}
21 | /** @generated */
22 | private final FSGenerator fsGenerator =
23 | new FSGenerator() {
24 | public FeatureStructure createFS(int addr, CASImpl cas) {
25 | if (Segment_Type.this.useExistingInstance) {
26 | // Return eq fs instance if already created
27 | FeatureStructure fs = Segment_Type.this.jcas.getJfsFromCaddr(addr);
28 | if (null == fs) {
29 | fs = new Segment(addr, Segment_Type.this);
30 | Segment_Type.this.jcas.putJfsFromCaddr(addr, fs);
31 | return fs;
32 | }
33 | return fs;
34 | } else return new Segment(addr, Segment_Type.this);
35 | }
36 | };
37 | /** @generated */
38 | @SuppressWarnings ("hiding")
39 | public final static int typeIndexID = Segment.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | @SuppressWarnings ("hiding")
43 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.Segment");
44 |
45 |
46 |
47 | /** initialize variables to correspond with Cas Type and Features
48 | * @generated */
49 | public Segment_Type(JCas jcas, Type casType) {
50 | super(jcas, casType);
51 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
52 |
53 | }
54 | }
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaSentenceBasedTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 | import static org.uimafit.util.JCasUtil.selectCovered;
28 |
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.List;
32 |
33 | import org.apache.uima.jcas.JCas;
34 |
35 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 | public class GibbsLdaSentenceBasedTopicIdAnnotator extends
39 | GibbsLdaTopicIdAnnotator {
40 |
41 |
42 | public List<String>[] getDocuments(JCas jcas) {
43 | Collection<Sentence> sentences = select(jcas, Sentence.class);
44 | @SuppressWarnings("unchecked")
45 | List<String>[] arr = new ArrayList[sentences.size()];
46 | int i = 0;
47 | for (Sentence s : select(jcas, Sentence.class)) {
48 | System.out.println(s.getCoveredText());
49 | }
50 | for (Sentence s : sentences) {
51 | StringBuffer line = new StringBuffer();
52 | arr[i] = new ArrayList<String>();
53 | for (Token t : selectCovered(Token.class, s)) {
54 | line.append(t.getCoveredText());
55 | line.append(" ");
56 | arr[i].add(t.getCoveredText());
57 | }
58 | i++;
59 | }
60 |
61 | return arr;
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDA.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import org.kohsuke.args4j.*;
32 |
33 | public class LDA {
34 |
35 | public static void main(String args[]){
36 | LDACmdOption option = new LDACmdOption();
37 | CmdLineParser parser = new CmdLineParser(option);
38 |
39 | try {
40 | if (args.length == 0){
41 | showHelp(parser);
42 | return;
43 | }
44 |
45 | parser.parseArgument(args);
46 |
47 | if (option.est || option.estc){
48 | Estimator estimator = new Estimator();
49 | estimator.init(option);
50 | estimator.estimate();
51 | }
52 | else if (option.inf){
53 | Inferencer inferencer = new Inferencer();
54 | inferencer.init(option);
55 |
56 | Model newModel = inferencer.inference();
57 |
58 | for (int i = 0; i < newModel.phi.length; ++i){
59 | //phi: K * V
60 | System.out.println("-----------------------\ntopic" + i + " : ");
61 | for (int j = 0; j < 10; ++j){
62 | System.out.println(inferencer.globalDict.id2word.get(j) + "\t" + newModel.phi[i][j]);
63 | }
64 | }
65 | }
66 | }
67 | catch (CmdLineException cle){
68 | System.out.println("Command line error: " + cle.getMessage());
69 | showHelp(parser);
70 | return;
71 | }
72 | catch (Exception e){
73 | System.out.println("Error in main: " + e.getMessage());
74 | e.printStackTrace();
75 | return;
76 | }
77 | }
78 |
79 | public static void showHelp(CmdLineParser parser){
80 | System.out.println("LDA [options ...] [arguments...]");
81 | parser.printUsage(System.out);
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Document.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.util.Vector;
32 |
33 | public class Document {
34 |
35 | //----------------------------------------------------
36 | //Instance Variables
37 | //----------------------------------------------------
38 | public int [] words;
39 | public String rawStr;
40 | public int length;
41 |
42 | //----------------------------------------------------
43 | //Constructors
44 | //----------------------------------------------------
45 | public Document(){
46 | words = null;
47 | rawStr = "";
48 | length = 0;
49 | }
50 |
51 | public Document(int length){
52 | this.length = length;
53 | rawStr = "";
54 | words = new int[length];
55 | }
56 |
57 | public Document(int length, int [] words){
58 | this.length = length;
59 | rawStr = "";
60 |
61 | this.words = new int[length];
62 | for (int i =0 ; i < length; ++i){
63 | this.words[i] = words[i];
64 | }
65 | }
66 |
67 | public Document(int length, int [] words, String rawStr){
68 | this.length = length;
69 | this.rawStr = rawStr;
70 |
71 | this.words = new int[length];
72 | for (int i =0 ; i < length; ++i){
73 | this.words[i] = words[i];
74 | }
75 | }
76 |
77 | public Document(Vector<Integer> doc){
78 | this.length = doc.size();
79 | rawStr = "";
80 | this.words = new int[length];
81 | for (int i = 0; i < length; i++){
82 | this.words[i] = doc.get(i);
83 | }
84 | }
85 |
86 | public Document(Vector<Integer> doc, String rawStr){
87 | this.length = doc.size();
88 | this.rawStr = rawStr;
89 | this.words = new int[length];
90 | for (int i = 0; i < length; ++i){
91 | this.words[i] = doc.get(i);
92 | }
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
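A Document is simply a fixed-length array of integer word ids plus an optional raw string. A minimal construction sketch (the ids and the text are made up):

    import java.util.Vector;
    import jgibbslda.Document;

    public class DocumentDemo {
        public static void main(String[] args) {
            Vector<Integer> ids = new Vector<Integer>();     // word ids as produced by the dataset's word map
            ids.add(0); ids.add(3); ids.add(3); ids.add(7);
            Document doc = new Document(ids, "some raw text"); // second argument keeps the raw string
            System.out.println(doc.length);                  // 4
        }
    }
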
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /** Saves the number of segments a document should consist of according to a given gold-standard.
14 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/Segment.xml
16 | * @generated */
17 | public class SegmentQuantity extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(SegmentQuantity.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected SegmentQuantity() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public SegmentQuantity(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public SegmentQuantity(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public SegmentQuantity(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: segmentCount
67 |
68 | /** getter for segmentCount - gets
69 | * @generated */
70 | public int getSegmentCount() {
71 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null)
72 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
73 | return jcasType.ll_cas.ll_getIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount);}
74 |
75 | /** setter for segmentCount - sets
76 | * @generated */
77 | public void setSegmentCount(int v) {
78 | if (SegmentQuantity_Type.featOkTst && ((SegmentQuantity_Type)jcasType).casFeat_segmentCount == null)
79 | jcasType.jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
80 | jcasType.ll_cas.ll_setIntValue(addr, ((SegmentQuantity_Type)jcasType).casFeatCode_segmentCount, v);}
81 | }
82 |
83 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Thu Apr 12 12:36:02 CEST 2012
15 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
16 | * @generated */
17 | public class Topic extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | public final static int typeIndexID = JCasRegistry.register(Topic.class);
22 | /** @generated
23 | * @ordered
24 | */
25 | public final static int type = typeIndexID;
26 | /** @generated */
27 | public int getTypeIndexID() {return typeIndexID;}
28 |
29 | /** Never called. Disable default constructor
30 | * @generated */
31 | protected Topic() {}
32 |
33 | /** Internal - constructor used by generator
34 | * @generated */
35 | public Topic(int addr, TOP_Type type) {
36 | super(addr, type);
37 | readObject();
38 | }
39 |
40 | /** @generated */
41 | public Topic(JCas jcas) {
42 | super(jcas);
43 | readObject();
44 | }
45 |
46 | /** @generated */
47 | public Topic(JCas jcas, int begin, int end) {
48 | super(jcas);
49 | setBegin(begin);
50 | setEnd(end);
51 | readObject();
52 | }
53 |
54 | /**
55 | * Write your own initialization here
56 | *
57 | @generated modifiable */
58 | private void readObject() {}
59 |
60 | //*--------------*
61 | //* Feature: topicId
62 |
63 | /** getter for topicId - gets
64 | * @generated */
65 | public int getTopicId() {
66 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null)
67 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
68 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId);}
69 |
70 | /** setter for topicId - sets
71 | * @generated */
72 | public void setTopicId(int v) {
73 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicId == null)
74 | jcasType.jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
75 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicId, v);}
76 |
77 |
78 | //*--------------*
79 | //* Feature: topicModeId
80 |
81 | /** getter for topicModeId - gets
82 | * @generated */
83 | public int getTopicModeId() {
84 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null)
85 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
86 | return jcasType.ll_cas.ll_getIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId);}
87 |
88 | /** setter for topicModeId - sets
89 | * @generated */
90 | public void setTopicModeId(int v) {
91 | if (Topic_Type.featOkTst && ((Topic_Type)jcasType).casFeat_topicModeId == null)
92 | jcasType.jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
93 | jcasType.ll_cas.ll_setIntValue(addr, ((Topic_Type)jcasType).casFeatCode_topicModeId, v);}
94 | }
95 |
96 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentQuantity_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:28:43 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /** Saves the number of segments a document should consist of according to a given gold-standard.
17 | * Updated by JCasGen Fri Nov 08 16:59:47 CET 2013
18 | * @generated */
19 | public class SegmentQuantity_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (SegmentQuantity_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = SegmentQuantity_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new SegmentQuantity(addr, SegmentQuantity_Type.this);
32 | SegmentQuantity_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new SegmentQuantity(addr, SegmentQuantity_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = SegmentQuantity.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
46 |
47 | /** @generated */
48 | final Feature casFeat_segmentCount;
49 | /** @generated */
50 | final int casFeatCode_segmentCount;
51 | /** @generated */
52 | public int getSegmentCount(int addr) {
53 | if (featOkTst && casFeat_segmentCount == null)
54 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
55 | return ll_cas.ll_getIntValue(addr, casFeatCode_segmentCount);
56 | }
57 | /** @generated */
58 | public void setSegmentCount(int addr, int v) {
59 | if (featOkTst && casFeat_segmentCount == null)
60 | jcas.throwFeatMissing("segmentCount", "de.tudarmstadt.ukp.dkpro.semantics.type.SegmentQuantity");
61 | ll_cas.ll_setIntValue(addr, casFeatCode_segmentCount, v);}
62 |
63 |
64 |
65 |
66 |
67 | /** initialize variables to correspond with Cas Type and Features
68 | * @generated */
69 | public SegmentQuantity_Type(JCas jcas, Type casType) {
70 | super(jcas, casType);
71 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
72 |
73 |
74 | casFeat_segmentCount = jcas.getRequiredFeatureDE(casType, "segmentCount", "uima.cas.Integer", featOkTst);
75 | casFeatCode_segmentCount = (null == casFeat_segmentCount) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_segmentCount).getCode();
76 |
77 | }
78 | }
79 |
80 |
81 |
82 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/test/java/de/tudarmstadt/langtech/lda/TestLdaTopicModelAnnotator.java:
--------------------------------------------------------------------------------
1 | package de.tudarmstadt.langtech.lda;
2 |
3 | import static org.uimafit.factory.AnalysisEngineFactory.createPrimitive;
4 |
5 | import java.io.IOException;
6 | import java.text.BreakIterator;
7 |
8 | import org.apache.uima.UIMAException;
9 | import org.apache.uima.analysis_engine.AnalysisEngine;
10 | import org.apache.uima.jcas.JCas;
11 | import org.uimafit.component.xwriter.CASDumpWriter;
12 | import org.uimafit.factory.AnalysisEngineFactory;
13 | import org.uimafit.factory.JCasFactory;
14 | import org.uimafit.pipeline.SimplePipeline;
15 |
16 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaDocumentBasedTopicIdAnnotator;
17 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaSentenceBasedTopicIdAnnotator;
18 | import de.tudarmstadt.langtech.lda.annotator.GibbsLdaTopicIdAnnotator;
19 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
20 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
21 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
22 |
23 | public class TestLdaTopicModelAnnotator {
24 | public static void main(String[] args) throws UIMAException, IOException {
25 | JCas jcas = getJCas();
26 |
27 | //sentence wise
28 | AnalysisEngine ae = AnalysisEngineFactory.createPrimitive(GibbsLdaSentenceBasedTopicIdAnnotator.class,
29 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
30 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model",
31 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true,
32 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true,
33 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100
34 | );
35 |
36 | //document wise
37 | AnalysisEngine ae2 = AnalysisEngineFactory.createPrimitive(GibbsLdaDocumentBasedTopicIdAnnotator.class,
38 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
39 | GibbsLdaTopicIdAnnotator.PARAM_LDA_MODEL_DIR, "src/test/resources/model",
40 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION,true,
41 | GibbsLdaTopicIdAnnotator.PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION,true,
42 | GibbsLdaTopicIdAnnotator.PARAM_LDA_REPEAT_INFERENCE, 100
43 | );
44 |
45 |
46 | AnalysisEngine out = createPrimitive(CASDumpWriter.class);
47 | SimplePipeline.runPipeline(jcas, ae,out);
48 | }
49 |
50 | private static JCas getJCas() throws UIMAException {
51 | JCas jcas = JCasFactory.createJCas();
52 | jcas.setDocumentLanguage("en");
53 | String text = "This is some example document. And there is more text";
54 | jcas.setDocumentText(text);
55 | DocumentMetaData metaData = new DocumentMetaData(jcas);
56 | metaData.setDocumentTitle("Titel");
57 | metaData.addToIndexes();
58 | BreakIterator boundary = BreakIterator.getWordInstance();
59 |
60 |
61 | // print each sentence in reverse order
62 | boundary.setText(text);
63 | int start = boundary.first();
64 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
65 | Token t = new Token(jcas, start, end);
66 | t.addToIndexes();
67 | }
68 | boundary = BreakIterator.getSentenceInstance();
69 | boundary.setText(text);
70 |
71 | start = boundary.first();
72 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
73 | Sentence t = new Sentence(jcas, start, end);
74 | t.addToIndexes();
75 | }
76 | return jcas;
77 |
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */
4 | package de.tudarmstadt.langtech.ml.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/GibbsLdaDescriptor.xml
16 | * @generated */
17 | public class GibbsLdaTopic extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(GibbsLdaTopic.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected GibbsLdaTopic() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public GibbsLdaTopic(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public GibbsLdaTopic(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public GibbsLdaTopic(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: topic
67 |
68 | /** getter for topic - gets
69 | * @generated */
70 | public int getTopic() {
71 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null)
72 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
73 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic);}
74 |
75 | /** setter for topic - sets
76 | * @generated */
77 | public void setTopic(int v) {
78 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_topic == null)
79 | jcasType.jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
80 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_topic, v);}
81 |
82 |
83 | //*--------------*
84 | //* Feature: termId
85 |
86 | /** getter for termId - gets
87 | * @generated */
88 | public int getTermId() {
89 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null)
90 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
91 | return jcasType.ll_cas.ll_getIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId);}
92 |
93 | /** setter for termId - sets
94 | * @generated */
95 | public void setTermId(int v) {
96 | if (GibbsLdaTopic_Type.featOkTst && ((GibbsLdaTopic_Type)jcasType).casFeat_termId == null)
97 | jcasType.jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
98 | jcasType.ll_cas.ll_setIntValue(addr, ((GibbsLdaTopic_Type)jcasType).casFeatCode_termId, v);}
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */
4 | package de.tudarmstadt.langtech.semantics.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 |
12 |
13 | /**
14 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015
15 | * XML source: /Users/riedl/work/workspaces/intern4.2/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/resources/desc/type/SegmentScore.xml
16 | * @generated */
17 | public class SegmentScore extends Annotation {
18 | /** @generated
19 | * @ordered
20 | */
21 | @SuppressWarnings ("hiding")
22 | public final static int typeIndexID = JCasRegistry.register(SegmentScore.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | @SuppressWarnings ("hiding")
27 | public final static int type = typeIndexID;
28 | /** @generated */
29 | @Override
30 | public int getTypeIndexID() {return typeIndexID;}
31 |
32 | /** Never called. Disable default constructor
33 | * @generated */
34 | protected SegmentScore() {/* intentionally empty block */}
35 |
36 | /** Internal - constructor used by generator
37 | * @generated */
38 | public SegmentScore(int addr, TOP_Type type) {
39 | super(addr, type);
40 | readObject();
41 | }
42 |
43 | /** @generated */
44 | public SegmentScore(JCas jcas) {
45 | super(jcas);
46 | readObject();
47 | }
48 |
49 | /** @generated */
50 | public SegmentScore(JCas jcas, int begin, int end) {
51 | super(jcas);
52 | setBegin(begin);
53 | setEnd(end);
54 | readObject();
55 | }
56 |
57 | /**
58 | * Write your own initialization here
59 | *
60 | @generated modifiable */
61 | private void readObject() {/*default - does nothing empty block */}
62 |
63 |
64 |
65 | //*--------------*
66 | //* Feature: score
67 |
68 | /** getter for score - gets
69 | * @generated */
70 | public double getScore() {
71 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null)
72 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
73 | return jcasType.ll_cas.ll_getDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score);}
74 |
75 | /** setter for score - sets
76 | * @generated */
77 | public void setScore(double v) {
78 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_score == null)
79 | jcasType.jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
80 | jcasType.ll_cas.ll_setDoubleValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_score, v);}
81 |
82 |
83 | //*--------------*
84 | //* Feature: similarityScores
85 |
86 | /** getter for similarityScores - gets
87 | * @generated */
88 | public String getSimilarityScores() {
89 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null)
90 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
91 | return jcasType.ll_cas.ll_getStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores);}
92 |
93 | /** setter for similarityScores - sets
94 | * @generated */
95 | public void setSimilarityScores(String v) {
96 | if (SegmentScore_Type.featOkTst && ((SegmentScore_Type)jcasType).casFeat_similarityScores == null)
97 | jcasType.jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
98 | jcasType.ll_cas.ll_setStringValue(addr, ((SegmentScore_Type)jcasType).casFeatCode_similarityScores, v);}
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/OutputSegments.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see .
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
25 |
26 | import java.io.FileNotFoundException;
27 | import java.io.PrintStream;
28 | import java.util.Collection;
29 |
30 | import org.apache.commons.lang.StringEscapeUtils;
31 | import org.apache.uima.UimaContext;
32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
33 | import org.apache.uima.jcas.JCas;
34 | import org.apache.uima.resource.ResourceInitializationException;
35 | import org.uimafit.component.JCasAnnotator_ImplBase;
36 | import org.uimafit.descriptor.ConfigurationParameter;
37 | import org.uimafit.util.JCasUtil;
38 |
39 | import de.tudarmstadt.langtech.semantics.type.SegmentScore;
40 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
41 |
42 | public class OutputSegments extends JCasAnnotator_ImplBase {
43 | public static final String PARAM_OUTPUT = "Output";
44 | @ConfigurationParameter(name = PARAM_OUTPUT, mandatory = false)
45 | private String output;
46 | private PrintStream ps;
47 | @Override
48 | public void initialize(UimaContext context)
49 | throws ResourceInitializationException {
50 | super.initialize(context);
51 | if(output==null){
52 | ps = System.out;
53 | }else{
54 | try {
55 | ps = new PrintStream(output);
56 | } catch (FileNotFoundException e) {
57 | e.printStackTrace();
58 | }
59 | }
60 | 		ps.println("<documents>");
61 | }
62 |
63 | @Override
64 | public void process(JCas aJCas) throws AnalysisEngineProcessException {
65 | 		ps.println("<document>");
66 | 		ps.println("<documentName>"+DocumentMetaData.get(aJCas).getDocumentTitle()+"</documentName>");
67 | ps.println("");
68 | 		Collection<SegmentScore> ss = JCasUtil.select(aJCas, SegmentScore.class);
69 | int i = 0;
70 | for (SegmentScore s : ss) {
71 | if(i==0){
72 | if(s.getBegin()!=0){
73 | 				ps.println("<segment>");
74 | 				ps.println("<depthScore></depthScore>");
75 | 				ps.println("<text>");
76 | 				ps.println(StringEscapeUtils.escapeXml(aJCas.getDocumentText().substring(0,s.getBegin())));
77 | 				ps.println("</text>");
78 | 				ps.println("</segment>");
79 | }
80 | }
81 | 			ps.println("<segment>");
82 | //			ps.println("<similarityScores>"+s.getSimilarityScores()+"</similarityScores>");
83 | 			ps.println("<depthScore>"+s.getScore()+"</depthScore>");
84 | 			ps.println("<text>");
85 | 			ps.println(StringEscapeUtils.escapeXml(s.getCoveredText()));
86 | 			ps.println("</text>");
87 | 			ps.println("</segment>");
88 | i+=1;
89 | }
90 | ps.println("");
91 | 		ps.println("</document>");
92 | }
93 | @Override
94 | public void collectionProcessComplete()
95 | throws AnalysisEngineProcessException {
96 | 		ps.println("</documents>");
97 | ps.close();
98 | super.collectionProcessComplete();
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/Topic_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Tue Feb 21 09:57:17 CET 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class Topic_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (Topic_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = Topic_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new Topic(addr, Topic_Type.this);
31 | Topic_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new Topic(addr, Topic_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = Topic.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.Topic");
43 |
44 |
45 |
46 | /** @generated */
47 | final Feature casFeat_topicId;
48 | /** @generated */
49 | final int casFeatCode_topicId;
50 | /** @generated */
51 | public int getTopicId(int addr) {
52 | if (featOkTst && casFeat_topicId == null)
53 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
54 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicId);
55 | }
56 | /** @generated */
57 | public void setTopicId(int addr, int v) {
58 | if (featOkTst && casFeat_topicId == null)
59 | jcas.throwFeatMissing("topicId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
60 | ll_cas.ll_setIntValue(addr, casFeatCode_topicId, v);}
61 |
62 |
63 |
64 | /** @generated */
65 | final Feature casFeat_topicModeId;
66 | /** @generated */
67 | final int casFeatCode_topicModeId;
68 | /** @generated */
69 | public int getTopicModeId(int addr) {
70 | if (featOkTst && casFeat_topicModeId == null)
71 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
72 | return ll_cas.ll_getIntValue(addr, casFeatCode_topicModeId);
73 | }
74 | /** @generated */
75 | public void setTopicModeId(int addr, int v) {
76 | if (featOkTst && casFeat_topicModeId == null)
77 | jcas.throwFeatMissing("topicModeId", "de.tudarmstadt.ukp.dkpro.lda.type.Topic");
78 | ll_cas.ll_setIntValue(addr, casFeatCode_topicModeId, v);}
79 |
80 |
81 |
82 |
83 |
84 | /** initialize variables to correspond with Cas Type and Features
85 | * @generated */
86 | public Topic_Type(JCas jcas, Type casType) {
87 | super(jcas, casType);
88 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
89 |
90 |
91 | casFeat_topicId = jcas.getRequiredFeatureDE(casType, "topicId", "uima.cas.Integer", featOkTst);
92 | casFeatCode_topicId = (null == casFeat_topicId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicId).getCode();
93 |
94 |
95 | casFeat_topicModeId = jcas.getRequiredFeatureDE(casType, "topicModeId", "uima.cas.Integer", featOkTst);
96 | casFeatCode_topicModeId = (null == casFeat_topicModeId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicModeId).getCode();
97 |
98 | }
99 | }
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/ml/lda/type/GibbsLdaTopic_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:28:12 CET 2013 */
3 | package de.tudarmstadt.langtech.ml.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Fri Nov 08 16:59:29 CET 2013
18 | * @generated */
19 | public class GibbsLdaTopic_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (GibbsLdaTopic_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = GibbsLdaTopic_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this);
32 | GibbsLdaTopic_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new GibbsLdaTopic(addr, GibbsLdaTopic_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = GibbsLdaTopic.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
46 |
47 | /** @generated */
48 | final Feature casFeat_topic;
49 | /** @generated */
50 | final int casFeatCode_topic;
51 | /** @generated */
52 | public int getTopic(int addr) {
53 | if (featOkTst && casFeat_topic == null)
54 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
55 | return ll_cas.ll_getIntValue(addr, casFeatCode_topic);
56 | }
57 | /** @generated */
58 | public void setTopic(int addr, int v) {
59 | if (featOkTst && casFeat_topic == null)
60 | jcas.throwFeatMissing("topic", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
61 | ll_cas.ll_setIntValue(addr, casFeatCode_topic, v);}
62 |
63 |
64 |
65 | /** @generated */
66 | final Feature casFeat_termId;
67 | /** @generated */
68 | final int casFeatCode_termId;
69 | /** @generated */
70 | public int getTermId(int addr) {
71 | if (featOkTst && casFeat_termId == null)
72 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
73 | return ll_cas.ll_getIntValue(addr, casFeatCode_termId);
74 | }
75 | /** @generated */
76 | public void setTermId(int addr, int v) {
77 | if (featOkTst && casFeat_termId == null)
78 | jcas.throwFeatMissing("termId", "de.tudarmstadt.ukp.dkpro.ml.lda.type.GibbsLdaTopic");
79 | ll_cas.ll_setIntValue(addr, casFeatCode_termId, v);}
80 |
81 |
82 |
83 |
84 |
85 | /** initialize variables to correspond with Cas Type and Features
86 | * @generated */
87 | public GibbsLdaTopic_Type(JCas jcas, Type casType) {
88 | super(jcas, casType);
89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
90 |
91 |
92 | casFeat_topic = jcas.getRequiredFeatureDE(casType, "topic", "uima.cas.Integer", featOkTst);
93 | casFeatCode_topic = (null == casFeat_topic) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topic).getCode();
94 |
95 |
96 | casFeat_termId = jcas.getRequiredFeatureDE(casType, "termId", "uima.cas.Integer", featOkTst);
97 | casFeatCode_termId = (null == casFeat_termId) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_termId).getCode();
98 |
99 | }
100 | }
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicModelAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.lda.annotator;
26 |
27 | import java.util.ArrayList;
28 | import java.util.List;
29 |
30 | import jgibbslda.Dictionary;
31 | import jgibbslda.Inferencer;
32 | import jgibbslda.LDACmdOption;
33 | import jgibbslda.Model;
34 |
35 | import org.apache.uima.UimaContext;
36 | import org.apache.uima.resource.ResourceInitializationException;
37 | import org.uimafit.component.JCasAnnotator_ImplBase;
38 | import org.uimafit.descriptor.ConfigurationParameter;
39 |
40 | /**
41 | * @author Martin Riedl
42 | */
43 | public abstract class GibbsLdaTopicModelAnnotator extends JCasAnnotator_ImplBase{
44 | public static final String PARAM_LDA_MODEL_DIR = "LdaModelDir";
45 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
46 | public static final String PARAM_LDA_INFERENCE_ITERATIONS = "LdaInferenceIterations";
47 | public static final String PARAM_LDA_INFERENCE_SAVE_PATH = "LdaInferenceSavePath";
48 |
49 |
50 |
51 | @ConfigurationParameter(name = PARAM_LDA_INFERENCE_SAVE_PATH, mandatory = false)
52 | private String ldaInferenceSavePath;
53 | private String ldaInferenceSaveName;
54 |
55 | public String getLdaInferenceSaveName() {
56 | return ldaInferenceSaveName;
57 | }
58 | public void setLdaInferenceSaveName(String ldaInferenceSaveName) {
59 | this.ldaInferenceSaveName = ldaInferenceSaveName;
60 | }
61 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIR, mandatory = true)
62 | private String ldaModelDir;
63 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
64 | private String ldaModelName;
65 | 	@ConfigurationParameter(name = PARAM_LDA_INFERENCE_ITERATIONS, mandatory = false, description = "Inference iterations used to build the topic distribution for a new model", defaultValue = "100")
66 | private int ldaInferenceIteration;
67 |
68 | private Inferencer inferencer;
69 |
70 | // public Model inference(String[] documents) {
71 | // Model m = inferencer.inference(documents);
72 | // if(ldaInferenceSavePath!=null){
73 | // m.dir = ldaInferenceSavePath;
74 | // m.saveModel("inference_"+ldaInferenceSaveName);
75 | // }
76 | // return m;
77 | // }
78 |
79 | public Model inference(List[] documents) {
80 | Model m = inferencer.inference(documents);
81 | if(ldaInferenceSavePath!=null){
82 | m.dir = ldaInferenceSavePath;
83 | m.saveModel("inference_"+ldaInferenceSaveName);
84 | }
85 | return m;
86 | }
87 | public int getInferenceNiters() {
88 | return inferencer.niters;
89 | }
90 |
91 | public ArrayList getInferenceModeValues() {
92 | return inferencer.values;
93 | }
94 |
95 | public Dictionary getInferencerGlobalDict(){
96 | return inferencer.globalDict;
97 | }
98 |
99 |
100 | @Override
101 | public void initialize(UimaContext context)
102 | throws ResourceInitializationException {
103 | super.initialize(context);
104 | LDACmdOption options = new LDACmdOption();
105 | options.dir = ldaModelDir;
106 | options.modelName = ldaModelName;
107 | options.niters = ldaInferenceIteration;
108 | 		// Initialize inferencer
109 | inferencer = new Inferencer();
110 | inferencer.init(options);
111 | }
112 |
113 |
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/README.txt:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------
2 | | TopicTiling |
3 | ----------------------------------------------------
4 |
5 | TopicTiling is an LDA-based text segmentation algorithm.
6 | This algorithm is based on the well-known TextTiling
7 | algorithm, and segments documents using the Latent
8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs
9 | the segmentation in linear time and thus is computationally
10 | less expensive than other LDA-based segmentation methods.
11 |
12 | USE:
13 |
14 | The tool has been developed and tested on Unix-based systems.
15 | As TopicTiling is written in Java, it should also run on Windows
16 | machines. To run TopicTiling, uncompress the zip file and execute
17 | topictiling.sh (Unix-based systems) or topictiling.bat
18 | (Windows-based systems). The output is given in an XML format
19 | with suggested topical boundaries.
20 |
21 | HINT FOR NON-LATIN LANGUAGES:
22 | If you want to process languages such as Chinese or Arabic with TopicTiling,
23 | you have to provide tokenized text (both for TopicTiling and GibbsLDA)
24 | and in addition use the flag -s, which disables the Stanford tokenization
25 | and instead uses a simple whitespace tokenizer that expects one sentence
26 | per line.
27 |
28 |
29 | The parameters of the script are shown when no parameters are given:
30 |
31 | [java] Option "-fd" is required
32 | [java] java -jar myprogram.jar [options...] arguments...
33 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used
34 | [java] (default false)
35 | [java]  -fd VAL   : Directory of the test files
36 | [java] -fp VAL : File pattern for the test files
37 | [java] -i N : Number of inference iterations used to annotate words with topic
38 | [java] IDs (default 100)
39 | [java] -m : Use mode counting (true/false) (default=true)
40 | [java] -out VAL : File the content is written to (otherwise stdout will be used)
41 | [java] -ri N : Use the repeated inference method
42 | [java] -rs N : Use the repeated segmentation
43 | [java] -s : Use simple segmentation (default=false)
44 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used)
45 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used)
46 | [java] -w N : Window size used to calculate the sentence similarity
47 |
48 | The parameters -fp, -fd, -tmd, and -tmn are the ones that have to be specified,
49 | and -ri should typically be set to about 5 repeated inferences.
50 |
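For example, a call could look as follows (the model directory and the input
directory are placeholders that have to be adapted to your setup; "model-final"
is the model name that GibbsLDA writes for its final iteration):

  sh topictiling.sh -tmd /path/to/lda_model -tmn model-final -fd /path/to/documents \
     -fp "*.txt" -ri 5 -out segmentation.xml
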
51 | For the algorithm it is important to have a trained LDA model. The model should
52 | be from a domain similar to the data you apply the algorithm to. You have to train
53 | it yourself using GibbsLDA++ or JGibbsLDA (http://gibbslda.sourceforge.net/). They
54 | both have the same output format. The output of the algorithm is given in XML
55 | and looks like:
56 | <documents>
57 | <document>
58 | <documentName>…</documentName>
59 | <segment>
60 | <depthScore>score</depthScore>
61 | <text>…</text>
62 | </segment>
63 | …
64 | </document>
65 | </documents>
66 | 
67 | The code returns all possible boundary positions (all maxima). If the number of
68 | segments is known, select the N highest depthScore values as boundary positions.
69 |
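A small sketch of such a selection is given below. It is only an illustration and not
part of TopicTiling (the file name result.xml and the class name are placeholders), and
it assumes that the depth scores appear in depthScore elements as in the example above;
it collects all depthScore values from the XML output and prints the N highest-scoring
boundary positions:

  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.Comparator;
  import java.util.List;
  import javax.xml.parsers.DocumentBuilderFactory;
  import org.w3c.dom.Document;
  import org.w3c.dom.NodeList;

  public class SelectTopBoundaries {
      // usage: java SelectTopBoundaries result.xml 5
      public static void main(String[] args) throws Exception {
          int n = Integer.parseInt(args[1]);
          Document doc = DocumentBuilderFactory.newInstance()
                  .newDocumentBuilder().parse(new java.io.File(args[0]));
          NodeList scores = doc.getElementsByTagName("depthScore");
          // collect (score, position) pairs; skip empty score elements
          List<double[]> candidates = new ArrayList<double[]>();
          for (int i = 0; i < scores.getLength(); i++) {
              String text = scores.item(i).getTextContent().trim();
              if (text.isEmpty()) continue;
              candidates.add(new double[] { Double.parseDouble(text), i });
          }
          // sort by depth score, highest first
          Collections.sort(candidates, new Comparator<double[]>() {
              public int compare(double[] a, double[] b) {
                  return Double.compare(b[0], a[0]);
              }
          });
          // the first n entries are the suggested boundary positions
          for (int i = 0; i < n && i < candidates.size(); i++) {
              System.out.println("boundary after segment " + (int) candidates.get(i)[1]
                      + " (depthScore " + candidates.get(i)[0] + ")");
          }
      }
  }
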
70 |
71 | LICENSE:
72 |
73 | The software is released under GPL 3.0
74 |
75 | PAPERS:
76 |
77 |
78 | Riedl, M., Biemann, C. (2012): Text Segmentation with Topic Models. Journal for Language Technology and Computational Linguistics (JLCL), Vol. 27, No. 1, pp. 47--70, August 2012 (pdf)
79 | Riedl M., Biemann C. (2012): How Text Segmentation Algorithms Gain from Topic Models, Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT 2012), Montreal, Canada. (pdf)
80 | Riedl M., Biemann C. (2012): TopicTiling: A Text Segmentation Algorithm based on LDA, Proceedings of the Student Research Workshop of the 50th Meeting of the Association for Computational Linguistics, Jeju, Republic of Korea. (pdf)
81 | Riedl M., Biemann C. (2012): Sweeping through the Topic Space: Bad luck? Roll again! In Proceedings of the Joint Workshop on Unsupervised and Semi-Supervised Learning in NLP held in conjunction with EACL 2012, Avignon, France (pdf)
82 |
83 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Wed Apr 11 15:17:37 CEST 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 | import org.apache.uima.jcas.cas.DoubleArray;
12 |
13 |
14 | /**
15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
17 | * @generated */
18 | public class TopicDistribution extends Annotation {
19 | /** @generated
20 | * @ordered
21 | */
22 | public final static int typeIndexID = JCasRegistry.register(TopicDistribution.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | public final static int type = typeIndexID;
27 | /** @generated */
28 | public int getTypeIndexID() {return typeIndexID;}
29 |
30 | /** Never called. Disable default constructor
31 | * @generated */
32 | protected TopicDistribution() {}
33 |
34 | /** Internal - constructor used by generator
35 | * @generated */
36 | public TopicDistribution(int addr, TOP_Type type) {
37 | super(addr, type);
38 | readObject();
39 | }
40 |
41 | /** @generated */
42 | public TopicDistribution(JCas jcas) {
43 | super(jcas);
44 | readObject();
45 | }
46 |
47 | /** @generated */
48 | public TopicDistribution(JCas jcas, int begin, int end) {
49 | super(jcas);
50 | setBegin(begin);
51 | setEnd(end);
52 | readObject();
53 | }
54 |
55 | /**
56 | * Write your own initialization here
57 | *
58 | @generated modifiable */
59 | private void readObject() {}
60 |
61 |
62 |
63 | //*--------------*
64 | //* Feature: topicDistribution
65 |
66 | /** getter for topicDistribution - gets
67 | * @generated */
68 | public DoubleArray getTopicDistribution() {
69 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));}
72 |
73 | /** setter for topicDistribution - sets
74 | * @generated */
75 | public void setTopicDistribution(DoubleArray v) {
76 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
78 | jcasType.ll_cas.ll_setRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));}
79 |
80 | /** indexed getter for topicDistribution - gets an indexed value -
81 | * @generated */
82 | public double getTopicDistribution(int i) {
83 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);}
87 |
88 | /** indexed setter for topicDistribution - sets an indexed value -
89 | * @generated */
90 | public void setTopicDistribution(int i, double v) {
91 | if (TopicDistribution_Type.featOkTst && ((TopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((TopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);}
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/type/SegmentScore_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Fri Nov 08 16:51:38 CET 2013 */
3 | package de.tudarmstadt.langtech.semantics.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Wed Aug 26 15:50:04 CEST 2015
18 | * @generated */
19 | public class SegmentScore_Type extends Annotation_Type {
20 | /** @generated */
21 | @Override
22 | protected FSGenerator getFSGenerator() {return fsGenerator;}
23 | /** @generated */
24 | private final FSGenerator fsGenerator =
25 | new FSGenerator() {
26 | public FeatureStructure createFS(int addr, CASImpl cas) {
27 | if (SegmentScore_Type.this.useExistingInstance) {
28 | // Return eq fs instance if already created
29 | FeatureStructure fs = SegmentScore_Type.this.jcas.getJfsFromCaddr(addr);
30 | if (null == fs) {
31 | fs = new SegmentScore(addr, SegmentScore_Type.this);
32 | SegmentScore_Type.this.jcas.putJfsFromCaddr(addr, fs);
33 | return fs;
34 | }
35 | return fs;
36 | } else return new SegmentScore(addr, SegmentScore_Type.this);
37 | }
38 | };
39 | /** @generated */
40 | @SuppressWarnings ("hiding")
41 | public final static int typeIndexID = SegmentScore.typeIndexID;
42 | /** @generated
43 | @modifiable */
44 | @SuppressWarnings ("hiding")
45 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.langtech.semantics.type.SegmentScore");
46 |
47 | /** @generated */
48 | final Feature casFeat_score;
49 | /** @generated */
50 | final int casFeatCode_score;
51 | /** @generated */
52 | public double getScore(int addr) {
53 | if (featOkTst && casFeat_score == null)
54 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
55 | return ll_cas.ll_getDoubleValue(addr, casFeatCode_score);
56 | }
57 | /** @generated */
58 | public void setScore(int addr, double v) {
59 | if (featOkTst && casFeat_score == null)
60 | jcas.throwFeatMissing("score", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
61 | ll_cas.ll_setDoubleValue(addr, casFeatCode_score, v);}
62 |
63 |
64 |
65 | /** @generated */
66 | final Feature casFeat_similarityScores;
67 | /** @generated */
68 | final int casFeatCode_similarityScores;
69 | /** @generated */
70 | public String getSimilarityScores(int addr) {
71 | if (featOkTst && casFeat_similarityScores == null)
72 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
73 | return ll_cas.ll_getStringValue(addr, casFeatCode_similarityScores);
74 | }
75 | /** @generated */
76 | public void setSimilarityScores(int addr, String v) {
77 | if (featOkTst && casFeat_similarityScores == null)
78 | jcas.throwFeatMissing("similarityScores", "de.tudarmstadt.langtech.semantics.type.SegmentScore");
79 | ll_cas.ll_setStringValue(addr, casFeatCode_similarityScores, v);}
80 |
81 |
82 |
83 |
84 |
85 | /** initialize variables to correspond with Cas Type and Features
86 | * @generated */
87 | public SegmentScore_Type(JCas jcas, Type casType) {
88 | super(jcas, casType);
89 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
90 |
91 |
92 | casFeat_score = jcas.getRequiredFeatureDE(casType, "score", "uima.cas.Double", featOkTst);
93 | casFeatCode_score = (null == casFeat_score) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_score).getCode();
94 |
95 |
96 | casFeat_similarityScores = jcas.getRequiredFeatureDE(casType, "similarityScores", "uima.cas.String", featOkTst);
97 | casFeatCode_similarityScores = (null == casFeat_similarityScores) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_similarityScores).getCode();
98 |
99 | }
100 | }
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */
4 | package de.tudarmstadt.langtech.lda.type;
5 |
6 | import org.apache.uima.jcas.JCas;
7 | import org.apache.uima.jcas.JCasRegistry;
8 | import org.apache.uima.jcas.cas.TOP_Type;
9 |
10 | import org.apache.uima.jcas.tcas.Annotation;
11 | import org.apache.uima.jcas.cas.DoubleArray;
12 |
13 |
14 | /**
15 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
16 | * XML source: /home/riedl/work/workspace/de.tudarmstadt.ukp.dkpro.lda/src/main/resources/desc/type/gibbsldatypes.xml
17 | * @generated */
18 | public class WordTopicDistribution extends Annotation {
19 | /** @generated
20 | * @ordered
21 | */
22 | public final static int typeIndexID = JCasRegistry.register(WordTopicDistribution.class);
23 | /** @generated
24 | * @ordered
25 | */
26 | public final static int type = typeIndexID;
27 | /** @generated */
28 | public int getTypeIndexID() {return typeIndexID;}
29 |
30 | /** Never called. Disable default constructor
31 | * @generated */
32 | protected WordTopicDistribution() {}
33 |
34 | /** Internal - constructor used by generator
35 | * @generated */
36 | public WordTopicDistribution(int addr, TOP_Type type) {
37 | super(addr, type);
38 | readObject();
39 | }
40 |
41 | /** @generated */
42 | public WordTopicDistribution(JCas jcas) {
43 | super(jcas);
44 | readObject();
45 | }
46 |
47 | /** @generated */
48 | public WordTopicDistribution(JCas jcas, int begin, int end) {
49 | super(jcas);
50 | setBegin(begin);
51 | setEnd(end);
52 | readObject();
53 | }
54 |
55 | /**
56 | * Write your own initialization here
57 | *
58 | @generated modifiable */
59 | private void readObject() {}
60 |
61 |
62 |
63 | //*--------------*
64 | //* Feature: topicDistribution
65 |
66 | /** getter for topicDistribution - gets
67 | * @generated */
68 | public DoubleArray getTopicDistribution() {
69 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
70 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
71 | return (DoubleArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution)));}
72 |
73 | /** setter for topicDistribution - sets
74 | * @generated */
75 | public void setTopicDistribution(DoubleArray v) {
76 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
77 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
78 | jcasType.ll_cas.ll_setRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution, jcasType.ll_cas.ll_getFSRef(v));}
79 |
80 | /** indexed getter for topicDistribution - gets an indexed value -
81 | * @generated */
82 | public double getTopicDistribution(int i) {
83 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
84 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
85 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
86 | return jcasType.ll_cas.ll_getDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);}
87 |
88 | /** indexed setter for topicDistribution - sets an indexed value -
89 | * @generated */
90 | public void setTopicDistribution(int i, double v) {
91 | if (WordTopicDistribution_Type.featOkTst && ((WordTopicDistribution_Type)jcasType).casFeat_topicDistribution == null)
92 | jcasType.jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
93 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i);
94 | jcasType.ll_cas.ll_setDoubleArrayValue(jcasType.ll_cas.ll_getRefValue(addr, ((WordTopicDistribution_Type)jcasType).casFeatCode_topicDistribution), i, v);}
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/TopicDistribution_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Wed Apr 11 15:17:37 CEST 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class TopicDistribution_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (TopicDistribution_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = TopicDistribution_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new TopicDistribution(addr, TopicDistribution_Type.this);
31 | TopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new TopicDistribution(addr, TopicDistribution_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = TopicDistribution.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
43 |
44 | /** @generated */
45 | final Feature casFeat_topicDistribution;
46 | /** @generated */
47 | final int casFeatCode_topicDistribution;
48 | /** @generated */
49 | public int getTopicDistribution(int addr) {
50 | if (featOkTst && casFeat_topicDistribution == null)
51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution);
53 | }
54 | /** @generated */
55 | public void setTopicDistribution(int addr, int v) {
56 | if (featOkTst && casFeat_topicDistribution == null)
57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);}
59 |
60 | /** @generated */
61 | public double getTopicDistribution(int addr, int i) {
62 | if (featOkTst && casFeat_topicDistribution == null)
63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
64 | if (lowLevelTypeChecks)
65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true);
66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
68 | }
69 |
70 | /** @generated */
71 | public void setTopicDistribution(int addr, int i, double v) {
72 | if (featOkTst && casFeat_topicDistribution == null)
73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.TopicDistribution");
74 | if (lowLevelTypeChecks)
75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true);
76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
77 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v);
78 | }
79 |
80 |
81 |
82 |
83 | /** initialize variables to correspond with Cas Type and Features
84 | * @generated */
85 | public TopicDistribution_Type(JCas jcas, Type casType) {
86 | super(jcas, casType);
87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
88 |
89 |
90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst);
91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode();
92 |
93 | }
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/type/WordTopicDistribution_Type.java:
--------------------------------------------------------------------------------
1 |
2 | /* First created by JCasGen Thu Apr 12 12:36:03 CEST 2012 */
3 | package de.tudarmstadt.langtech.lda.type;
4 |
5 | import org.apache.uima.jcas.JCas;
6 | import org.apache.uima.jcas.JCasRegistry;
7 | import org.apache.uima.cas.impl.CASImpl;
8 | import org.apache.uima.cas.impl.FSGenerator;
9 | import org.apache.uima.cas.FeatureStructure;
10 | import org.apache.uima.cas.impl.TypeImpl;
11 | import org.apache.uima.cas.Type;
12 | import org.apache.uima.cas.impl.FeatureImpl;
13 | import org.apache.uima.cas.Feature;
14 | import org.apache.uima.jcas.tcas.Annotation_Type;
15 |
16 | /**
17 | * Updated by JCasGen Thu Apr 12 12:36:03 CEST 2012
18 | * @generated */
19 | public class WordTopicDistribution_Type extends Annotation_Type {
20 | /** @generated */
21 | protected FSGenerator getFSGenerator() {return fsGenerator;}
22 | /** @generated */
23 | private final FSGenerator fsGenerator =
24 | new FSGenerator() {
25 | public FeatureStructure createFS(int addr, CASImpl cas) {
26 | if (WordTopicDistribution_Type.this.useExistingInstance) {
27 | // Return eq fs instance if already created
28 | FeatureStructure fs = WordTopicDistribution_Type.this.jcas.getJfsFromCaddr(addr);
29 | if (null == fs) {
30 | fs = new WordTopicDistribution(addr, WordTopicDistribution_Type.this);
31 | WordTopicDistribution_Type.this.jcas.putJfsFromCaddr(addr, fs);
32 | return fs;
33 | }
34 | return fs;
35 | } else return new WordTopicDistribution(addr, WordTopicDistribution_Type.this);
36 | }
37 | };
38 | /** @generated */
39 | public final static int typeIndexID = WordTopicDistribution.typeIndexID;
40 | /** @generated
41 | @modifiable */
42 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
43 |
44 | /** @generated */
45 | final Feature casFeat_topicDistribution;
46 | /** @generated */
47 | final int casFeatCode_topicDistribution;
48 | /** @generated */
49 | public int getTopicDistribution(int addr) {
50 | if (featOkTst && casFeat_topicDistribution == null)
51 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
52 | return ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution);
53 | }
54 | /** @generated */
55 | public void setTopicDistribution(int addr, int v) {
56 | if (featOkTst && casFeat_topicDistribution == null)
57 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
58 | ll_cas.ll_setRefValue(addr, casFeatCode_topicDistribution, v);}
59 |
60 | /** @generated */
61 | public double getTopicDistribution(int addr, int i) {
62 | if (featOkTst && casFeat_topicDistribution == null)
63 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
64 | if (lowLevelTypeChecks)
65 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, true);
66 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
67 | return ll_cas.ll_getDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
68 | }
69 |
70 | /** @generated */
71 | public void setTopicDistribution(int addr, int i, double v) {
72 | if (featOkTst && casFeat_topicDistribution == null)
73 | jcas.throwFeatMissing("topicDistribution", "de.tudarmstadt.ukp.dkpro.lda.type.WordTopicDistribution");
74 | if (lowLevelTypeChecks)
75 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v, true);
76 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i);
77 | ll_cas.ll_setDoubleArrayValue(ll_cas.ll_getRefValue(addr, casFeatCode_topicDistribution), i, v);
78 | }
79 |
80 |
81 |
82 |
83 | /** initialize variables to correspond with Cas Type and Features
84 | * @generated */
85 | public WordTopicDistribution_Type(JCas jcas, Type casType) {
86 | super(jcas, casType);
87 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
88 |
89 |
90 | casFeat_topicDistribution = jcas.getRequiredFeatureDE(casType, "topicDistribution", "uima.cas.DoubleArray", featOkTst);
91 | casFeatCode_topicDistribution = (null == casFeat_topicDistribution) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_topicDistribution).getCode();
92 |
93 | }
94 | }
95 |
96 |
97 |
98 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/test/java/RunTopicTilingOnFile.java:
--------------------------------------------------------------------------------
1 |
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.uima.UIMAException;
6 | import org.apache.uima.analysis_engine.AnalysisEngine;
7 | import org.apache.uima.collection.CollectionReader;
8 | import org.apache.uima.resource.ResourceInitializationException;
9 | import org.kohsuke.args4j.CmdLineException;
10 | import org.kohsuke.args4j.CmdLineParser;
11 | import org.kohsuke.args4j.Option;
12 | import org.uimafit.factory.AnalysisEngineFactory;
13 | import org.uimafit.factory.CollectionReaderFactory;
14 | import org.uimafit.pipeline.SimplePipeline;
15 |
16 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments;
17 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;
18 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
19 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;
20 |
21 | public class RunTopicTilingOnFile {
22 |
23 | private static class Options {
24 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true)
25 | String topicModelDirectory;
26 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true)
27 | String topicModelName;
28 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false)
29 | boolean useDirectNeighbor=false;
30 | @Option(name="-i",usage="Number of inference iterations used to annotate words with topic IDs (default 100)",required=false)
31 | int inferenceIterations=100;
32 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false)
33 | boolean modeCounting=true;
34 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false)
35 | int windowSize=1;
36 | @Option(name="-ri",usage="Use the repeated inference method",required = false)
37 | int repeatedInference=1;
38 | @Option(name="-rs",usage="Use the repeated segmentation",required = false)
39 | int repeatedSegmentation=1;
40 | 		@Option(name="-fd",usage="Directory of the test files",required = true)
41 | public String fileDirectory;
42 | @Option(name="-fp",usage="File pattern for the test files",required = true)
43 | public String filePattern;
44 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false)
45 | public String output=null;
46 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true)
47 | // public String segmentNumber;
48 | }
49 |
50 | public static void main(final String[] args)
51 | throws ResourceInitializationException, UIMAException, IOException {
52 | Options options = new Options();
53 | CmdLineParser parser = new CmdLineParser(options);
54 | try {
55 | parser.parseArgument(args);
56 | } catch( CmdLineException e ) {
57 | System.err.println(e.getMessage());
58 | System.err.println("java -jar myprogram.jar [options...] arguments...");
59 | parser.printUsage(System.err);
60 | return;
61 | }
62 |
63 | new RunTopicTilingOnFile(options);
64 |
65 | }
66 |
67 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException {
68 | String neighbor = "HIGHEST_NEIGHBOR";
69 | if (opt.useDirectNeighbor)
70 | neighbor = "DIRECT_NEIGHBOR";
71 | 		final CollectionReader reader = CollectionReaderFactory.createCollectionReader(
72 | 				TextReader.class,
73 | 				TextReader.PARAM_PATH, opt.fileDirectory,
74 | 				TextReader.PARAM_PATTERNS,
75 | 				new String[] { "[+]" + opt.filePattern });
76 | 
77 | 
78 | 
79 |
80 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class);
81 | AnalysisEngine topicTiling = AnalysisEngineFactory
82 | .createPrimitive(
83 | TopicTilingSegmenterAnnotator.class,
84 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY,
85 | opt.topicModelDirectory,
86 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME,
87 | opt.topicModelName,
88 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION,
89 | opt.inferenceIterations,
90 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE,
91 | opt.repeatedInference,
92 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION,
93 | opt.repeatedSegmentation,
94 | TopicTilingSegmenterAnnotator.PARAM_WINDOW,
95 | opt.windowSize,
96 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE,
97 | neighbor,
98 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING,
99 | opt.modeCounting);
100 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output);
101 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments);
102 |
103 | }
104 |
105 | }
106 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Estimator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 |
29 | package jgibbslda;
30 |
31 | import java.io.File;
32 |
33 | public class Estimator {
34 |
35 | // output model
36 | protected Model trnModel;
37 | LDACmdOption option;
38 |
39 | public boolean init(LDACmdOption option){
40 | this.option = option;
41 | trnModel = new Model();
42 |
43 | if (option.est){
44 | if (!trnModel.initNewModel(option))
45 | return false;
46 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName);
47 | }
48 | else if (option.estc){
49 | if (!trnModel.initEstimatedModel(option))
50 | return false;
51 | }
52 |
53 | return true;
54 | }
55 |
56 | public void estimate(){
57 | 		System.out.println("Sampling " + trnModel.niters + " iterations!");
58 |
59 | int lastIter = trnModel.liter;
60 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){
61 | System.out.println("Iteration " + trnModel.liter + " ...");
62 |
63 | // for all z_i
64 | for (int m = 0; m < trnModel.M; m++){
65 | for (int n = 0; n < trnModel.data.docs[m].length; n++){
66 | // z_i = z[m][n]
67 | // sample from p(z_i|z_-i, w)
68 | int topic = sampling(m, n);
69 | trnModel.z[m].set(n, topic);
70 | }// end for each word
71 | }// end for each document
72 |
73 | if (option.savestep > 0){
74 | if (trnModel.liter % option.savestep == 0){
75 | System.out.println("Saving the model at iteration " + trnModel.liter + " ...");
76 | computeTheta();
77 | computePhi();
78 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5));
79 | }
80 | }
81 | }// end iterations
82 |
83 | System.out.println("Gibbs sampling completed!\n");
84 | System.out.println("Saving the final model!\n");
85 | computeTheta();
86 | computePhi();
87 | trnModel.liter--;
88 | trnModel.saveModel("model-final");
89 | }
90 |
91 | /**
92 | * Do sampling
93 | * @param m document number
94 | * @param n word number
95 | * @return topic id
96 | */
97 | public int sampling(int m, int n){
98 | // remove z_i from the count variable
99 | int topic = trnModel.z[m].get(n);
100 | int w = trnModel.data.docs[m].words[n];
101 |
102 | trnModel.nw[w][topic] -= 1;
103 | trnModel.nd[m][topic] -= 1;
104 | trnModel.nwsum[topic] -= 1;
105 | trnModel.ndsum[m] -= 1;
106 |
107 | double Vbeta = trnModel.V * trnModel.beta;
108 | double Kalpha = trnModel.K * trnModel.alpha;
109 |
110 | 			// do multinomial sampling via cumulative method
111 | for (int k = 0; k < trnModel.K; k++){
112 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + Vbeta) *
113 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha);
114 | }
115 |
116 | // cumulate multinomial parameters
117 | for (int k = 1; k < trnModel.K; k++){
118 | trnModel.p[k] += trnModel.p[k - 1];
119 | }
120 |
121 | // scaled sample because of unnormalized p[]
122 | double u = Math.random() * trnModel.p[trnModel.K - 1];
123 |
124 | for (topic = 0; topic < trnModel.K; topic++){
125 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p
126 | break;
127 | }
128 |
129 | // add newly estimated z_i to count variables
130 | trnModel.nw[w][topic] += 1;
131 | trnModel.nd[m][topic] += 1;
132 | trnModel.nwsum[topic] += 1;
133 | trnModel.ndsum[m] += 1;
134 |
135 | return topic;
136 | }
137 |
138 | public void computeTheta(){
139 | for (int m = 0; m < trnModel.M; m++){
140 | for (int k = 0; k < trnModel.K; k++){
141 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
142 | }
143 | }
144 | }
145 |
146 | public void computePhi(){
147 | for (int k = 0; k < trnModel.K; k++){
148 | for (int w = 0; w < trnModel.V; w++){
149 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
150 | }
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Dictionary.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 | package jgibbslda;
29 |
30 | import java.io.BufferedReader;
31 | import java.io.BufferedWriter;
32 | import java.io.FileInputStream;
33 | import java.io.FileOutputStream;
34 | import java.io.InputStreamReader;
35 | import java.io.OutputStreamWriter;
36 | import java.util.HashMap;
37 | import java.util.Iterator;
38 | import java.util.Map;
39 | import java.util.StringTokenizer;
40 |
41 | public class Dictionary {
42 | 	public Map<String, Integer> word2id;
43 | 	public Map<Integer, String> id2word;
44 |
45 | //--------------------------------------------------
46 | // constructors
47 | //--------------------------------------------------
48 |
49 | public Dictionary(){
50 | 		word2id = new HashMap<String, Integer>();
51 | 		id2word = new HashMap<Integer, String>();
52 | }
53 |
54 | //---------------------------------------------------
55 | // get/set methods
56 | //---------------------------------------------------
57 |
58 | public String getWord(int id){
59 | return id2word.get(id);
60 | }
61 |
62 | public Integer getID (String word){
63 | return word2id.get(word);
64 | }
65 |
66 | //----------------------------------------------------
67 | // checking methods
68 | //----------------------------------------------------
69 | /**
70 | * check if this dictionary contains a specified word
71 | */
72 | public boolean contains(String word){
73 | return word2id.containsKey(word);
74 | }
75 |
76 | public boolean contains(int id){
77 | return id2word.containsKey(id);
78 | }
79 | //---------------------------------------------------
80 | 	// manipulating methods
81 | //---------------------------------------------------
82 | /**
83 | * add a word into this dictionary
84 | * return the corresponding id
85 | */
86 | public int addWord(String word){
87 | if (!contains(word)){
88 | int id = word2id.size();
89 |
90 | word2id.put(word, id);
91 | id2word.put(id,word);
92 |
93 | return id;
94 | }
95 | else return getID(word);
96 | }
97 |
98 | //---------------------------------------------------
99 | // I/O methods
100 | //---------------------------------------------------
101 | /**
102 | * read dictionary from file
103 | */
104 | public boolean readWordMap(String wordMapFile){
105 | try{
106 | BufferedReader reader = new BufferedReader(new InputStreamReader(
107 | new FileInputStream(wordMapFile), "UTF-8"));
108 | String line;
109 |
110 | //read the number of words
111 | line = reader.readLine();
112 | int nwords = Integer.parseInt(line);
113 |
114 | //read map
115 | for (int i = 0; i < nwords; ++i){
116 | line = reader.readLine();
117 | StringTokenizer tknr = new StringTokenizer(line, " \t\n\r");
118 |
119 | if (tknr.countTokens() != 2) continue;
120 |
121 | String word = tknr.nextToken();
122 | String id = tknr.nextToken();
123 | int intID = Integer.parseInt(id);
124 |
125 | id2word.put(intID, word);
126 | word2id.put(word, intID);
127 | }
128 |
129 | reader.close();
130 | return true;
131 | }
132 | catch (Exception e){
133 | System.out.println("Error while reading dictionary:" + e.getMessage());
134 | e.printStackTrace();
135 | return false;
136 | }
137 | }
138 |
139 | public boolean writeWordMap(String wordMapFile){
140 | try{
141 | BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
142 | new FileOutputStream(wordMapFile), "UTF-8"));
143 |
144 | //write number of words
145 | writer.write(word2id.size() + "\n");
146 |
147 | //write word to id
148 | 			Iterator<String> it = word2id.keySet().iterator();
149 | while (it.hasNext()){
150 | String key = it.next();
151 | Integer value = word2id.get(key);
152 |
153 | writer.write(key + " " + value + "\n");
154 | }
155 |
156 | writer.close();
157 | return true;
158 | }
159 | catch (Exception e){
160 | System.out.println("Error while writing word map " + e.getMessage());
161 | e.printStackTrace();
162 | return false;
163 | }
164 |
165 |
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LogSaveEstimator.java:
--------------------------------------------------------------------------------
1 | package jgibbslda;
2 |
3 | /*
4 | * Copyright (C) 2007 by
5 | *
6 | * Xuan-Hieu Phan
7 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
8 | * Graduate School of Information Sciences
9 | * Tohoku University
10 | *
11 | * Cam-Tu Nguyen
12 | * ncamtu@gmail.com
13 | * College of Technology
14 | * Vietnam National University, Hanoi
15 | *
16 | * Martin Riedl
17 | * riedl@cs.tu-darmstadt.de
18 | * FG Language Technology
19 | * Technische Universität Darmstadt, Germany
20 | *
21 | * JGibbsLDA is a free software; you can redistribute it and/or modify
22 | * it under the terms of the GNU General Public License as published
23 | * by the Free Software Foundation; either version 2 of the License,
24 | * or (at your option) any later version.
25 | *
26 | * JGibbsLDA is distributed in the hope that it will be useful, but
27 | * WITHOUT ANY WARRANTY; without even the implied warranty of
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | * GNU General Public License for more details.
30 | *
31 | * You should have received a copy of the GNU General Public License
32 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
33 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
34 | */
35 |
36 |
37 | import java.io.File;
38 | import java.util.Arrays;
39 |
40 | public class LogSaveEstimator {
41 |
42 | // output model
43 | protected Model trnModel;
44 | LDACmdOption option;
45 | public int[] savesteps;
46 |
47 | public boolean init(LDACmdOption option){
48 | this.option = option;
49 | trnModel = new Model();
50 |
51 | if (option.est){
52 | if (!trnModel.initNewModel(option))
53 | return false;
54 | trnModel.data.localDict.writeWordMap(option.dir + File.separator + option.wordMapFileName);
55 | }
56 | else if (option.estc){
57 | if (!trnModel.initEstimatedModel(option))
58 | return false;
59 | }
60 |
61 | return true;
62 | }
63 |
64 | public void estimate(){
65 | System.out.println("Sampling " + trnModel.niters + " iteration!");
66 |
67 |
68 | int lastIter = trnModel.liter;
69 | for (trnModel.liter = lastIter + 1; trnModel.liter < trnModel.niters + lastIter; trnModel.liter++){
70 | System.out.println("Iteration " + trnModel.liter + " ...");
71 |
72 | // for all z_i
73 | for (int m = 0; m < trnModel.M; m++){
74 |
75 | for (int n = 0; n < trnModel.data.docs[m].length; n++){
76 | // z_i = z[m][n]
77 | // sample from p(z_i|z_-i, w)
78 | int topic = sampling(m, n);
79 | trnModel.z[m].set(n, topic);
80 | }// end for each word
81 | }// end for each document
82 |
83 | // if (option.savestep > 0){
84 | if (Arrays.binarySearch(savesteps,trnModel.liter)>=0){
85 | System.out.println("Saving the model at iteration " + trnModel.liter + " ...");
86 | computeTheta();
87 | computePhi();
88 | trnModel.saveModel("model-" + Conversion.ZeroPad(trnModel.liter, 5));
89 | }
90 | // }
91 | }// end iterations
92 |
93 | System.out.println("Gibbs sampling completed!\n");
94 | System.out.println("Saving the final model!\n");
95 | computeTheta();
96 | computePhi();
97 | trnModel.liter--;
98 | trnModel.saveModel("model-final");
99 | }
100 |
101 | /**
102 | * Do sampling
103 | * @param m document number
104 | * @param n word number
105 | * @return topic id
106 | */
107 | public int sampling(int m, int n){
108 | // remove z_i from the count variable
109 | int topic = trnModel.z[m].get(n);
110 | int w = trnModel.data.docs[m].words[n];
111 | trnModel.nw[w][topic] -= 1;
112 | trnModel.nd[m][topic] -= 1;
113 | trnModel.nwsum[topic] -= 1;
114 | trnModel.ndsum[m] -= 1;
115 |
116 | double Vbeta = trnModel.V * trnModel.beta;
117 | double Kalpha = trnModel.K * trnModel.alpha;
118 |
119 | 		//do multinomial sampling via cumulative method
120 | for (int k = 0; k < trnModel.K; k++){
121 | trnModel.p[k] = (trnModel.nw[w][k] + trnModel.beta)/(trnModel.nwsum[k] + Vbeta) *
122 | (trnModel.nd[m][k] + trnModel.alpha)/(trnModel.ndsum[m] + Kalpha);
123 | }
124 |
125 | // cumulate multinomial parameters
126 | for (int k = 1; k < trnModel.K; k++){
127 | trnModel.p[k] += trnModel.p[k - 1];
128 | }
129 |
130 | // scaled sample because of unnormalized p[]
131 | double u = Math.random() * trnModel.p[trnModel.K - 1];
132 |
133 | for (topic = 0; topic < trnModel.K; topic++){
134 | if (trnModel.p[topic] > u) //sample topic w.r.t distribution p
135 | break;
136 | }
137 |
138 | // add newly estimated z_i to count variables
139 | trnModel.nw[w][topic] += 1;
140 | trnModel.nd[m][topic] += 1;
141 | trnModel.nwsum[topic] += 1;
142 | trnModel.ndsum[m] += 1;
143 |
144 | return topic;
145 | }
146 |
147 | public void computeTheta(){
148 | for (int m = 0; m < trnModel.M; m++){
149 | for (int k = 0; k < trnModel.K; k++){
150 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
151 | }
152 | }
153 | }
154 |
155 | public void computePhi(){
156 | for (int k = 0; k < trnModel.K; k++){
157 | for (int w = 0; w < trnModel.V; w++){
158 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
159 | }
160 | }
161 | }
162 | }
163 |
164 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/RunTopicTilingOnFile.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
26 |
27 |
28 | import java.io.IOException;
29 |
30 | import org.apache.uima.UIMAException;
31 | import org.apache.uima.analysis_engine.AnalysisEngine;
32 | import org.apache.uima.collection.CollectionReader;
33 | import org.apache.uima.resource.ResourceInitializationException;
34 | import org.kohsuke.args4j.CmdLineException;
35 | import org.kohsuke.args4j.CmdLineParser;
36 | import org.kohsuke.args4j.Option;
37 | import org.uimafit.factory.AnalysisEngineFactory;
38 | import org.uimafit.factory.CollectionReaderFactory;
39 | import org.uimafit.pipeline.SimplePipeline;
40 |
41 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.OutputSegments;
42 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.SimpleSegmenter;
43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;
44 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
45 | import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;
46 |
47 | public class RunTopicTilingOnFile {
48 |
49 | private static class Options {
50 | @Option(name="-tmd",usage="Directory of the topic model (GibbsLDA should be used)",required = true)
51 | String topicModelDirectory;
52 | @Option(name="-tmn",usage="Name of the topic model (GibbsLDA should be used)",required = true)
53 | String topicModelName;
54 | @Option(name="-dn",usage="Use the direct neighbor otherwise the highest neighbor will be used (default false)",required=false)
55 | boolean useDirectNeighbor=false;
56 | @Option(name="-d",usage="Print debugging output (default false)",required=false)
57 | boolean debug=false;
58 | @Option(name="-i",usage="Number of inference iterations used to annotate words with topic IDs (default 100)",required=false)
59 | int inferenceIterations=100;
60 | @Option(name="-s",usage="Use simple segmentation (default=false)",required=false)
61 | boolean useSimpleSegmentation=false;
62 |
63 | @Option(name="-m",usage="Use mode counting (true/false) (default=true)",required=false)
64 | boolean modeCounting=true;
65 | @Option(name="-w",usage="Window size used to calculate the sentence similarity", required=false)
66 | int windowSize=1;
67 | @Option(name="-ri",usage="Use the repeated inference method",required = false)
68 | int repeatedInference=1;
69 | @Option(name="-rs",usage="Use the repeated segmentation",required = false)
70 | int repeatedSegmentation=1;
71 | 		@Option(name="-fd",usage="Directory of the test files",required = true)
72 | public String fileDirectory;
73 | @Option(name="-fp",usage="File pattern for the test files",required = true)
74 | public String filePattern;
75 | @Option(name="-out",usage="File the content is written to (otherwise stdout will be used)",required = false)
76 | public String output=null;
77 | // @Option(name="-n",usage="Number of segments that should be made (the value -1 indicates, that segments are searched automatically)",required = true)
78 | // public String segmentNumber;
79 | }
80 |
81 | public static void main(final String[] args)
82 | throws ResourceInitializationException, UIMAException, IOException {
83 | Options options = new Options();
84 | CmdLineParser parser = new CmdLineParser(options);
85 | try {
86 | parser.parseArgument(args);
87 | } catch( CmdLineException e ) {
88 | System.err.println(e.getMessage());
89 | System.err.println("java -jar myprogram.jar [options...] arguments...");
90 | parser.printUsage(System.err);
91 | return;
92 | }
93 |
94 | new RunTopicTilingOnFile(options);
95 |
96 | }
97 |
98 | public RunTopicTilingOnFile(Options opt) throws UIMAException, IOException {
99 | String neighbor = "HIGHEST_NEIGHBOR";
100 | if (opt.useDirectNeighbor)
101 | neighbor = "DIRECT_NEIGHBOR";
102 | final CollectionReader reader = CollectionReaderFactory.createCollectionReader(
103 | TextReader.class,
104 | TextReader.PARAM_PATH, opt.fileDirectory,
105 | TextReader.PARAM_PATTERNS, new String[] { "[+]" + opt.filePattern });
106 |
107 | AnalysisEngine segmenter = AnalysisEngineFactory.createPrimitive(StanfordSegmenter.class);
108 | if(opt.useSimpleSegmentation){
109 | segmenter = AnalysisEngineFactory.createPrimitive(SimpleSegmenter.class);
110 | }
111 | AnalysisEngine topicTiling = AnalysisEngineFactory
112 | .createPrimitive(
113 | TopicTilingSegmenterAnnotator.class,
114 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY,
115 | opt.topicModelDirectory,
116 | TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME,
117 | opt.topicModelName,
118 | TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION,
119 | opt.inferenceIterations,
120 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE,
121 | opt.repeatedInference,
122 | TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION,
123 | opt.repeatedSegmentation,
124 | TopicTilingSegmenterAnnotator.PARAM_WINDOW,
125 | opt.windowSize,
126 | TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE,
127 | neighbor,
128 | TopicTilingSegmenterAnnotator.PARAM_DEBUG,
129 | opt.debug,
130 | TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING,
131 | opt.modeCounting);
132 | AnalysisEngine outputSegments = AnalysisEngineFactory.createPrimitive(OutputSegments.class,OutputSegments.PARAM_OUTPUT,opt.output);
133 | SimplePipeline.runPipeline(reader, segmenter, topicTiling,outputSegments);
134 |
135 | }
136 |
137 | }
138 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/Inferencer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * Martin Riedl
15 | * riedl@cs.tu-darmstadt.de
16 | * FG Language Technology
17 | * Technische Universität Darmstadt, Germany
18 | *
19 | * JGibbsLDA is a free software; you can redistribute it and/or modify
20 | * it under the terms of the GNU General Public License as published
21 | * by the Free Software Foundation; either version 2 of the License,
22 | * or (at your option) any later version.
23 | *
24 | * JGibbsLDA is distributed in the hope that it will be useful, but
25 | * WITHOUT ANY WARRANTY; without even the implied warranty of
26 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 | * GNU General Public License for more details.
28 | *
29 | * You should have received a copy of the GNU General Public License
30 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
31 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
32 | */
33 |
34 | package jgibbslda;
35 |
36 | import java.util.ArrayList;
37 | import java.util.List;
38 |
39 | import org.apache.uima.UIMAFramework;
40 | import org.apache.uima.util.Level;
41 | import org.apache.uima.util.Logger;
42 |
43 |
44 |
45 |
46 | public class Inferencer {
47 | // Train model
48 | public Model trnModel;
49 | public Dictionary globalDict;
50 | private LDACmdOption option;
51 | 	public ArrayList<int[][]> values;
52 | private Model newModel;
53 | public int niters = 100;
54 | public static Logger logger = UIMAFramework.getLogger(Inferencer.class);
55 | //-----------------------------------------------------
56 | // Init method
57 | //-----------------------------------------------------
58 | public boolean init(LDACmdOption option){
59 | this.option = option;
60 | trnModel = new Model();
61 |
62 | if (!trnModel.initEstimatedModel(option))
63 | return false;
64 |
65 | globalDict = trnModel.data.localDict;
66 | computeTrnTheta();
67 | computeTrnPhi();
68 |
69 | return true;
70 | }
71 |
72 | //inference new model ~ getting data from a specified dataset
73 | public Model inference( LDADataset newData){
74 | logger.log(Level.FINE,"init new model");
75 | Model newModel = new Model();
76 |
77 | newModel.initNewModel(option, newData, trnModel);
78 | this.newModel = newModel;
79 |
80 | 		//initialize for repeated mode (RIEDL)
81 | 		values = new ArrayList<int[][]>();
82 | for (int doc = 0; doc < newModel.z.length; doc++) {
83 | values.add(new int[newModel.z[doc].size()][newModel.K]);
84 | }
85 |
86 | //-----------------------
87 | logger.log(Level.FINE,"Sampling " + niters + " iteration for inference!");
88 | // TopicTiling.printDim(newModel.z);
89 |
90 |
91 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){
92 | //System.out.println("Iteration " + newModel.liter + " ...");
93 |
94 | // for all newz_i
95 | for (int m = 0; m < newModel.M; ++m){//num of docs
96 | for (int n = 0; n < newModel.data.docs[m].length; n++){
97 | // (newz_i = newz[m][n]
98 | // sample from p(z_i|z_-1,w)
99 | int topic = infSampling(m, n);
100 | newModel.z[m].set(n, topic);
101 | //MR
102 | values.get(m)[n][topic]++;
103 | //END MR
104 | }
105 | }//end foreach new doc
106 |
107 | }// end iterations
108 |
109 |
110 | logger.log(Level.FINE,"Gibbs sampling for inference completed!");
111 |
112 | computeNewTheta();
113 | computeNewPhi();
114 | newModel.liter--;
115 |
116 | return this.newModel;
117 | }
118 |
119 | public Model inference(String [] strs){
120 | //System.out.println("inference");
121 | // Model newModel = new Model();
122 |
123 | //System.out.println("read dataset");
124 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict);
125 |
126 | return inference(dataset);
127 | }
128 |
129 | 	public Model inference(List<String>[] strs){
130 | //System.out.println("inference");
131 | // Model newModel = new Model();
132 |
133 | //System.out.println("read dataset");
134 | LDADataset dataset = LDADataset.readDataSet(strs, globalDict);
135 |
136 | return inference(dataset);
137 | }
138 |
139 | //inference new model ~ getting dataset from file specified in option
140 | public Model inference(){
141 | //System.out.println("inference");
142 |
143 | newModel = new Model();
144 | if (!newModel.initNewModel(option, trnModel)) return null;
145 |
146 | logger.log(Level.INFO,"Sampling " + niters + " iteration for inference!");
147 |
148 | for (newModel.liter = 1; newModel.liter <= niters; newModel.liter++){
149 | //System.out.println("Iteration " + newModel.liter + " ...");
150 |
151 | // for all newz_i
152 | for (int m = 0; m < newModel.M; ++m){
153 | for (int n = 0; n < newModel.data.docs[m].length; n++){
154 | // (newz_i = newz[m][n]
155 | // sample from p(z_i|z_-1,w)
156 | int topic = infSampling(m, n);
157 | newModel.z[m].set(n, topic);
158 |
159 | }
160 | }//end foreach new doc
161 |
162 | }// end iterations
163 |
164 | logger.log(Level.FINE,"Gibbs sampling for inference completed!");
165 | logger.log(Level.FINE,"Saving the inference outputs!");
166 |
167 | computeNewTheta();
168 | computeNewPhi();
169 | newModel.liter--;
170 | newModel.saveModel(newModel.dfile + "." + newModel.modelName);
171 |
172 | return newModel;
173 | }
174 |
175 | /**
176 | * do sampling for inference
177 | * m: document number
178 | * n: word number?
179 | */
180 | protected int infSampling(int m, int n){
181 | // remove z_i from the count variables
182 | int topic = newModel.z[m].get(n);
183 | int _w = newModel.data.docs[m].words[n];
184 | int w = newModel.data.lid2gid.get(_w);
185 | newModel.nw[_w][topic] -= 1;
186 | newModel.nd[m][topic] -= 1;
187 | newModel.nwsum[topic] -= 1;
188 | newModel.ndsum[m] -= 1;
189 |
190 | double Vbeta = trnModel.V * newModel.beta;
191 | double Kalpha = trnModel.K * newModel.alpha;
192 |
193 | 		// do multinomial sampling via cumulative method
194 | for (int k = 0; k < newModel.K; k++){
195 | newModel.p[k] = (trnModel.nw[w][k] + newModel.nw[_w][k] + newModel.beta)/(trnModel.nwsum[k] + newModel.nwsum[k] + Vbeta) *
196 | (newModel.nd[m][k] + newModel.alpha)/(newModel.ndsum[m] + Kalpha);
197 | }
198 |
199 | 		// accumulate multinomial parameters
200 | for (int k = 1; k < newModel.K; k++){
201 | newModel.p[k] += newModel.p[k - 1];
202 | }
203 |
204 | // scaled sample because of unnormalized p[]
205 | double u = Math.random() * newModel.p[newModel.K - 1];
206 |
207 | for (topic = 0; topic < newModel.K; topic++){
208 | if (newModel.p[topic] > u)
209 | break;
210 | }
211 |
212 | // add newly estimated z_i to count variables
213 | newModel.nw[_w][topic] += 1;
214 | newModel.nd[m][topic] += 1;
215 | newModel.nwsum[topic] += 1;
216 | newModel.ndsum[m] += 1;
217 |
218 | return topic;
219 | }
220 |
221 | protected void computeNewTheta(){
222 | for (int m = 0; m < newModel.M; m++){
223 | for (int k = 0; k < newModel.K; k++){
224 | newModel.theta[m][k] = (newModel.nd[m][k] + newModel.alpha) / (newModel.ndsum[m] + newModel.K * newModel.alpha);
225 | }//end foreach topic
226 | }//end foreach new document
227 | }
228 |
229 | protected void computeNewPhi(){
230 | for (int k = 0; k < newModel.K; k++){
231 | for (int _w = 0; _w < newModel.V; _w++){
232 | Integer id = newModel.data.lid2gid.get(_w);
233 |
234 | if (id != null){
235 | newModel.phi[k][_w] = (trnModel.nw[id][k] + newModel.nw[_w][k] + newModel.beta) / (newModel.nwsum[k] + newModel.nwsum[k] + trnModel.V * newModel.beta);
236 | }
237 | }//end foreach word
238 | }// end foreach topic
239 | }
240 |
241 | protected void computeTrnTheta(){
242 | for (int m = 0; m < trnModel.M; m++){
243 | for (int k = 0; k < trnModel.K; k++){
244 | trnModel.theta[m][k] = (trnModel.nd[m][k] + trnModel.alpha) / (trnModel.ndsum[m] + trnModel.K * trnModel.alpha);
245 | }
246 | }
247 | }
248 |
249 | protected void computeTrnPhi(){
250 | for (int k = 0; k < trnModel.K; k++){
251 | for (int w = 0; w < trnModel.V; w++){
252 | trnModel.phi[k][w] = (trnModel.nw[w][k] + trnModel.beta) / (trnModel.nwsum[k] + trnModel.V * trnModel.beta);
253 | }
254 | }
255 | }
256 | }
257 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TopicTiling
2 |
3 |
4 |
5 | TopicTiling is an LDA-based text segmentation algorithm.
6 | The algorithm is inspired by the well-known [TextTiling](http://www.aclweb.org/anthology/J97-1003)
7 | algorithm developed by [Marti Hearst](http://people.ischool.berkeley.edu/~hearst/), and segments documents using the Latent
8 | Dirichlet Allocation (LDA) topic model. TopicTiling performs
9 | the segmentation in linear time and thus is computationally
10 | less expensive than other LDA-based segmentation methods.
11 |
12 | I have moved the project from SourceForge to GitHub. While the code is still the same, I have updated the documentation on this page.
13 |
14 | For the LDA computation we use [JGibbLDA](http://jgibblda.sourceforge.net/) in a slightly modified version, which requires this project to be licensed under the GPL.
15 |
16 |
17 | Table of Contents
18 | ================
19 |
20 |
21 | * [Usage of the binaries](#usage-of-the-binaries)
22 | * [Usage for non-Latin languages](#usage-for-non-latin-languages)
23 | * [Usage of the source code](#usage-of-the-source-code)
24 | * [Compute a topic model](#compute-a-topic-model)
25 | * [Split output file by documents](#split-output-file-by-documents)
26 | * [Citation](#citation)
27 | * [License](#license)
28 |
29 |
30 |
31 |
32 | Usage of the binaries
33 | ===============
34 |
35 | The tool has been developed and tested on Unix-based systems.
36 | As TopicTiling is written in Java, it should also run on Windows
37 | machines.
38 |
39 | To start TopicTiling, you have to download the binary ([zip](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.zip)|[tar.gz](https://github.com/riedlma/topictiling/releases/download/v1.0/topictiling_v1.0.tar.gz)) and decompress the archive. To execute the segmentation method, open the command line and navigate to the decompressed folder:
40 |
41 | ```
42 | cd topictiling_v1.0
43 | ```
44 |
45 | We provide a batch script to start the segmentation on Windows:
46 | ```
47 | bash topictiling.bat
48 | ```
49 | and a shell script to start the segmentation on Unix-based operating systems:
50 | ```
51 | sh topictiling.sh
52 | ```
53 |
54 | These commands will output all parameters of TopicTiling:
55 |
56 |
57 | ```
58 | [java] Option "-fd" is required
59 | [java] java -jar myprogram.jar [options...] arguments...
60 | [java] -dn : Use the direct neighbor otherwise the highest neighbor will be used
61 | [java] (default false)
62 |      [java]  -fd VAL  : Directory of the test files
63 | [java] -fp VAL : File pattern for the test files
64 | [java] -i N : Number of inference iterations used to annotate words with topic
65 | [java] IDs (default 100)
66 | [java] -m : Use mode counting (true/false) (default=true)
67 | [java] -out VAL : File the content is written to (otherwise stdout will be used)
68 | [java] -ri N : Use the repeated inference method
69 | [java] -rs N : Use the repeated segmentation
70 | [java] -s : Use simple segmentation (default=false)
71 | [java] -tmd VAL : Directory of the topic model (GibbsLDA should be used)
72 | [java] -tmn VAL : Name of the topic model (GibbsLDA should be used)
73 | [java] -w N : Window size used to calculate the sentence similarity
74 | ```
75 |
76 | We recommend using mode counting (-m). In each inference iteration of LDA, a topicId is assigned to each word. In the default implementation this assignment is done via sampling, so a word may receive a different topicId in every inference step. To stabilize the topicId assignment, we store the topicId assigned in each inference iteration and, at the end, use the one that has been sampled most often, as sketched below.
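
The following sketch illustrates the idea only; it is not the exact code of the annotators, and all names are made up for the example. It tallies how often each topicId was sampled for a word position and keeps the most frequent one.

```
// Illustrative sketch of mode counting (not the project's exact code).
// counts[n][k]: how often topic k was sampled for word position n
// over all inference iterations.
class ModeCounting {
    static int[] modeTopicIds(int[][] counts) {
        int[] mode = new int[counts.length];
        for (int n = 0; n < counts.length; n++) {
            int best = 0;
            for (int k = 1; k < counts[n].length; k++) {
                if (counts[n][k] > counts[n][best]) {
                    best = k; // topic sampled more often so far
                }
            }
            mode[n] = best; // topicId sampled most often for this word position
        }
        return mode;
    }
}
```
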
77 |
78 | In order to test TopicTiling, you also need a topic model that has been computed with either [JGibbLDA](http://jgibblda.sourceforge.net/) or [GibbsLda++](http://gibbslda.sourceforge.net/). Instructions for computing one are given [here](#compute-a-topic-model).
79 |
80 | Once you have computed a topic model, you might have a folder called *topicmodel* with the following files:
81 | ```
82 | topicmodel/model-final.others
83 | topicmodel/model-final.phi
84 | topicmodel/model-final.tassign
85 | topicmodel/model-final.theta
86 | topicmodel/model-final.twords
87 | topicmodel/wordmap.txt
88 | ```
89 |
90 |
91 | For the segmentation, we advise repeating the inference five times (*-ri 5*) (see [paper](http://www.aclweb.org/anthology/W12-0703)). To start the segmentation, you can then use the following command, assuming that the files you want to segment are stored in the folder *files_to_segment* and have the file ending "txt":
92 |
93 | ```
94 | sh topictiling.sh -ri 5 -tmd topicmodel -tmn model-final -fp "*txt" -fd files_to_segment
95 | ```
96 |
97 | The output of the algorithm is in XML format:
98 |
99 | ```
100 |
101 | …
102 |
103 | score
104 | …
105 |
106 | …
107 |
108 |
109 | ```
110 |
111 | The code returns all maxima where a boundary might be set. If you know the number of segments, you can simply select the N segments with the highest depthScore values and ignore the remaining ones, e.g. with a small helper like the sketch below.
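
The following sketch is illustrative only; it assumes that the candidate boundary positions and their depthScore values have already been parsed from the XML output, and the class and method names are made up for the example.

```
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Illustrative sketch: keep only the n candidate boundaries with the highest depth scores.
class TopBoundaries {
    static List<Integer> select(List<Integer> positions, List<Double> scores, int n) {
        List<Integer> remainingPos = new ArrayList<Integer>(positions);
        List<Double> remainingScores = new ArrayList<Double>(scores);
        List<Integer> selected = new ArrayList<Integer>();
        while (selected.size() < n && !remainingPos.isEmpty()) {
            int best = 0; // index of the highest remaining depth score
            for (int j = 1; j < remainingScores.size(); j++) {
                if (remainingScores.get(j) > remainingScores.get(best)) {
                    best = j;
                }
            }
            selected.add(remainingPos.remove(best)); // take the highest-scoring boundary
            remainingScores.remove(best);
        }
        Collections.sort(selected); // restore document order
        return selected;
    }
}
```
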
112 |
113 |
114 | Usage for non-Latin languages
115 | ===============
116 | The current version uses the Stanford segmenter for tokenization. However, this tokenizer does not work well for languages that do not use Latin characters (e.g. Chinese, Arabic, Hebrew, Japanese). To segment texts in such languages, tokenize them beforehand and use the parameter *-s*, which disables the built-in tokenization and expects all words to be separated by white space, as in the example below.
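
For example, assuming the pre-tokenized files are again stored in *files_to_segment* and end in "txt", the call only differs from the one above by the additional *-s* flag:

```
sh topictiling.sh -s -ri 5 -tmd topicmodel -tmn model-final -fp "*txt" -fd files_to_segment
```
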
117 |
118 | Usage of the source code
119 | ===============
120 | Import both projects into Eclipse. The LDA project contains JGibbLDA with slight modifications so that the topic mode can be computed. Additionally, it contains UIMA annotators, so it can be used within a UIMA pipeline. The project also has dependencies on DKPro and uimafit. To run the TopicTiling algorithm, execute the class TopicTilingTopicDocument.
121 |
122 | Compute a topic model
123 | ===============
124 |
125 | To compute the topic model with LDA, documents are required that represent the domain of the texts the segmentation method will be applied to. For the computation you can use either [JGibbLDA](http://jgibblda.sourceforge.net/) (written in Java) or the faster C++ version [GibbsLda++](http://gibbslda.sourceforge.net/). To get an impression of how the different LDA parameters behave, have a look at our paper: [Sweeping through the Topic Space: Bad luck? Roll again!](http://www.aclweb.org/anthology/W12-0703). In general, we would advise training a topic model with 100 topics, alpha set to 50/(number of topics), and beta set to 0.01; an example call is sketched below.
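
A hedged example call with the original JGibbLDA distribution could look as follows. It assumes the standard JGibbLDA options (*-est*, *-dir*, *-dfile*, *-ntopics*, *-alpha*, *-beta*, *-niters*) and an input file *corpus.gibbslda* in the folder *topicmodel*, whose first line contains the number of documents followed by one whitespace-tokenized document per line; the jar, folder, and file names are placeholders:

```
java -cp jgibblda.jar jgibblda.LDA -est -dir topicmodel -dfile corpus.gibbslda -ntopics 100 -alpha 0.5 -beta 0.01 -niters 1000
```

The resulting model files (model-final.*, wordmap.txt) in *topicmodel* can then be passed to TopicTiling via *-tmd* and *-tmn* as shown above.
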
126 |
127 |
128 | Split output file by documents
129 | ===============
130 |
131 | The standard TopicTiling method returns one file containing the segments of all documents. If you want one file with segments per document, you can use the Python script *split_output.py* in the repository. For this, the output of TopicTiling should be redirected to a file (e.g. *output_file*). The script expects two parameters: the output file of TopicTiling (*output_file*) and a folder that will be created and in which all single-document files are stored (*output_folder*):
132 | ```
133 | python split_output.py output_file output_folder
134 | ```
135 |
136 |
137 | Citation
138 | ===============
139 | If you use TopicTiling, please cite one of the following publications:
140 |
141 | ```
142 |
143 | @article{Riedl:jlcl,
144 | author = {Martin Riedl and Chris Biemann},
145 | title = {{Text Segmentation with Topic Models }},
146 | journal = {Journal for Language Technology and Computational Linguistics (JLCL)},
147 | year={2012},
148 | volume={27},
149 | number={47-69},
150 | pages={13-24},
151 | url={http://www.jlcl.org/2012_Heft1/jlcl2012-1-3.pdf}
152 | }
153 |
154 | @inproceedings{riedl12_acl,
155 | author = {Martin Riedl and Chris Biemann},
156 | title = {TopicTiling: A Text Segmentation Algorithm based on LDA},
157 | year = {2012},
158 | address = {Jeju, Republic of Korea},
159 | booktitle = {Proceedings of the Student Research Workshop of the 50th Meeting of the Association for
160 | Computational Linguistics},
161 | pages = {37--42},
162 | url={http://www.aclweb.org/anthology/W12-3307},
163 | }
164 |
165 | ```
166 |
167 |
168 |
169 | License
170 | ===============
171 | As JGibbLDA is published under the GPL 2.0 license and its code is contained in the current repository, I had to license TopicTiling under the GPL as well.
172 |
173 | TopicTiling is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation.
174 |
175 | TopicTiling is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
176 |
177 |
178 |
179 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingDocumentSegmenterAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
25 |
26 | import java.text.DecimalFormat;
27 | import java.util.ArrayList;
28 | import java.util.Iterator;
29 | import java.util.List;
30 |
31 | import org.apache.uima.UimaContext;
32 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
33 | import org.apache.uima.jcas.JCas;
34 | import org.apache.uima.resource.ResourceInitializationException;
35 | import org.uimafit.component.JCasAnnotator_ImplBase;
36 | import org.uimafit.descriptor.ConfigurationParameter;
37 | import org.uimafit.util.JCasUtil;
38 |
39 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized;
40 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTilingTopicDocument;
41 | import de.tudarmstadt.langtech.semantics.type.Segment;
42 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity;
43 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
44 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
45 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
46 |
47 | public class TopicTilingDocumentSegmenterAnnotator
48 | extends JCasAnnotator_ImplBase {
49 | private boolean printSegments = false;
50 |
51 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory";
52 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
53 | public static final String PARAM_WINDOW = "TopicTilingWindow";
54 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation";
55 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration";
56 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference";
57 |
58 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true)
59 | private String ldaModelDirectory;
60 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
61 | private String ldaModelName;
62 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true)
63 | private int window;
64 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true)
65 | private int repeatInferences;
66 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true)
67 | private int repeatSegmentation;
68 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true)
69 | private int inferenceIteration;
70 |
71 |
72 |
73 | @Override
74 | public void initialize(UimaContext context)
75 | throws ResourceInitializationException {
76 | super.initialize(context);
77 |
78 | }
79 |
80 | @Override
81 | public void process(JCas jcas)
82 | throws AnalysisEngineProcessException {
83 |
84 | 		List<List<Token>> s = new ArrayList<List<Token>>();
85 |
86 | // int i = 0;
87 | 		Iterator<Segment> segments = JCasUtil.select(jcas, Segment.class)
88 | .iterator();
89 | Segment seg = null;
90 | if (segments.hasNext())
91 | seg = segments.next();
92 |
93 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) {
94 |
95 | s.add(JCasUtil.selectCovered(Token.class, ss));
96 |
97 | }
98 |
99 | TopicTilingTopicDocument tttd ;
100 |
101 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) {
102 |
103 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInferences, inferenceIteration);
104 | } else {
105 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class)
106 | .iterator().next().getSegmentCount();
107 | tttd = new TopicTilingTopicDocument(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInferences, inferenceIteration,segNum);
108 | }
109 |
110 |
111 | 		List<Integer> segmentPositions = tttd.segment(s);
112 | // print(jcas,segmentPositions);
113 | // printRcode(jcas, segmentCounts, wtt2, segmentPositionsWnew);
114 | annotateSegments(jcas, segmentPositions);
115 | }
116 |
117 | private void printRcode(JCas jcas, int segmentCount,
118 | 			TextTilingWindowOptimized tt, List<Integer> segments) {
119 | // if (!printRcode)
120 | // return;
121 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
122 |
123 | String main = metaData.getDocumentTitle()
124 | + ": Cosine Similarity between sentences ";
125 | if (segmentCount < 0)
126 | main = main + " (segments given: " + segmentCount + ")";
127 | StringBuffer buffer = new StringBuffer();
128 | buffer.append("#Cosine Similarity\n");
129 | buffer.append("pdf(file='" + metaData.getDocumentTitle()
130 | + ".pdf',20,7);\n");
131 | buffer.append(toListInR(tt.similarityScores, "cos"));
132 | buffer.append(toListInR(segments, "estSeg"));
133 | buffer.append(toListInR(getGoldSegments(jcas), "seg"));
134 | buffer.append(toListInR(tt.minimaPosition, "canSeg"));
135 | buffer.append(toListInR(tt.depthScores, "depth"));
136 | buffer.append("plot(0:"
137 | + (tt.similarityScores.size() - 1)
138 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='"
139 | + main + "');\n");
140 | buffer.append("abline(v=seg,col='red',lty=5);\n");
141 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n");
142 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n");
143 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size()
144 | + "),col='green',pch=22);\n");
145 | buffer.append("points(canSeg,rep(max(cos)*0.9,"
146 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n");
147 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length="
148 | + tt.depthScores.size() + "),labels=depth);\n");
149 | buffer.append("dev.off();dev.off()");
150 | System.out.println(buffer.toString());
151 |
152 | }
153 |
154 | 	private List<Integer> getGoldSegments(JCas jcas) {
155 |
156 | 		List<Integer> ret = new ArrayList<Integer>();
157 | 		Iterator<Segment> segIt = JCasUtil.iterator(jcas, Segment.class);
158 | int sentenceCount = -1;
159 | while (segIt.hasNext()) {
160 | Segment seg = segIt.next();
161 | for (Sentence s : JCasUtil.selectCovered(jcas, Sentence.class, seg)) {
162 | sentenceCount++;
163 | }
164 | ret.add(sentenceCount);
165 | }
166 | return ret;
167 | }
168 |
169 | 	private <T> StringBuffer toListInR(List<T> list, String name) {
170 | StringBuffer buffer = new StringBuffer();
171 | buffer.append(name);
172 | buffer.append("=c(");
173 | for (T sc : list) {
174 | if (sc instanceof Double) {
175 | DecimalFormat df = new DecimalFormat("#.##");
176 | buffer.append(df.format(sc).replace(",", "."));
177 | } else {
178 | buffer.append(sc);
179 | }
180 | buffer.append(",");
181 | }
182 | if (list.size() > 0)
183 | buffer.deleteCharAt(buffer.length() - 1);
184 | buffer.append(");\n");
185 | return buffer;
186 | }
187 |
188 |
189 | /**
190 | 	 * expects a list with the sentence numbers that mark the last sentence of each segment
191 | *
192 | * @param jcas
193 | * @param sentenceBreaks
194 | */
195 | 	private void annotateSegments(JCas jcas, List<Integer> sentenceBreaks) {
196 | 		Iterator<Sentence> sentenceItr = JCasUtil
197 | 				.iterator(jcas, Sentence.class);
198 | int sentenceCount = -1;
199 | int prevBreak = 0;
200 | if (printSegments) {
201 | System.out.println("Annotated Segments");
202 | System.out.println(sentenceBreaks.toString());
203 | }
204 |
205 | for (final int sBreak : sentenceBreaks) {
206 | final Segment seg = new Segment(jcas);
207 |
208 | Sentence segmentSentence = null;
209 |
210 | int beginOffset = 0;
211 | int endOffset = 0;
212 |
213 | // move sentenceItr to last sentence in segment
214 | for (; sentenceCount < sBreak; sentenceCount++) {
215 | segmentSentence = sentenceItr.next();
216 |
217 | if (sentenceCount == prevBreak) {
218 | beginOffset = segmentSentence.getBegin();
219 | }
220 | }
221 |
222 | if (segmentSentence != null) {
223 | endOffset = segmentSentence.getEnd();
224 | }
225 |
226 | seg.setBegin(beginOffset);
227 | seg.setEnd(endOffset);
228 | seg.addToIndexes();
229 |
230 | if (printSegments) {
231 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
232 | + beginOffset + "\t" + endOffset);
233 | }
234 | prevBreak = sBreak;
235 | }
236 | }
237 | }
238 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/de/tudarmstadt/langtech/lda/annotator/GibbsLdaTopicIdAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 |  * along with TopicTiling.  If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.lda.annotator;
25 |
26 | import static org.uimafit.util.JCasUtil.select;
27 |
28 | import java.util.ArrayList;
29 | import java.util.HashMap;
30 | import java.util.List;
31 | import java.util.Random;
32 |
33 | import jgibbslda.Model;
34 |
35 | import org.apache.uima.UIMAFramework;
36 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
37 | import org.apache.uima.jcas.JCas;
38 | import org.apache.uima.jcas.cas.DoubleArray;
39 | import org.apache.uima.util.Level;
40 | import org.apache.uima.util.Logger;
41 | import org.uimafit.descriptor.ConfigurationParameter;
42 |
43 | import de.tudarmstadt.langtech.lda.type.Topic;
44 | import de.tudarmstadt.langtech.lda.type.TopicDistribution;
45 | import de.tudarmstadt.langtech.lda.type.WordTopicDistribution;
46 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
47 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
48 |
49 | public abstract class GibbsLdaTopicIdAnnotator extends
50 | GibbsLdaTopicModelAnnotator {
51 | public static final String PARAM_LDA_REPEAT_INFERENCE = "LdaRepeatInference";
52 | public static final String PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION = "LdaAnnotateDocumentTopicDistribution";
53 | public static final String PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION = "LdaAnnotateWordTopicDistribution";
54 |
55 | private static final Logger log = UIMAFramework
56 | .getLogger(GibbsLdaTopicIdAnnotator.class);
57 | @ConfigurationParameter(name = PARAM_LDA_REPEAT_INFERENCE, mandatory = false, defaultValue = "1")
58 | private int ldaRepeatInference;
59 |
60 | @ConfigurationParameter(name = PARAM_ANNOTATE_DOCUMENT_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false")
61 | private boolean ldaAnnotateDocumentTopicDistribution = false;
62 |
63 | @ConfigurationParameter(name = PARAM_ANNOTATE_WORD_TOPIC_DISTRIBUTION, mandatory = false, defaultValue = "false")
64 | private boolean ldaAnnotateWordTopicDistribution = false;
65 |
66 | /**
67 | * Function iterates over all tokens and assigns a topic ID. This can only
68 | * be performed, when the token is within the model.
69 | *
70 | * @param jcas
71 | * @param z
72 | */
73 |
74 | 	private void annotateTokenWithTopicId(JCas jcas, List<Integer>[] modelZ,
75 | 			List<Integer>[] modelModeZ, List<String>[] documents) {
76 |
77 | int si = 0;
78 | int ti = 0;
79 | int zti = 0;
80 | int actDocumentSize = 0;
81 | 		List<String> wordTokens = null;
82 | StringBuffer output = new StringBuffer();
83 |
84 | if (documents.length > 0) {
85 | wordTokens = documents[0];
86 | actDocumentSize = wordTokens.size();
87 | }
88 |
89 | for (Token t : select(jcas, Token.class)) {
90 | if (zti >= actDocumentSize) {
91 | ti = 0;
92 | zti = 0;
93 | si++;
94 | wordTokens = documents[si];
95 | actDocumentSize = wordTokens.size();
96 | }
97 | String token = t.getCoveredText();
98 | assert token.equals(wordTokens.get(zti));
99 | // System.out.print("indices: " + si + "\t" + ti + "\tsize: "
100 | // + modelZ[si].size() + " " + modelModeZ[si].size());
101 | // System.out.println("\t" + token + " "+ wordTokens.get(zti));
102 | if (getInferencerGlobalDict().word2id.containsKey(token)) {
103 | int topicId = modelZ[si].get(ti);
104 | int topicModeId = modelModeZ[si].get(ti);
105 | Topic topic = new Topic(jcas, t.getBegin(), t.getEnd());
106 | topic.setTopicId(topicId);
107 | topic.setTopicModeId(topicModeId);
108 | topic.addToIndexes();
109 |
110 | ti++;
111 |
112 | output.append(token).append(":").append(topicId).append(":")
113 | .append(topicModeId);
114 |
115 | } else {
116 | output.append(token).append(":NA");
117 | }
118 | output.append(" ");
119 | zti++;
120 |
121 | }
122 | log.log(Level.FINE, output.toString());
123 | }
124 |
125 | @Override
126 | public void process(JCas jcas) throws AnalysisEngineProcessException {
127 | 		final List<String>[] documents = getDocuments(jcas);
128 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
129 | super.setLdaInferenceSaveName(metaData.getDocumentTitle());
130 | Model m = inference(documents);
131 | // if no inference is repeated z contains the topic IDs that are used
132 | 		List<Integer>[] modelZ = m.z;
133 | 		List<Integer>[] modelModeZ;
134 |
135 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(),
136 | documents, getInferenceNiters(), 1);
137 | if (ldaRepeatInference > 1) {
138 | // initialize save structure for word wise topic stabilization
139 | 			ArrayList<int[][]> values = new ArrayList<int[][]>();
140 | for (int k = 0; k < documents.length; k++) {
141 | values.add(new int[modelZ[k].size()][m.K]);
142 | }
143 | for (int k = 1; k < ldaRepeatInference; k++) {
144 | for (int p = 0; p < documents.length; p++) {
145 | for (int t = 0; t < modelZ[p].size(); t++) {
146 | int topic = modelZ[p].get(t);
147 | values.get(p)[t][topic]++;
148 | }
149 | }
150 | m = inference(documents);
151 | modelZ = m.z;
152 | modelModeZ = getTopicListFromRepeated(getInferenceModeValues(),
153 | documents, getInferenceNiters(), 1);
154 | }
155 | }
156 | annotateTokenWithTopicId(jcas, modelZ, modelModeZ, documents);
157 | if (ldaAnnotateDocumentTopicDistribution)
158 | annotateDocumentsWithTopicDistribution(jcas, documents, m);
159 | if(ldaAnnotateWordTopicDistribution)
160 | annotateWordsWithTopicDistribution(jcas,m);
161 | }
162 |
163 | private void annotateWordsWithTopicDistribution(JCas jcas, Model m) {
164 |
165 | HashMap map = new HashMap();
166 | for(int wi =0;wi< m.phi.length;wi++){
167 | double[] topics=m.phi[wi];
168 | String word = getInferencerGlobalDict().id2word.get(wi);
169 | DoubleArray arr = new DoubleArray(jcas, topics.length);
170 | for(int ti=0;ti[] documents, Model m) {
189 | int si = 0;
190 | int ti = 0;
191 | int start = -1;
192 | int docSize = documents[si].size();
193 | for (Token t : select(jcas, Token.class)) {
194 | if (start < 0) {
195 | docSize = documents[si].size();
196 | start = t.getBegin();
197 | }
198 | ti++;
199 | if (ti == docSize) {
200 | TopicDistribution td = new TopicDistribution(jcas, start,
201 | t.getEnd());
202 | start = -1;
203 | DoubleArray arr = new DoubleArray(jcas, m.K);
204 | for (int i = 0; i < m.theta[si].length; i++) {
205 | arr.set(i, m.theta[si][i]);
206 | }
207 | td.setTopicDistribution(arr);
208 | td.addToIndexes();
209 |
210 | si++;
211 |
212 | ti = 0;
213 | }
214 |
215 | }
216 | }
217 |
218 | 	private List<Integer>[] getTopicListFromRepeated(ArrayList<int[][]> values,
219 | 			List<String>[] partsArray, int max, int min) {
220 | 		@SuppressWarnings("unchecked")
221 | 		List<Integer>[] newZ = new ArrayList[values.size()];
222 | Random r = new Random();
223 | for (int s = 0; s < values.size(); s++) {
224 | int[][] sentence = values.get(s);
225 | 			newZ[s] = new ArrayList<Integer>();
226 | for (int t = 0; t < sentence.length; t++) {
227 | 				List<Integer> candidates = getTopicCandidates(sentence[t], max,
228 | min);
229 | if (candidates.size() > 0) {
230 | int topic = candidates.get(r.nextInt(candidates.size()));
231 | newZ[s].add(topic);
232 | } else {
233 | System.out.println("No Candidates found");
234 |
235 | System.out.println();
236 | }
237 |
238 | }
239 |
240 | }
241 | return newZ;
242 |
243 | }
244 |
245 | 	private List<Integer> getTopicCandidates(int[] topics, int max, int min) {
246 | 		ArrayList<Integer> candidates = new ArrayList<Integer>();
247 | for (int m = max; m >= min; m--) {
248 |
249 | for (int t = 0; t < topics.length; t++) {
250 | if (topics[t] == m) {
251 | candidates.add(t);
252 | }
253 | }
254 | if (candidates.size() > 0) {
255 | return candidates;
256 | }
257 | }
258 | 		return new ArrayList<Integer>();
259 | }
260 |
261 | 	public abstract List<String>[] getDocuments(JCas jcas);
262 | }
263 |
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.lda/src/main/java/jgibbslda/LDADataset.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2007 by
3 | *
4 | * Xuan-Hieu Phan
5 | * hieuxuan@ecei.tohoku.ac.jp or pxhieu@gmail.com
6 | * Graduate School of Information Sciences
7 | * Tohoku University
8 | *
9 | * Cam-Tu Nguyen
10 | * ncamtu@gmail.com
11 | * College of Technology
12 | * Vietnam National University, Hanoi
13 | *
14 | * JGibbsLDA is a free software; you can redistribute it and/or modify
15 | * it under the terms of the GNU General Public License as published
16 | * by the Free Software Foundation; either version 2 of the License,
17 | * or (at your option) any later version.
18 | *
19 | * JGibbsLDA is distributed in the hope that it will be useful, but
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU General Public License
25 | * along with JGibbsLDA; if not, write to the Free Software Foundation,
26 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27 | */
28 | package jgibbslda;
29 |
30 | import java.io.BufferedReader;
31 | import java.io.FileInputStream;
32 | import java.io.InputStreamReader;
33 | import java.util.HashMap;
34 | import java.util.List;
35 | import java.util.Map;
36 | import java.util.Vector;
37 |
38 | public class LDADataset {
39 | //---------------------------------------------------------------
40 | // Instance Variables
41 | //---------------------------------------------------------------
42 |
43 | public Dictionary localDict; // local dictionary
44 | public Document [] docs; // a list of documents
45 | public int M; // number of documents
46 | public int V; // number of words
47 |
48 | // map from local coordinates (id) to global ones
49 | // null if the global dictionary is not set
50 | 	public Map<Integer, Integer> lid2gid;
51 |
52 | //link to a global dictionary (optional), null for train data, not null for test data
53 | public Dictionary globalDict;
54 |
55 | //--------------------------------------------------------------
56 | // Constructor
57 | //--------------------------------------------------------------
58 | public LDADataset(){
59 | localDict = new Dictionary();
60 | M = 0;
61 | V = 0;
62 | docs = null;
63 |
64 | globalDict = null;
65 | lid2gid = null;
66 | }
67 |
68 | public LDADataset(int M){
69 | localDict = new Dictionary();
70 | this.M = M;
71 | this.V = 0;
72 | docs = new Document[M];
73 |
74 | globalDict = null;
75 | lid2gid = null;
76 | }
77 |
78 | public LDADataset(int M, Dictionary globalDict){
79 | localDict = new Dictionary();
80 | this.M = M;
81 | this.V = 0;
82 | docs = new Document[M];
83 |
84 | this.globalDict = globalDict;
85 | 		lid2gid = new HashMap<Integer, Integer>();
86 | }
87 |
88 | //-------------------------------------------------------------
89 | //Public Instance Methods
90 | //-------------------------------------------------------------
91 | /**
92 | * set the document at the index idx if idx is greater than 0 and less than M
93 | * @param doc document to be set
94 | * @param idx index in the document array
95 | */
96 | public void setDoc(Document doc, int idx){
97 | if (0 <= idx && idx < M){
98 | docs[idx] = doc;
99 | }
100 | }
101 | /**
102 | * set the document at index idx if idx is greater than or equal to 0 and less than M
103 | * @param str string contains doc
104 | * @param idx index in the document array
105 | */
106 | public void setDoc(String str, int idx){
107 | if (0 <= idx && idx < M){
108 | String [] words = str.split("[ \\t\\n]");
109 |
110 | Vector<Integer> ids = new Vector<Integer>();
111 |
112 | for (String word : words){
113 | int _id = localDict.word2id.size();
114 |
115 | if (localDict.contains(word))
116 | _id = localDict.getID(word);
117 |
118 | if (globalDict != null){
119 | //get the global id
120 | Integer id = globalDict.getID(word);
121 | //System.out.println(id);
122 |
123 | if (id != null){
124 | localDict.addWord(word);
125 |
126 | lid2gid.put(_id, id);
127 | ids.add(_id);
128 | }
129 | else { //not in global dictionary
130 | //do nothing currently
131 | }
132 | }
133 | else {
134 | localDict.addWord(word);
135 | ids.add(_id);
136 | }
137 | }
138 |
139 | Document doc = new Document(ids, str);
140 | docs[idx] = doc;
141 | V = localDict.word2id.size();
142 | }
143 | }
144 |
145 |
146 | public void setDoc(List<String> words, int idx){
147 | String str = "";
148 | if (0 <= idx && idx < M){
149 |
150 | Vector<Integer> ids = new Vector<Integer>();
151 |
152 | for (String word : words){
153 | str+=word+" ";
154 | int _id = localDict.word2id.size();
155 |
156 | if (localDict.contains(word))
157 | _id = localDict.getID(word);
158 |
159 | if (globalDict != null){
160 | //get the global id
161 | Integer id = globalDict.getID(word);
162 | //System.out.println(id);
163 |
164 | if (id != null){
165 | localDict.addWord(word);
166 |
167 | lid2gid.put(_id, id);
168 | ids.add(_id);
169 | }
170 | else { //not in global dictionary
171 | //do nothing currently
172 | }
173 | }
174 | else {
175 | localDict.addWord(word);
176 | ids.add(_id);
177 | }
178 | }
179 |
180 | Document doc = new Document(ids, str);
181 | docs[idx] = doc;
182 | V = localDict.word2id.size();
183 | }
184 | }
185 | //---------------------------------------------------------------
186 | // I/O methods
187 | //---------------------------------------------------------------
188 |
189 | /**
190 | * read a dataset from a file, create new dictionary
191 | * @return dataset if success and null otherwise
192 | */
193 | public static LDADataset readDataSet(String filename){
194 | try {
195 | BufferedReader reader = new BufferedReader(new InputStreamReader(
196 | new FileInputStream(filename), "UTF-8"));
197 |
198 | LDADataset data = readDataSet(reader);
199 |
200 | reader.close();
201 | return data;
202 | }
203 | catch (Exception e){
204 | System.out.println("Read Dataset Error: " + e.getMessage());
205 | e.printStackTrace();
206 | return null;
207 | }
208 | }
209 |
210 | /**
211 | * read a dataset from a file with a preknown vocabulary
212 | * @param filename file from which we read dataset
213 | * @param dict the dictionary
214 | * @return dataset if success and null otherwise
215 | */
216 | public static LDADataset readDataSet(String filename, Dictionary dict){
217 | try {
218 | BufferedReader reader = new BufferedReader(new InputStreamReader(
219 | new FileInputStream(filename), "UTF-8"));
220 | LDADataset data = readDataSet(reader, dict);
221 |
222 | reader.close();
223 | return data;
224 | }
225 | catch (Exception e){
226 | System.out.println("Read Dataset Error: " + e.getMessage());
227 | e.printStackTrace();
228 | return null;
229 | }
230 | }
231 |
232 | /**
233 | * read a dataset from a stream, create new dictionary
234 | * @return dataset if success and null otherwise
235 | */
236 | public static LDADataset readDataSet(BufferedReader reader){
237 | try {
238 | // read number of documents
239 | String line;
240 | line = reader.readLine();
241 | int M = Integer.parseInt(line);
242 |
243 | LDADataset data = new LDADataset(M);
244 | for (int i = 0; i < M; ++i){
245 | line = reader.readLine();
246 |
247 | data.setDoc(line, i);
248 | }
249 |
250 | return data;
251 | }
252 | catch (Exception e){
253 | System.out.println("Read Dataset Error: " + e.getMessage());
254 | e.printStackTrace();
255 | return null;
256 | }
257 | }
258 |
259 | /**
260 | * read a dataset from a stream with respect to a specified dictionary
261 | * @param reader stream from which we read dataset
262 | * @param dict the dictionary
263 | * @return dataset if success and null otherwise
264 | */
265 | public static LDADataset readDataSet(BufferedReader reader, Dictionary dict){
266 | try {
267 | // read number of documents
268 | String line;
269 | line = reader.readLine();
270 | int M = Integer.parseInt(line);
271 | System.out.println("NewM:" + M);
272 |
273 | LDADataset data = new LDADataset(M, dict);
274 | for (int i = 0; i < M; ++i){
275 | line = reader.readLine();
276 |
277 | data.setDoc(line, i);
278 | }
279 |
280 | return data;
281 | }
282 | catch (Exception e){
283 | System.out.println("Read Dataset Error: " + e.getMessage());
284 | e.printStackTrace();
285 | return null;
286 | }
287 | }
288 |
289 | /**
290 | * read a dataset from an array of strings, create new dictionary
291 | * @param strs array of documents, one whitespace-separated document per string
292 | * @return dataset if success and null otherwise
293 | */
294 | public static LDADataset readDataSet(String [] strs){
295 | LDADataset data = new LDADataset(strs.length);
296 |
297 | for (int i = 0 ; i < strs.length; ++i){
298 | data.setDoc(strs[i], i);
299 | }
300 | return data;
301 | }
302 |
303 | /**
304 | * read a dataset from an array of strings with respect to a specified dictionary
305 | * @param strs array of documents, one whitespace-separated document per string
306 | * @param dict the dictionary
307 | * @return dataset if success and null otherwise
308 | */
309 | public static LDADataset readDataSet(String [] strs, Dictionary dict){
310 | //System.out.println("readDataset...");
311 | LDADataset data = new LDADataset(strs.length, dict);
312 |
313 | for (int i = 0 ; i < strs.length; ++i){
314 | //System.out.println("set doc " + i);
315 | data.setDoc(strs[i], i);
316 | }
317 | return data;
318 | }
319 |
320 | public static LDADataset readDataSet(List<String> [] strs, Dictionary dict){
321 | //System.out.println("readDataset...");
322 | LDADataset data = new LDADataset(strs.length, dict);
323 |
324 | for (int i = 0 ; i < strs.length; ++i){
325 | //System.out.println("set doc " + i);
326 | data.setDoc(strs[i], i);
327 | }
328 | return data;
329 | }
330 | }
331 |
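
The readDataSet(String filename) overload above expects JGibbsLDA's plain corpus format: the first line gives the number of documents, and each following line holds one whitespace-separated document. A minimal sketch of writing such a file is shown below; the helper class is hypothetical and not part of this repository.

    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.List;

    // Hypothetical helper: writes a corpus in the format read by
    // LDADataset.readDataSet(String filename) - first line is the document
    // count, then one whitespace-separated document per line.
    public class LdaCorpusWriter {
        public static void write(List<String> documents, String filename) throws IOException {
            FileWriter out = new FileWriter(filename);
            try {
                out.write(documents.size() + "\n");
                for (String doc : documents) {
                    out.write(doc.trim() + "\n");
                }
            } finally {
                out.close();
            }
        }
    }
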
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TopicTilingTopicDocument.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
25 |
26 | import java.util.ArrayList;
27 | import java.util.Collection;
28 | import java.util.Collections;
29 | import java.util.HashMap;
30 | import java.util.List;
31 | import java.util.Map.Entry;
32 |
33 | import jgibbslda.Inferencer;
34 | import jgibbslda.LDACmdOption;
35 | import jgibbslda.Model;
36 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
37 |
38 | public class TopicTilingTopicDocument {
39 | public List<Double> similarityScores;
40 | public List<Integer> minimaPosition;
41 | public List<Double> depthScores;
42 | private Inferencer inf;
43 | private LDACmdOption opt;
44 |
45 | private int segmentNumber = -1;
46 |
47 | private int window = 1;
48 | private String ldaModelDirectory;
49 | private String ldaModelName;
50 | private int repeatSegmentation = 1;
51 | private int inferenceIterations = 100;
52 | private int repeatInference = 1;
53 |
54 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration) {
55 | this(ldaModelDirectory, ldaModelName, window, repeatSegmentation, repeatInference, inferenceIteration, -1);
56 | }
57 |
58 | public TopicTilingTopicDocument(String ldaModelDirectory, String ldaModelName, int window, int repeatSegmentation, int repeatInference, int inferenceIteration, int segmentNumber) {
59 |
60 | super();
61 | this.ldaModelDirectory = ldaModelDirectory;
62 | this.ldaModelName = ldaModelName;
63 | this.window = window;
64 | this.repeatInference = repeatInference;
65 | this.repeatSegmentation = repeatSegmentation;
66 | this.inferenceIterations = inferenceIteration;
67 |
68 | opt = new LDACmdOption();
69 | opt.dir = this.ldaModelDirectory;
70 | opt.modelName = this.ldaModelName;
71 | this.segmentNumber = segmentNumber;
72 |
73 | }
74 |
75 | public List<Integer> segment(List<List<Token>> sentences) {
76 | HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
77 | if (segmentNumber < 0) {
78 | return segment2(sentences);
79 | }
80 | for (int i = 0; i < repeatSegmentation; i++) {
81 |
82 | List segments = segment2(sentences);
83 | System.out.println(segments);
84 | for (int value : segments) {
85 | int count = 0;
86 | if (map.containsKey(value)) {
87 | count = map.get(value);
88 | }
89 | map.put(value, count + 1);
90 |
91 | }
92 | }
93 | System.out.println(map);
94 | List<Integer> segments = new ArrayList<Integer>();
95 | for (int i = repeatSegmentation; i >= 0; i--) {
96 | for (Entry<Integer, Integer> e : map.entrySet()) {
97 | if (e.getValue() == i) {
98 | segments.add(e.getKey());
99 | if (segments.size() == segmentNumber) {
100 | Collections.sort(segments);
101 | return segments;
102 | }
103 | }
104 |
105 | }
106 | }
107 | Collections.sort(segments);
108 | return segments;
109 | }
110 |
111 | public List<Integer> segment2(List<List<Token>> sentences) {
112 |
113 | similarityScores = getSimilarityScores(sentences);
114 | System.out.println("SIM_TOPIC_TILING_DT: "+similarityScores);
115 | minimaPosition = getMinima();
116 | depthScores = getDepthScores();
117 | List<Integer> segments = new ArrayList<Integer>();
118 | if (segmentNumber < 0)
119 | segments = getSegments();
120 | else
121 | segments = getSegmentsNumberGiven();
122 | // add the last sentence as boundary if it is not set
123 |
124 | if (segments.size() > 1 && segments.get(segments.size() - 1) != sentences.size()) {
125 | segments.add(sentences.size() - 1);
126 | } else {
127 | System.err.println("segment size:" + segments.size());
128 | System.err.println("similarites: " + similarityScores);
129 | }
130 | return segments;
131 | }
132 |
133 | private List<Integer> getSegmentsNumberGiven() {
134 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
135 | List<Double> depths = new ArrayList<Double>(depthScores); // sort a copy so the depthScores field keeps its order
136 | List<Double> depths2 = depthScores; // original order, aligned with minimaPosition
137 | if (depths.size() > segmentNumber) {
138 |
139 | Collections.sort(depths);
140 | double min = depths.get(depths.size() - segmentNumber + 1);// save
141 |
142 | for (int i = segments.size() - 1; i >= 0; i--) {
143 | if (depths2.get(i) < min) {
144 | segments.remove(i);
145 | }
146 | }
147 | }
148 |
149 | return segments;
150 | }
151 |
152 | public List<Integer> getSegments() {
153 | // copy minima list
154 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
155 |
156 | double mean = calculateMean(depthScores);
157 | double variance = calculateVariance(depthScores, mean);
158 | double threshold = mean - variance / 2.0;
159 |
160 | for (int i = segments.size() - 1; i >= 0; i--) {
161 | if (depthScores.get(i) < threshold) {
162 | segments.remove(i);
163 | }
164 | }
165 | return segments;
166 | }
167 |
168 | private double calculateVariance(List<Double> vals, double mean) {
169 | double variance = 0.0;
170 | for (double d : vals) {
171 | variance += (d - mean) * (d - mean);
172 | }
173 | variance /= vals.size();
174 | return variance;
175 | }
176 |
177 | private double calculateMean(List<Double> vals) {
178 | double mean = 0.0;
179 | for (double d : vals) {
180 | mean += d;
181 | }
182 | mean /= vals.size();
183 | return mean;
184 | }
185 |
186 | private List<Double> getDepthScores() {
187 | List<Double> depths = new ArrayList<Double>();
188 | for (int i : minimaPosition) {
189 | depths.add(getDepths(i));
190 | }
191 | return depths;
192 | }
193 |
194 | // //left and right neighbor
195 | private double getDepths(int minimumPosition) {
196 | int i = minimumPosition;
197 | double depths = similarityScores.get(i - 1) - similarityScores.get(i)
198 | + similarityScores.get(i + 1) - similarityScores.get(i);
199 | return depths;
200 | }
201 |
202 |
203 | private List<Integer> getMinima() {
204 | List<Integer> minima = new ArrayList<Integer>();
205 | double prev = 0;
206 | double curr = 0;
207 | double next = 1;
208 | for (int i = 1; i < similarityScores.size() - 1; i++) {
209 | if (next != curr) {
210 | prev = similarityScores.get(i - 1);
211 | }
212 | curr = similarityScores.get(i);
213 | next = similarityScores.get(i + 1);
214 | if (curr < next && curr < prev) {
215 | minima.add(i);
216 | }
217 | }
218 | return minima;
219 |
220 | }
221 |
222 | private List<Double> getSimilarityScores(List<List<Token>> sentences) {
223 | List<Double> similarities = new ArrayList<Double>();
224 | List<String> parts = new ArrayList<String>();
225 | for (int i = 0; i < sentences.size(); i++) {
226 | parts.add(getPrev(sentences, i));
227 | }
228 | for (int i = window - 1; i > 0; i--) {
229 | parts.add(getPrev(sentences, sentences.size() - 1, i));
230 | }
231 | String[] partsArray = new String[parts.size()];
232 | int i = 0;
233 | for (String ss : parts) {
234 | partsArray[i++] = ss;
235 | }
236 | double[][] topicDocument = null;
237 | for (i = 0; i < repeatInference; i++) {
238 | Model m = inference(partsArray);
239 | if (topicDocument == null) {
240 | topicDocument = new double[partsArray.length][m.K];
241 | for (int j = 0; j < partsArray.length; j++) {
242 | for (int k = 0; k < m.K; k++) {
243 | topicDocument[j][k] = 1.0;
244 | }
245 | }
246 | }
247 | for (int j = 0; j < partsArray.length; j++) {
248 | for (int k = 0; k < m.K; k++) {
249 | topicDocument[j][k] *= m.theta[j][k];
250 | }
251 | }
252 | }
253 | for (i = 0; i < partsArray.length - window; i++) {
254 | double[] v1 = topicDocument[i];
255 | double[] v2 = topicDocument[i + window];
256 | double sim = calculateDotProduct(v1, v2);
257 | similarities.add(sim);
258 | }
259 | // System.out.println(similarities.size());
260 | return similarities;
261 | }
262 |
263 | private List<Integer> getTopicCandidates(int[] topics) {
264 | ArrayList<Integer> candidates = new ArrayList<Integer>();
265 | for (int m = repeatInference; m >= 0; m--) {
266 |
267 | for (int t = 0; t < topics.length; t++) {
268 | if (topics[t] == m) {
269 | candidates.add(t);
270 | }
271 | }
272 | if (candidates.size() > 0) {
273 | return candidates;
274 | }
275 | }
276 | return null;
277 | }
278 |
279 | private int[] getVector(int topicNumber, Collection<Integer> topicAssigment) {
280 | int[] vec = new int[topicNumber];
281 | for (int k : topicAssigment) {
282 | vec[k]++;
283 | }
284 | return vec;
285 | }
286 |
287 | private Model inference(String[] sentences) {
288 | inf = new Inferencer();
289 | inf.init(opt);
290 |
291 | inf.niters = inferenceIterations;
292 | // inf.niters = Integer.parseInt(prop.getProperty("infIteration"));
293 | Model m = inf.inference(sentences);
294 | return m;
295 | }
296 |
297 | private String getPrev(List<List<Token>> sentences, int i) {
298 |
299 | return getPrev(sentences, i, window);
300 | }
301 |
302 | private String getPrev(List<List<Token>> sentences, int i, int window) {
303 | String result = "";
304 | for (int k = i; k >= 0 && k > (i - window); k--) {
305 | for (Token t : sentences.get(k)) {
306 | result += t.getCoveredText() + " ";
307 | }
308 | }
309 | return result;
310 | }
311 |
312 | private double calculateDotProduct(int[] curr, int[] next) {
313 | int xy = 0;
314 | int sumX = 0;
315 | int sumY = 0;
316 | if (curr.length != next.length) {
317 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
318 | }
319 | for (int i = 0; i < curr.length; i++) {
320 | int xi = curr[i];
321 | int yi = next[i];
322 |
323 | xy += xi * yi;
324 | sumX += xi * xi;
325 | sumY += yi * yi;
326 | }
327 |
328 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
329 | }
330 |
331 | private double calculateDotProduct(double[] curr, double[] next) {
332 | double xy = 0;
333 | double sumX = 0;
334 | double sumY = 0;
335 | if (curr.length != next.length) {
336 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
337 | }
338 | for (int i = 0; i < curr.length; i++) {
339 | double xi = curr[i];
340 | double yi = next[i];
341 |
342 | xy += xi * yi;
343 | sumX += xi * xi;
344 | sumY += yi * yi;
345 | }
346 |
347 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
348 | }
349 |
350 | }
351 |
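
The boundary selection in getSegments() above keeps a similarity minimum as a segment boundary when its depth score is at least mean(depth) - variance(depth)/2. The standalone sketch below replays that thresholding on invented depth scores and minima positions, purely to illustrate the calculation.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // Standalone illustration of the thresholding used in getSegments():
    // a minimum is kept when its depth score is >= mean(depth) - variance(depth) / 2.
    // The depth scores and minima positions below are invented example values.
    public class DepthThresholdExample {
        public static void main(String[] args) {
            List<Double> depthScores = Arrays.asList(0.8, 0.2, 0.5, 0.9, 0.1);
            List<Integer> minimaPosition = Arrays.asList(3, 7, 12, 18, 25);

            double mean = 0.0;
            for (double d : depthScores) {
                mean += d;
            }
            mean /= depthScores.size();

            double variance = 0.0;
            for (double d : depthScores) {
                variance += (d - mean) * (d - mean);
            }
            variance /= depthScores.size();

            double threshold = mean - variance / 2.0;

            List<Integer> boundaries = new ArrayList<Integer>();
            for (int i = 0; i < minimaPosition.size(); i++) {
                if (depthScores.get(i) >= threshold) {
                    boundaries.add(minimaPosition.get(i));
                }
            }
            System.out.println("threshold = " + threshold + ", boundaries = " + boundaries);
        }
    }
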
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/TextTilingWindowOptimized.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter;
26 |
27 | import java.io.FileNotFoundException;
28 | import java.io.FileReader;
29 | import java.io.IOException;
30 | import java.util.ArrayList;
31 | import java.util.Collection;
32 | import java.util.Collections;
33 | import java.util.HashMap;
34 | import java.util.List;
35 | import java.util.Map.Entry;
36 | import java.util.Properties;
37 | import java.util.Random;
38 |
39 | import jgibbslda.Inferencer;
40 | import jgibbslda.LDACmdOption;
41 | import jgibbslda.Model;
42 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
43 |
44 | public class TextTilingWindowOptimized {
45 | private int segmentNumber = -1;
46 | private int window = 1;
47 | private int additionalVectorSize = 1;
48 | public List<Double> similarityScores;
49 | public List<Integer> minimaPosition;
50 | public List<Double> depthScores;
51 | private Inferencer inf;
52 | private String ldaModel;
53 | private LDACmdOption opt;
54 | private Properties prop;
55 | private int segmentIteration = 5;
56 | private int inferenceIterationRepeating = 1;
57 | private int inferenceIteration;
58 |
59 | public TextTilingWindowOptimized(String ldaModel) {
60 | this(ldaModel, -1);
61 | }
62 |
63 | public TextTilingWindowOptimized(String ldaModel, int segmentNumber) {
64 | super();
65 | this.ldaModel = ldaModel;
66 | opt = new LDACmdOption();
67 | opt.dir = ldaModel;
68 | // opt.modelName = "model-final";
69 | this.segmentNumber = segmentNumber;
70 | prop = new Properties();
71 | try {
72 | prop.load(new FileReader("topictiling_config"));
73 | } catch (FileNotFoundException e) {
74 | // TODO Auto-generated catch block
75 | e.printStackTrace();
76 | } catch (IOException e) {
77 | // TODO Auto-generated catch block
78 | e.printStackTrace();
79 | }
80 | opt.modelName = prop.getProperty("model_name");
81 | window = 1;
82 | inferenceIteration = 100;
83 | inferenceIterationRepeating = 1;
84 | segmentIteration = 1;
85 | if (prop.containsKey("window"))
86 | window = Integer.parseInt(prop.getProperty("window"));
87 | if (prop.containsKey("infIteration"))
88 | inferenceIteration = Integer.parseInt(prop.getProperty("infIteration"));
89 | if (prop.containsKey("infIterationRepeating"))
90 | inferenceIterationRepeating = Integer.parseInt(prop.getProperty("infIterationRepeating"));
91 | if (prop.containsKey("segmentIteration"))
92 | segmentIteration = Integer.parseInt(prop.getProperty("segmentIteration"));
93 | System.err.println("window:"+window);
94 | System.err.println("inferenceIteration:"+inferenceIteration);
95 | System.err.println("inferenceIterationRepeating:"+inferenceIterationRepeating);
96 | System.err.println("segmentIteration:"+segmentIteration);
97 | }
98 |
99 | public List<Integer> segment(List<List<Token>> sentences) {
100 | HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
101 | if (segmentNumber < 0) {
102 | return segment2(sentences);
103 | }
104 | for (int i = 0; i < segmentIteration; i++) {
105 |
106 | List segments = segment2(sentences);
107 | System.out.println(segments);
108 | for (int value : segments) {
109 | int count = 0;
110 | if (map.containsKey(value)) {
111 | count = map.get(value);
112 | }
113 | map.put(value, count + 1);
114 |
115 | }
116 | }
117 | System.out.println(map);
118 | List<Integer> segments = new ArrayList<Integer>();
119 | for (int i = segmentIteration; i >= 0; i--) {
120 | for (Entry<Integer, Integer> e : map.entrySet()) {
121 | if (e.getValue() == i) {
122 | segments.add(e.getKey());
123 | if (segments.size() == segmentNumber) {
124 | Collections.sort(segments);
125 | return segments;
126 | }
127 | }
128 |
129 | }
130 | }
131 | Collections.sort(segments);
132 | return segments;
133 | }
134 |
135 | public List<Integer> segment2(List<List<Token>> sentences) {
136 |
137 | similarityScores = getSimilarityScores(sentences);
138 | minimaPosition = getMinima();
139 | depthScores = getDepthScores();
140 |
141 | List<Integer> segments = new ArrayList<Integer>();
142 | if (segmentNumber < 0)
143 | segments = getSegments();
144 | else
145 | segments = getSegmentsNumberGiven();
146 | // add the last sentence as boundary if it is not set
147 |
148 | if (segments.size()>1&&segments.get(segments.size() - 1) != sentences.size()) {
149 | segments.add(sentences.size() - 1);
150 | }else{
151 | System.err.println("segment size:"+segments.size());
152 | System.err.println("similarites: "+similarityScores);
153 | }
154 | // System.out.println(segments);
155 | return segments;
156 | }
157 |
158 | private List<Integer> getSegmentsNumberGiven() {
159 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
160 | List<Double> depths = new ArrayList<Double>(depthScores); // sort a copy so the depthScores field keeps its order
161 | List<Double> depths2 = depthScores; // original order, aligned with minimaPosition
162 | if (depths.size() > segmentNumber) {
163 |
164 | Collections.sort(depths);
165 | double min = depths.get(depths.size() - segmentNumber + 1);// save
166 |
167 | for (int i = segments.size() - 1; i >= 0; i--) {
168 | if (depths2.get(i) < min) {
169 | segments.remove(i);
170 | }
171 | }
172 | }
173 |
174 | return segments;
175 | }
176 |
177 | public List<Integer> getSegments() {
178 | // copy minima list
179 | List<Integer> segments = new ArrayList<Integer>(minimaPosition);
180 |
181 | double mean = calculateMean(depthScores);
182 | double variance = calculateVariance(depthScores, mean);
183 | double threshold = mean - variance / 2.0;
184 |
185 | for (int i = segments.size() - 1; i >= 0; i--) {
186 | if (depthScores.get(i) < threshold) {
187 | segments.remove(i);
188 | }
189 | }
190 | return segments;
191 | }
192 |
193 | private double calculateVariance(List<Double> vals, double mean) {
194 | double variance = 0.0;
195 | for (double d : vals) {
196 | variance += (d - mean) * (d - mean);
197 | }
198 | variance /= vals.size();
199 | return variance;
200 | }
201 |
202 | private double calculateMean(List<Double> vals) {
203 | double mean = 0.0;
204 | for (double d : vals) {
205 | mean += d;
206 | }
207 | mean /= vals.size();
208 | return mean;
209 | }
210 |
211 | private List<Double> getDepthScores() {
212 | List<Double> depths = new ArrayList<Double>();
213 |
214 | for (int i : minimaPosition) {
215 |
216 | depths.add(getDepths(i));
217 | }
218 | return depths;
219 | }
220 |
221 | // //left and right neighbor
222 | private double getDepths(int minimumPosition) {
223 | int i = minimumPosition;
224 | double depths = similarityScores.get(i - 1) - similarityScores.get(i)
225 | + similarityScores.get(i + 1) - similarityScores.get(i);
226 | return depths;
227 | }
228 |
229 | private List<Integer> getMinima() {
230 | List<Integer> minima = new ArrayList<Integer>();
231 | double prev = 0;
232 | double curr = 0;
233 | double next = 1;
234 | for (int i = 1; i < similarityScores.size() - 1; i++) {
235 | if (next != curr) {
236 | prev = similarityScores.get(i - 1);
237 | }
238 | curr = similarityScores.get(i);
239 | next = similarityScores.get(i + 1);
240 | if (curr < next && curr < prev) {
241 | minima.add(i);
242 | }
243 | }
244 | return minima;
245 |
246 | }
247 |
248 | private List<Double> getSimilarityScores(List<List<Token>> sentences) {
249 | List<Double> similarities = new ArrayList<Double>();
250 | List<String> parts = new ArrayList<String>();
251 | for (int i = 0; i < sentences.size(); i++) {
252 | parts.add(getPrev(sentences, i));
253 | }
254 | for (int i = window - 1; i > 0; i--) {
255 | parts.add(getPrev(sentences, sentences.size() - 1, i));
256 | }
257 | String[] partsArray = new String[parts.size()];
258 | int i = 0;
259 | for (String ss : parts) {
260 | partsArray[i++] = ss;
261 | }
262 |
263 | Model m = inference(partsArray);
264 | if (inferenceIterationRepeating == 1) {
265 | for (i = 0; i < partsArray.length - window; i++) {
266 | int[] v1 = getVector(m.K, m.z[i]);
267 | int[] v2 = getVector(m.K, m.z[i + window]);
268 | double sim = calculateDotProduct(v1, v2);
269 | similarities.add(sim);
270 | }
271 |
272 | } else {
273 | // initialize save structure for word wise topic stabilization
274 | ArrayList<int[][]> values = new ArrayList<int[][]>();
275 | for (int k = 0; k < partsArray.length; k++) {
276 | values.add(new int[m.z[k].size()][m.K]);
277 | }
278 | for (int k = 1; k < inferenceIterationRepeating; k++) {
279 | for (int p = 0; p < partsArray.length; p++) {
280 | for (int t = 0; t < m.z[p].size(); t++) {
281 | int topic = m.z[p].get(t);
282 | values.get(p)[t][topic]++;
283 | }
284 | }
285 | m = inference(partsArray);
286 | }
287 |
288 | List<Integer>[] newZ = new ArrayList[partsArray.length];
289 | Random r = new Random();
290 | for (int s = 0; s < values.size(); s++) {
291 | int[][] sentence = values.get(s);
292 | newZ[s] = new ArrayList<Integer>();
293 | for (int t = 0; t < sentence.length; t++) {
294 | List<Integer> candidates = getTopicCandidates(sentence[t]);
295 |
296 | int topic = candidates.get(r.nextInt(candidates.size()));
297 | newZ[s].add(topic);
298 | }
299 |
300 | }
301 | for (i = 0; i < newZ.length - window; i++) {
302 | int[] v1 = getVector(m.K, newZ[i]);
303 | int[] v2 = getVector(m.K, newZ[i + window]);
304 | double sim = calculateDotProduct(v1, v2);
305 | similarities.add(sim);
306 | }
307 |
308 | }
309 |
310 | return similarities;
311 | }
312 |
313 | private List<Integer> getTopicCandidates(int[] topics) {
314 | ArrayList<Integer> candidates = new ArrayList<Integer>();
315 | for (int m = inferenceIterationRepeating; m >= 0; m--) {
316 |
317 | for (int t = 0; t < topics.length; t++) {
318 | if (topics[t] == m) {
319 | candidates.add(t);
320 | }
321 | }
322 | if (candidates.size() > 0) {
323 | return candidates;
324 | }
325 | }
326 | return null;
327 | }
328 |
329 | private int[] getVector(int topicNumber, Collection<Integer> topicAssigment) {
330 | int[] vec = new int[topicNumber];
331 | for (int k : topicAssigment) {
332 | vec[k]++;
333 | }
334 | return vec;
335 | }
336 |
337 | private Model inference(String[] sentences) {
338 | inf = new Inferencer();
339 | inf.init(opt);
340 |
341 | inf.niters = inferenceIteration;
342 | Model m = inf.inference(sentences);
343 | return m;
344 | }
345 |
346 | private double[] norm(int[] v1) {
347 | double sum = 0.0;
348 | for (int v : v1) {
349 | sum += v;
350 | }
351 | double[] vd = new double[v1.length];
352 | for (int i = 0; i < v1.length; i++) {
353 | vd[i] = v1[i] / sum;
354 | }
355 | return vd;
356 | }
357 |
358 | private int[] getVector(int i, Model m) {
359 | int[] vec = new int[m.K];
360 | for (int k : m.z[i]) {
361 | vec[k]++;
362 | }
363 | return vec;
364 | }
365 |
366 | private String getPrev(List<List<Token>> sentences, int i) {
367 |
368 | return getPrev(sentences, i, window);
369 | }
370 |
371 | private String getPrev(List<List<Token>> sentences, int i, int window) {
372 | String result = "";
373 | for (int k = i; k >= 0 && k > (i - window); k--) {
374 | for (Token t : sentences.get(k)) {
375 | result += t.getCoveredText() + " ";
376 | }
377 | }
378 | return result;
379 | }
380 |
381 | private double calculateDotProduct(double[] vd1, double[] vd2) {
382 | double xy = 0;
383 | double sumX = 0;
384 | double sumY = 0;
385 | if (vd1.length != vd2.length) {
386 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
387 | }
388 | for (int i = 0; i < vd1.length; i++) {
389 | double xi = vd1[i];
390 | double yi = vd2[i];
391 |
392 | xy += xi * yi;
393 | sumX += xi * xi;
394 | sumY += yi * yi;
395 | }
396 |
397 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
398 | }
399 |
400 | private double calculateDotProduct(int[] curr, int[] next) {
401 | int xy = 0;
402 | int sumX = 0;
403 | int sumY = 0;
404 | if (curr.length != next.length) {
405 | throw new IllegalArgumentException("Cosine Similarity: X != Y");
406 | }
407 | for (int i = 0; i < curr.length; i++) {
408 | int xi = curr[i];
409 | int yi = next[i];
410 |
411 | xy += xi * yi;
412 | sumX += xi * xi;
413 | sumY += yi * yi;
414 | }
415 |
416 | return 1.0 * xy / (Math.sqrt(sumX) * Math.sqrt(sumY));
417 | }
418 |
419 | }
420 |
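
TextTilingWindowOptimized loads its parameters from a file named topictiling_config in the working directory via java.util.Properties. The keys read above are model_name, window, infIteration, infIterationRepeating and segmentIteration; a possible configuration file could look like the sketch below, where the values are placeholders rather than recommended settings.

    model_name=model-final
    window=2
    infIteration=100
    infIterationRepeating=3
    segmentIteration=5
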
--------------------------------------------------------------------------------
/de.tudarmstadt.langtech.semantics.segmentation.topictiling/src/main/java/de/tudarmstadt/langtech/semantics/segmentation/segmenter/annotator/TopicTilingSegmenterAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Martin Riedl
3 | * riedl@cs.tu-darmstadt.de
4 | * FG Language Technology
5 | * Technische Universität Darmstadt, Germany
6 | *
7 | *
8 | * This file is part of TopicTiling.
9 | *
10 | * TopicTiling is free software: you can redistribute it and/or modify
11 | * it under the terms of the GNU General Public License as published by
12 | * the Free Software Foundation, either version 3 of the License, or
13 | * (at your option) any later version.
14 | *
15 | * TopicTiling is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with TopicTiling. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 | package de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator;
27 |
28 | import java.text.DecimalFormat;
29 | import java.util.ArrayList;
30 | import java.util.Collection;
31 | import java.util.Iterator;
32 | import java.util.List;
33 |
34 | import org.apache.uima.UimaContext;
35 | import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
36 | import org.apache.uima.cas.impl.ListUtils;
37 | import org.apache.uima.jcas.JCas;
38 | import org.apache.uima.resource.ResourceInitializationException;
39 | import org.uimafit.component.JCasAnnotator_ImplBase;
40 | import org.uimafit.descriptor.ConfigurationParameter;
41 | import org.uimafit.util.JCasUtil;
42 |
43 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TextTilingWindowOptimized;
44 | import de.tudarmstadt.langtech.semantics.segmentation.segmenter.TopicTiling;
45 | import de.tudarmstadt.langtech.semantics.type.Segment;
46 | import de.tudarmstadt.langtech.semantics.type.SegmentQuantity;
47 | import de.tudarmstadt.langtech.semantics.type.SegmentScore;
48 | import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
49 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
50 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
51 |
52 | public class TopicTilingSegmenterAnnotator extends JCasAnnotator_ImplBase {
53 | private boolean printSegments = true;
54 |
55 | public static final String PARAM_USE_ASSIGNED_TOPICS = "UseAssgnedTopics";
56 | public static final String PARAM_LDA_MODEL_DIRECTORY = "LdaModelDirectory";
57 | public static final String PARAM_LDA_MODEL_NAME = "LdaModelName";
58 | public static final String PARAM_WINDOW = "TopicTilingWindow";
59 | public static final String PARAM_REPEAT_SEGMENTATION = "RepeatedSegmentation";
60 | public static final String PARAM_INFERENCE_ITERATION = "InferenceIteration";
61 | public static final String PARAM_REPEAT_INFERENCE = "RepeatedInference";
62 | public static final String PARAM_DEPTH_SCORE = "DepthScore";
63 | public static final String PARAM_MODE_COUNTING = "ModeCounting";
64 | public static final String PARAM_DEBUG="Debug";
65 | @ConfigurationParameter(name = PARAM_USE_ASSIGNED_TOPICS, mandatory = false)
66 | private boolean useAssignedTopics = false;
67 | @ConfigurationParameter(name = PARAM_LDA_MODEL_DIRECTORY, mandatory = true)
68 | private String ldaModelDirectory;
69 | @ConfigurationParameter(name = PARAM_LDA_MODEL_NAME, mandatory = true)
70 | private String ldaModelName;
71 | @ConfigurationParameter(name = PARAM_WINDOW, mandatory = true)
72 | private int window;
73 | @ConfigurationParameter(name = PARAM_REPEAT_INFERENCE, mandatory = true)
74 | private int repeatInferences;
75 | @ConfigurationParameter(name = PARAM_REPEAT_SEGMENTATION, mandatory = true)
76 | private int repeatSegmentation;
77 | @ConfigurationParameter(name = PARAM_INFERENCE_ITERATION, mandatory = true)
78 | private int inferenceIteration;
79 | @ConfigurationParameter(name = PARAM_MODE_COUNTING, mandatory = true)
80 | private boolean modeCounting;
81 | @ConfigurationParameter(name = PARAM_DEBUG, mandatory = false)
82 | private boolean debug;
83 | @ConfigurationParameter(name = PARAM_DEPTH_SCORE, mandatory = true)
84 | private String depthScore;
85 |
86 | @Override
87 | public void initialize(UimaContext context)
88 | throws ResourceInitializationException {
89 | super.initialize(context);
90 |
91 | }
92 |
93 | @Override
94 | public void process(JCas jcas) throws AnalysisEngineProcessException {
95 |
96 | List<List<Token>> s = new ArrayList<List<Token>>();
97 |
98 | // int i = 0;
99 | Iterator<Segment> segments = JCasUtil.select(jcas, Segment.class)
100 | .iterator();
101 | Segment seg = null;
102 | if (segments.hasNext())
103 | seg = segments.next();
104 |
105 | for (Sentence ss : JCasUtil.select(jcas, Sentence.class)) {
106 |
107 | s.add(JCasUtil.selectCovered(Token.class, ss));
108 |
109 | }
110 |
111 | DocumentMetaData meta = DocumentMetaData.get(jcas);
112 | StringBuffer buffer = new StringBuffer();
113 | buffer.append(meta.getDocumentTitle());
114 |
115 | buffer.append("\n");
116 | // TopicTilingTopicDocument tttd ;
117 |
118 | TopicTiling tt;
119 | tt = new TopicTiling(ldaModelDirectory, ldaModelName, window,
120 | repeatSegmentation, repeatInferences, inferenceIteration,
121 | modeCounting, depthScore, useAssignedTopics,debug);
122 | buffer.append("GOL: " + getGoldSegments(jcas) + "\n");
123 | List<Integer> segmentPositions;
124 | if (JCasUtil.select(jcas, SegmentQuantity.class).size() == 0) {
125 | segmentPositions = tt.segment(s);
126 | } else {
127 | int segNum = JCasUtil.select(jcas, SegmentQuantity.class)
128 | .iterator().next().getSegmentCount();
129 | segmentPositions = tt.segment(s, segNum);
130 | }
131 |
132 | int j = 0;
133 | for (List<Token> ss : s) {
134 | String l = "";
135 | for (Token t:ss){
136 | l+=t.getCoveredText()+" ";
137 | }
138 | if(debug)System.out.println(j+"\t"+l);
139 | j++;
140 | }
141 | if(debug)System.out.println(segmentPositions);
142 | annotateSegments(jcas, segmentPositions, tt.depthScores,
143 | tt.minimaPosition,tt.similarityScores);
144 | }
145 |
146 | private void printRcode(JCas jcas, int segmentCount,
147 | TextTilingWindowOptimized tt, List<Integer> segments) {
148 | DocumentMetaData metaData = DocumentMetaData.get(jcas);
149 |
150 | String main = metaData.getDocumentTitle()
151 | + ": Cosine Similarity between sentences ";
152 | if (segmentCount < 0)
153 | main = main + " (segments given: " + segmentCount + ")";
154 | StringBuffer buffer = new StringBuffer();
155 | buffer.append("#Cosine Similarity\n");
156 | buffer.append("pdf(file='" + metaData.getDocumentTitle()
157 | + ".pdf',20,7);\n");
158 | buffer.append(toListInR(tt.similarityScores, "cos"));
159 | buffer.append(toListInR(segments, "estSeg"));
160 | buffer.append(toListInR(getGoldSegments(jcas), "seg"));
161 | buffer.append(toListInR(tt.minimaPosition, "canSeg"));
162 | buffer.append(toListInR(tt.depthScores, "depth"));
163 | buffer.append("plot(0:"
164 | + (tt.similarityScores.size() - 1)
165 | + ",cos,type='l',xlab='Sentence',ylab='cosine similarity',main='"
166 | + main + "');\n");
167 | buffer.append("abline(v=seg,col='red',lty=5);\n");
168 | buffer.append("abline(v=estSeg,col='green',lwd=2,lty=4);\n");
169 | buffer.append("abline(v=seg[seg%in%estSeg],col='black',lwd=3);\n");
170 | buffer.append("points(estSeg,rep(max(cos)*0.98," + segments.size()
171 | + "),col='green',pch=22);\n");
172 | buffer.append("points(canSeg,rep(max(cos)*0.9,"
173 | + tt.minimaPosition.size() + "),col='blue',pch=23);\n");
174 | buffer.append("text(canSeg[-length(canSeg)],rep(max(cos)*c(0.84,0.88,0.92,0.94),length="
175 | + tt.depthScores.size() + "),labels=depth);\n");
176 | buffer.append("dev.off();dev.off()");
177 | System.out.println(buffer.toString());
178 |
179 | }
180 |
181 | private List<Integer> getGoldSegments(JCas jcas) {
182 |
183 | List<Integer> ret = new ArrayList<Integer>();
184 | Iterator<Segment> segIt = JCasUtil.iterator(jcas, Segment.class);
185 | int sentenceCount = -1;
186 | while (segIt.hasNext()) {
187 | Segment seg = segIt.next();
188 | for (Sentence s : JCasUtil.selectCovered(jcas, Sentence.class, seg)) {
189 | sentenceCount++;
190 | }
191 | ret.add(sentenceCount);
192 | }
193 | return ret;
194 | }
195 |
196 | private <T> StringBuffer toListInR(List<T> list, String name) {
197 | StringBuffer buffer = new StringBuffer();
198 | buffer.append(name);
199 | buffer.append("=c(");
200 | for (T sc : list) {
201 | if (sc instanceof Double) {
202 | DecimalFormat df = new DecimalFormat("#.##");
203 | buffer.append(df.format(sc).replace(",", "."));
204 | } else {
205 | buffer.append(sc);
206 | }
207 | buffer.append(",");
208 | }
209 | if (list.size() > 0)
210 | buffer.deleteCharAt(buffer.length() - 1);
211 | buffer.append(");\n");
212 | return buffer;
213 | }
214 | private String getSimilarityScores(List<Double> similarityScores, int from, int to){
215 | String scores = "";
216 | int f = from-1;
217 | if (f<0)f=0;
218 | if(debug)System.out.println(f+"\t"+(to-1));
219 | for(int i =f;i<=to-1;i++){
220 | scores+=","+similarityScores.get(i);
221 | }
222 | if (scores.length()>0)scores=scores.substring(1);
223 | return scores;
224 | }
225 | private void annotateSegments(JCas jcas, List<Integer> segmentPositions,
226 | List<Double> depthScores, List<Integer> minimaPosition, List<Double> similarityScores) {
227 | List<Sentence> sentences = new ArrayList<Sentence>(JCasUtil.select(jcas, Sentence.class));
228 |
229 | //add first segment which has no score
230 | int endIdx;
231 | if (segmentPositions.get(segmentPositions.size()-1)!=(sentences.size()-1)){
232 | segmentPositions.add(sentences.size()-1);
233 | depthScores.add(0.0);
234 | }
235 | int endSentece;
236 | if (segmentPositions.size()>0){
237 | endIdx=sentences.get(segmentPositions.get(0)).getEnd();
238 | endSentece=segmentPositions.get(0);
239 | }else{
240 | endIdx=sentences.get(sentences.size()-1).getEnd();
241 | endSentece=sentences.size()-1;
242 | }
243 | addSegment(sentences.get(0).getBegin(),endIdx,0.0,getSimilarityScores(similarityScores, 0,endSentece),jcas);
244 | int segEnd;
245 | int segStart;
246 | for(int i=1;i segmentPositions,
263 | List depthScores, List minimaPosition) {
264 | Iterator<Sentence> sentenceItr = JCasUtil
265 | .iterator(jcas, Sentence.class);
266 | int sentenceCount = -1;
267 | int prevBreak = 0;
268 |
269 | for (final int sBreak : segmentPositions) {
270 | final SegmentScore score = new SegmentScore(jcas);
271 |
272 | Sentence segmentSentence = null;
273 |
274 | int beginOffset = 0;
275 | int endOffset = 0;
276 |
277 | // move sentenceItr to last sentence in segment
278 | for (; sentenceCount < sBreak; sentenceCount++) {
279 | segmentSentence = sentenceItr.next();
280 |
281 | if (sentenceCount == prevBreak) {
282 | beginOffset = segmentSentence.getBegin();
283 | System.out.println("BeginOffset: "+ beginOffset);
284 | }
285 | }
286 |
287 | if (segmentSentence != null) {
288 | endOffset = segmentSentence.getEnd();
289 | System.out.println("end offset "+endOffset);
290 | }
291 | score.setBegin(beginOffset);
292 | score.setEnd(endOffset);
293 | int idx = minimaPosition.indexOf(sBreak);
294 | if (idx < 0) {
295 | score.setScore(1.0);
296 | } else {
297 | score.setScore(depthScores.get(idx));
298 | }
299 | score.addToIndexes();
300 | if (printSegments) {
301 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
302 | + beginOffset + "\t" + endOffset);
303 | }
304 | prevBreak = sBreak;
305 | }
306 | }
307 |
308 | /**
309 | * expects a list with the sentence numbers at which segment boundaries are placed
310 | *
311 | * @param jcas
312 | * @param sentenceBreaks
313 | */
314 | private void annotateSegments(JCas jcas, List<Integer> sentenceBreaks) {
315 | Iterator<Sentence> sentenceItr = JCasUtil
316 | .iterator(jcas, Sentence.class);
317 | int sentenceCount = -1;
318 | int prevBreak = 0;
319 | if (printSegments) {
320 | System.out.println("Annotated Segments");
321 | System.out.println(sentenceBreaks.toString());
322 | }
323 |
324 | for (final int sBreak : sentenceBreaks) {
325 | final Segment seg = new Segment(jcas);
326 |
327 | Sentence segmentSentence = null;
328 |
329 | int beginOffset = 0;
330 | int endOffset = 0;
331 |
332 | // move sentenceItr to last sentence in segment
333 | for (; sentenceCount < sBreak; sentenceCount++) {
334 | segmentSentence = sentenceItr.next();
335 |
336 | if (sentenceCount == prevBreak) {
337 | beginOffset = segmentSentence.getBegin();
338 | }
339 | }
340 |
341 | if (segmentSentence != null) {
342 | endOffset = segmentSentence.getEnd();
343 | }
344 |
345 | seg.setBegin(beginOffset);
346 | seg.setEnd(endOffset);
347 | seg.addToIndexes();
348 |
349 | if (printSegments) {
350 | System.out.println(sBreak + "\t" + sentenceCount + "\t"
351 | + beginOffset + "\t" + endOffset);
352 | }
353 | prevBreak = sBreak;
354 | }
355 | }
356 | }
357 |
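
The annotator exposes its settings through the PARAM_* configuration parameters declared above. A hedged uimaFIT 1.x (org.uimafit) sketch of building a descriptor for it is shown below; the model path and all parameter values are placeholders, and the surrounding pipeline (collection reader, tokenizer, sentence splitter and the LDA topic annotators) has to be assembled separately.

    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
    import org.uimafit.factory.AnalysisEngineFactory;

    import de.tudarmstadt.langtech.semantics.segmentation.segmenter.annotator.TopicTilingSegmenterAnnotator;

    // Hedged sketch: all values below are placeholders, not recommended settings.
    public class TopicTilingDescriptorSketch {
        public static AnalysisEngineDescription createSegmenterDescription() throws Exception {
            return AnalysisEngineFactory.createPrimitiveDescription(
                    TopicTilingSegmenterAnnotator.class,
                    TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_DIRECTORY, "/path/to/lda/model/",
                    TopicTilingSegmenterAnnotator.PARAM_LDA_MODEL_NAME, "model-final",
                    TopicTilingSegmenterAnnotator.PARAM_WINDOW, 2,
                    TopicTilingSegmenterAnnotator.PARAM_REPEAT_SEGMENTATION, 1,
                    TopicTilingSegmenterAnnotator.PARAM_REPEAT_INFERENCE, 3,
                    TopicTilingSegmenterAnnotator.PARAM_INFERENCE_ITERATION, 100,
                    TopicTilingSegmenterAnnotator.PARAM_MODE_COUNTING, true,
                    // the accepted values for PARAM_DEPTH_SCORE are not documented in this file;
                    // the string below is only a guess
                    TopicTilingSegmenterAnnotator.PARAM_DEPTH_SCORE, "minima");
        }
    }
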
--------------------------------------------------------------------------------