├── tml ├── tml │ ├── corpora │ │ ├── invalidDocuments │ │ │ ├── empty.txt │ │ │ ├── empty2.txt │ │ │ └── justStopWords.txt │ │ ├── BerryDumais │ │ │ ├── B09.txt │ │ │ ├── Q01.txt │ │ │ ├── B01.txt │ │ │ ├── B10.txt │ │ │ ├── B02.txt │ │ │ ├── B04.txt │ │ │ ├── B12.txt │ │ │ ├── B07.txt │ │ │ ├── B14.txt │ │ │ ├── B11.txt │ │ │ ├── B06.txt │ │ │ ├── B08.txt │ │ │ ├── B13.txt │ │ │ ├── B16.txt │ │ │ ├── B15.txt │ │ │ ├── B03.txt │ │ │ ├── B17.txt │ │ │ └── B05.txt │ │ ├── introLSA │ │ │ ├── m4.txt │ │ │ ├── c3.txt │ │ │ ├── m2.txt │ │ │ ├── m1.txt │ │ │ ├── c4.txt │ │ │ ├── c1.txt │ │ │ ├── c2.txt │ │ │ ├── c5.txt │ │ │ └── m3.txt │ │ ├── handbookOfLSA │ │ │ ├── q5.txt │ │ │ ├── m3.txt │ │ │ ├── b2.txt │ │ │ ├── b3.txt │ │ │ ├── m1.txt │ │ │ ├── m4.txt │ │ │ ├── m5.txt │ │ │ ├── b1.txt │ │ │ ├── m2.txt │ │ │ └── b4.txt │ │ ├── identical │ │ │ ├── D1.txt │ │ │ ├── D2.txt │ │ │ └── D3.txt │ │ └── uppsala │ │ │ └── 0100.a1.txt │ ├── log │ │ └── README.txt │ ├── lucene │ │ └── README.txt │ ├── svd │ │ └── README.txt │ ├── lanczos │ │ ├── windows │ │ │ ├── README.txt │ │ │ ├── svd.exe │ │ │ └── sample.matrix │ │ ├── tmp │ │ │ └── README.txt │ │ └── unix │ │ │ ├── README.txt │ │ │ └── svd │ ├── processed │ │ └── README.txt │ ├── stanford │ │ └── README.txt │ ├── uploaded │ │ └── README │ ├── tmp │ │ └── README.txt │ └── tml.properties ├── svd │ └── README.txt ├── src │ ├── lanczos │ │ ├── win32 │ │ │ ├── lap2 │ │ │ ├── las2.make │ │ │ ├── makefile │ │ │ ├── matrix │ │ │ ├── matrix.orig │ │ │ ├── matrix.prob │ │ │ └── timersun.c │ │ └── unix │ │ │ ├── lap2 │ │ │ ├── matrix │ │ │ ├── makefile │ │ │ └── las2.h │ ├── main │ │ └── java │ │ │ ├── tml │ │ │ ├── tml.sqlite │ │ │ ├── vectorspace │ │ │ │ ├── operations │ │ │ │ │ ├── visualizations │ │ │ │ │ │ ├── Visualization.java │ │ │ │ │ │ ├── AbstractVisualization.java │ │ │ │ │ │ └── TagClouds.java │ │ │ │ │ ├── OperationListener.java │ │ │ │ │ ├── results │ │ │ │ │ │ ├── FactorAnalysisPlotResult.java │ │ │ │ │ │ ├── 
NullResult.java │ │ │ │ │ │ ├── package.html │ │ │ │ │ │ ├── LastPassageResult.java │ │ │ │ │ │ ├── TermRankedResult.java │ │ │ │ │ │ ├── ParagraphCoherenceIndexResult.java │ │ │ │ │ │ ├── AbstractResult.java │ │ │ │ │ │ ├── RapidAutomaticKeywordExtractionResult.java │ │ │ │ │ │ ├── TagCloudsResult.java │ │ │ │ │ │ ├── SummaryResult.java │ │ │ │ │ │ ├── PassageSimilarityResult.java │ │ │ │ │ │ ├── PassageDistancesResult.java │ │ │ │ │ │ ├── LexiconAnalysisResult.java │ │ │ │ │ │ ├── Summary.java │ │ │ │ │ │ ├── PassageClusteringLingoResult.java │ │ │ │ │ │ ├── RelationshipExtractionResult.java │ │ │ │ │ │ ├── PassageExtractionSummarizationResult.java │ │ │ │ │ │ └── TermsExtractionSummarizationResult.java │ │ │ │ │ ├── OperationEvent.java │ │ │ │ │ ├── summarization │ │ │ │ │ │ ├── SummarizationOperation.java │ │ │ │ │ │ ├── VectorLengthSummarization.java │ │ │ │ │ │ └── LatentSemanticAnalysisSummarization.java │ │ │ │ │ ├── package.html │ │ │ │ │ ├── LastPassage.java │ │ │ │ │ ├── ConceptExtraction.java │ │ │ │ │ ├── Summary.java │ │ │ │ │ ├── FactorAnalysisPlot.java │ │ │ │ │ ├── LexiconAnalysis.java │ │ │ │ │ ├── TagClouds.java │ │ │ │ │ ├── RelationshipExtraction.java │ │ │ │ │ ├── CompoundNounsSummarized.java │ │ │ │ │ ├── ParagraphCoherenceIndex.java │ │ │ │ │ └── PassageExtractionSummarization.java │ │ │ │ ├── factorisation │ │ │ │ │ ├── SingularValueDecomposition.java │ │ │ │ │ ├── MatrixFactorisation.java │ │ │ │ │ └── SpaceDecomposition.java │ │ │ │ ├── TermWeightingException.java │ │ │ │ ├── NoDocumentsInCorpusException.java │ │ │ │ ├── package.html │ │ │ │ ├── EmptyTextPassageException.java │ │ │ │ ├── NotEnoughTermsInCorpusException.java │ │ │ │ └── SVD.java │ │ │ ├── storage │ │ │ │ ├── RepositoryListener.java │ │ │ │ ├── importers │ │ │ │ │ ├── PdfImporter.java │ │ │ │ │ ├── package.html │ │ │ │ │ ├── TextImporter.java │ │ │ │ │ ├── Importer.java │ │ │ │ │ ├── HtmlImporter.java │ │ │ │ │ └── AbstractImporter.java │ │ │ │ ├── TmlCleanupTask.java │ │ │ 
│ ├── TmlAnnotatorTask.java │ │ │ │ ├── package.html │ │ │ │ ├── RepositoryEvent.java │ │ │ │ ├── DocumentAnnotator.java │ │ │ │ └── DocumentCleanup.java │ │ │ ├── utils │ │ │ │ ├── package.html │ │ │ │ ├── Highlighting.java │ │ │ │ ├── RegexUtils.java │ │ │ │ ├── JDBCUtils.java │ │ │ │ ├── LuceneUtils.java │ │ │ │ ├── DBUtils.java │ │ │ │ └── MatrixUtils.java │ │ │ ├── corpus │ │ │ │ ├── RepositoryCorpus.java │ │ │ │ ├── SearchResultsCorpus.java │ │ │ │ ├── package.html │ │ │ │ ├── ParagraphCorpus.java │ │ │ │ └── SentenceCorpus.java │ │ │ ├── package.html │ │ │ ├── annotators │ │ │ │ ├── AbstractAnnotator.java │ │ │ │ └── Annotator.java │ │ │ ├── tml.properties │ │ │ ├── overview.html │ │ │ ├── log4j.properties │ │ │ ├── log4j.debug.properties │ │ │ ├── test │ │ │ │ └── AbstractTmlIndexingTest.java │ │ │ └── tml.conceptmap.rules.xml │ │ │ └── package.html │ └── test │ │ ├── java │ │ └── tml │ │ │ └── test │ │ │ ├── FactorAnalysisPlotTest.java │ │ │ ├── TagCloudsTest.java │ │ │ ├── DbConnectionTest.java │ │ │ ├── StemmingTest.java │ │ │ ├── NonNegativeMatrixFactorizationTest.java │ │ │ ├── LanczosTest.java │ │ │ ├── SimpleCorpusTest.java │ │ │ ├── IndexingHtmlTest.java │ │ │ ├── IndexingPlainTextTest.java │ │ │ └── ReadabilityTest.java │ │ └── resources │ │ └── tml.properties ├── lanczos │ ├── unix │ │ └── svd │ └── windows │ │ └── svd.exe ├── www │ ├── doc │ │ ├── resources │ │ │ ├── h1_hdr.png │ │ │ ├── bkgheader.png │ │ │ ├── inherit.gif │ │ │ ├── bkg_blkheader.png │ │ │ └── bkg_gradient.gif │ │ └── stylesheet.css │ └── tml.css ├── corpusparameters.properties ├── copyright.txt ├── javadoc.xml └── pom.xml └── README.md /tml/tml/corpora/invalidDocuments/empty.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tml/tml/corpora/invalidDocuments/empty2.txt: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tml/svd/README.txt: -------------------------------------------------------------------------------- 1 | Cache directory for SVD binaries -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B09.txt: -------------------------------------------------------------------------------- 1 | Nonlinear Systems -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/m4.txt: -------------------------------------------------------------------------------- 1 | Graph minors: A survey -------------------------------------------------------------------------------- /tml/tml/log/README.txt: -------------------------------------------------------------------------------- 1 | Folder to store the log files -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/Q01.txt: -------------------------------------------------------------------------------- 1 | Application and Theory -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/q5.txt: -------------------------------------------------------------------------------- 1 | Recipe for White bread -------------------------------------------------------------------------------- /tml/tml/lucene/README.txt: -------------------------------------------------------------------------------- 1 | Folder to store the Lucene index -------------------------------------------------------------------------------- /tml/tml/svd/README.txt: -------------------------------------------------------------------------------- 1 | Cache directory for SVD binaries -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/m3.txt: -------------------------------------------------------------------------------- 1 | 
Drum and Bass Composition -------------------------------------------------------------------------------- /tml/tml/lanczos/windows/README.txt: -------------------------------------------------------------------------------- 1 | Win32 executable of Lanczos -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B01.txt: -------------------------------------------------------------------------------- 1 | A Course on Integral Equations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B10.txt: -------------------------------------------------------------------------------- 1 | Ordinary Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/b2.txt: -------------------------------------------------------------------------------- 1 | Ingredients for Crescent Rolls -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/b3.txt: -------------------------------------------------------------------------------- 1 | A Recipe for Sor dough Bread -------------------------------------------------------------------------------- /tml/src/lanczos/win32/lap2: -------------------------------------------------------------------------------- 1 | 'matrix' 9 9 -1.0e-30 1.0e-30 TRUE 1.0e-6 0 -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/m1.txt: -------------------------------------------------------------------------------- 1 | Rock and Roll Music in the 1960's -------------------------------------------------------------------------------- /tml/tml/corpora/identical/D1.txt: -------------------------------------------------------------------------------- 1 | Un documento que tiene varias palabras. 
-------------------------------------------------------------------------------- /tml/tml/corpora/identical/D2.txt: -------------------------------------------------------------------------------- 1 | Un documento que tiene varias palabras. -------------------------------------------------------------------------------- /tml/tml/corpora/identical/D3.txt: -------------------------------------------------------------------------------- 1 | Otro archivo de texto sin texto en comun. -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/c3.txt: -------------------------------------------------------------------------------- 1 | The EPS user interface management system -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/m2.txt: -------------------------------------------------------------------------------- 1 | The intersection graph of paths in trees -------------------------------------------------------------------------------- /tml/tml/lanczos/tmp/README.txt: -------------------------------------------------------------------------------- 1 | Temporary folder for Lanczos calculations -------------------------------------------------------------------------------- /tml/src/lanczos/unix/lap2: -------------------------------------------------------------------------------- 1 | 'introLSA' 9 9 -1.0e-30 1.0e-30 TRUE 1.0e-6 0 2 | -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/m4.txt: -------------------------------------------------------------------------------- 1 | A Perspective of Rock Music in the 90's -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/m5.txt: -------------------------------------------------------------------------------- 1 | Music and Composition of Popular Bands 
-------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/m1.txt: -------------------------------------------------------------------------------- 1 | The generation of random, binary, ordered trees -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/b1.txt: -------------------------------------------------------------------------------- 1 | How to Make Bread and Rolls, a Demonstration -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/c4.txt: -------------------------------------------------------------------------------- 1 | System and human system engineering testing of EPS -------------------------------------------------------------------------------- /tml/lanczos/unix/svd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/lanczos/unix/svd -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B02.txt: -------------------------------------------------------------------------------- 1 | Attractors for Semigroups and Evolution Equations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B04.txt: -------------------------------------------------------------------------------- 1 | Geometrical Aspects of Partial Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B12.txt: -------------------------------------------------------------------------------- 1 | Oscillation theory of Delay Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/m2.txt: -------------------------------------------------------------------------------- 1 | Different Drum 
Rolls, a Demonstration of Techniques -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/c1.txt: -------------------------------------------------------------------------------- 1 | Human machine interface for ABC computer applications -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/c2.txt: -------------------------------------------------------------------------------- 1 | A survey of user opinion of computer system response time -------------------------------------------------------------------------------- /tml/tml/processed/README.txt: -------------------------------------------------------------------------------- 1 | Document files that were processed from the uploaded dir. -------------------------------------------------------------------------------- /tml/tml/stanford/README.txt: -------------------------------------------------------------------------------- 1 | Here you should have the PCFG file from Stanford's NLP parser -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B07.txt: -------------------------------------------------------------------------------- 1 | Knapsack Problems: Algorithms and Computer Implementations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B14.txt: -------------------------------------------------------------------------------- 1 | Sinc Methods for Quadrature and Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/handbookOfLSA/b4.txt: -------------------------------------------------------------------------------- 1 | A Quick Recipe for Pizza Dough using Organic Ingredients -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/c5.txt: 
-------------------------------------------------------------------------------- 1 | Relation of user perceived response time to error measurement -------------------------------------------------------------------------------- /tml/tml/corpora/introLSA/m3.txt: -------------------------------------------------------------------------------- 1 | Graph minors IV: Widths of trees and well-quasi-wakaordering -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tml 2 | === 3 | 4 | Text Mining Library with a focus on Latent Semantic Analysis 5 | -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B11.txt: -------------------------------------------------------------------------------- 1 | Oscillation Theory of Neutral Differential Equations with Delay -------------------------------------------------------------------------------- /tml/tml/lanczos/unix/README.txt: -------------------------------------------------------------------------------- 1 | Unix version of lanczos, compiled as binary for i386 architecture -------------------------------------------------------------------------------- /tml/tml/lanczos/unix/svd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/tml/lanczos/unix/svd -------------------------------------------------------------------------------- /tml/lanczos/windows/svd.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/lanczos/windows/svd.exe -------------------------------------------------------------------------------- /tml/tml/corpora/invalidDocuments/justStopWords.txt: -------------------------------------------------------------------------------- 1 | one two 2 | three 3 | 4 | 5 | 6 | 7 | 
four -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B06.txt: -------------------------------------------------------------------------------- 1 | Introduction to Hamiltonian Dynnamical Systems and the N-Body Problem -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B08.txt: -------------------------------------------------------------------------------- 1 | Methods of Solving Singular Systems of Ordinary Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B13.txt: -------------------------------------------------------------------------------- 1 | Pseudodifferential operators and nonlinear Partial Differential Equations -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B16.txt: -------------------------------------------------------------------------------- 1 | The Boundary Integral Approach to Static and Dynamic Contact Problems -------------------------------------------------------------------------------- /tml/src/lanczos/win32/las2.make: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/src/lanczos/win32/las2.make -------------------------------------------------------------------------------- /tml/src/main/java/tml/tml.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/src/main/java/tml/tml.sqlite -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B15.txt: -------------------------------------------------------------------------------- 1 | Stability of Stocastic Differential Equations with Respect to Semi-Martingales 
-------------------------------------------------------------------------------- /tml/tml/lanczos/windows/svd.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/tml/lanczos/windows/svd.exe -------------------------------------------------------------------------------- /tml/www/doc/resources/h1_hdr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/www/doc/resources/h1_hdr.png -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B03.txt: -------------------------------------------------------------------------------- 1 | Automatic Diferentiation of Algorithms: Theory, Implementation, and Application -------------------------------------------------------------------------------- /tml/tml/uploaded/README: -------------------------------------------------------------------------------- 1 | This directory is visited by the indexer thread to check for new files to add to the repository -------------------------------------------------------------------------------- /tml/www/doc/resources/bkgheader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/www/doc/resources/bkgheader.png -------------------------------------------------------------------------------- /tml/www/doc/resources/inherit.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/www/doc/resources/inherit.gif -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B17.txt: -------------------------------------------------------------------------------- 1 | The Double Melling-Barnes Type Integrals and Their Applications to Convolution Theory 
-------------------------------------------------------------------------------- /tml/www/doc/resources/bkg_blkheader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/www/doc/resources/bkg_blkheader.png -------------------------------------------------------------------------------- /tml/www/doc/resources/bkg_gradient.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/villalon/tml/HEAD/tml/www/doc/resources/bkg_gradient.gif -------------------------------------------------------------------------------- /tml/tml/corpora/BerryDumais/B05.txt: -------------------------------------------------------------------------------- 1 | Ideals, Varieties, and Algorithms - An Introduction to Commputational Algebraic Geometry and Commutative Algebra -------------------------------------------------------------------------------- /tml/tml/tmp/README.txt: -------------------------------------------------------------------------------- 1 | Directory for temporary file processing. Files are obtained from the uploaded dir and then sent to the processed dir if successful. 
-------------------------------------------------------------------------------- /tml/corpusparameters.properties: -------------------------------------------------------------------------------- 1 | termselcrit = MIN_DF 2 | termselthre = 2 3 | reduxcrit = PCT 4 | reduxthre = 50 5 | localtw = LOGTF 6 | globaltw = Entropy 7 | maxdocs = 9999 8 | normalize = true 9 | lanczos = false 10 | -------------------------------------------------------------------------------- /tml/www/tml.css: -------------------------------------------------------------------------------- 1 | @CHARSET "ISO-8859-1"; 2 | 3 | BODY { 4 | font-family: verdana; 5 | font-size: small; 6 | width: 50%; 7 | } 8 | 9 | H1, H2 { 10 | background-color: #CCCCCC; 11 | color: black; 12 | } 13 | 14 | .code { 15 | background-color: #EEEEEE; 16 | font-family: monospace; 17 | } -------------------------------------------------------------------------------- /tml/src/lanczos/win32/makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS= -O -c 3 | LIB= -lm 4 | TIMER= timersun.o 5 | 6 | all: las2 7 | 8 | las2.o: las2.h 9 | 10 | .c.o: $*.c 11 | ${CC} ${CFLAGS} $*.c 12 | 13 | las2: las2.o ${TIMER} 14 | ${CC} -o $@ las2.o ${TIMER} ${LIB} 15 | 16 | clean: 17 | del las2.o 18 | del timersun.o 19 | del las2.exe 20 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/visualizations/Visualization.java: -------------------------------------------------------------------------------- 1 | package tml.vectorspace.operations.visualizations; 2 | 3 | import tml.vectorspace.operations.Operation; 4 | 5 | public interface Visualization { 6 | public Operation getOperation(); 7 | 8 | public void setOperation(Operation operation); 9 | 10 | public String getHTML(); 11 | } 12 | -------------------------------------------------------------------------------- 
/tml/src/main/java/tml/vectorspace/operations/OperationListener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this template, choose Tools | Templates 3 | * and open the template in the editor. 4 | */ 5 | 6 | package tml.vectorspace.operations; 7 | 8 | import java.util.EventListener; 9 | 10 | /** 11 | * 12 | * @author Jorge Villalon 13 | */ 14 | public interface OperationListener extends EventListener { 15 | void operationAction(OperationEvent evt); 16 | } 17 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/RepositoryListener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this template, choose Tools | Templates 3 | * and open the template in the editor. 4 | */ 5 | 6 | package tml.storage; 7 | 8 | import java.util.EventListener; 9 | /** 10 | * This interface defines the required methods to implement 11 | * a Repository listener, which will be called every time 12 | * the Repository performs a step in its process. 13 | * 14 | * @author Jorge Villalon 15 | */ 16 | public interface RepositoryListener extends EventListener { 17 | public void repositoryAction(RepositoryEvent evt); 18 | } 19 | -------------------------------------------------------------------------------- /tml/copyright.txt: -------------------------------------------------------------------------------- 1 | Copyright ${date} Jorge Villalon (jorge.villalon@uai.cl) 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/visualizations/AbstractVisualization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.vectorspace.operations.visualizations; 5 | 6 | import tml.vectorspace.operations.Operation; 7 | 8 | /** 9 | * @author Jorge 10 | * 11 | */ 12 | public abstract class AbstractVisualization implements Visualization { 13 | 14 | protected Operation operation; 15 | 16 | @Override 17 | public Operation getOperation() { 18 | return operation; 19 | } 20 | 21 | @Override 22 | public void setOperation(Operation operation) { 23 | this.operation = operation; 24 | } 25 | 26 | @Override 27 | public abstract String getHTML(); 28 | } 29 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/FactorAnalysisPlotResult.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.vectorspace.operations.results; 5 | 6 | /** 7 | * @author Jorge 8 | * 9 | */ 10 | public class FactorAnalysisPlotResult extends AbstractResult { 11 | 12 | String name; 13 | double x; 14 | double y; 15 | public String getName() { 16 | return name; 17 | } 18 | public void setName(String name) { 19 | this.name = name; 20 | } 21 | public double getX() { 22 | return x; 23 | } 24 | public void setX(double x) { 25 | this.x = x; 26 | } 27 | public double getY() { 28 | return y; 29 | } 30 | public void setY(double y) { 31 | this.y = y; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tml/src/lanczos/win32/matrix: -------------------------------------------------------------------------------- 1 | Learning Systems Group University of 
Sydney matrix 2 | # 3 | rra 12 9 28 0 4 | (10i8) (10i8) (8f10.3) (8f10.3) 5 | 1 4 10 14 17 20 21 23 26 29 6 | 1 4 5 1 7 8 9 10 12 2 5 9 12 2 4 7 | 9 7 10 12 11 3 11 3 6 11 3 6 8 8 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 9 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 2.000 10 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 11 | 1.000 1.000 1.000 1.000 12 | -------------------------------------------------------------------------------- /tml/tml/lanczos/windows/sample.matrix: -------------------------------------------------------------------------------- 1 | Learning Systems Group University of Sydney matrix 2 | # 3 | rra 12 9 28 0 4 | (10i8) (10i8) (8f10.3) (8f10.3) 5 | 1 4 10 14 17 20 21 23 26 29 6 | 1 4 5 1 7 8 9 10 12 2 5 9 12 2 4 7 | 9 7 10 12 11 3 11 3 6 11 3 6 8 8 | 1,000 1,000 1,000 1,000 1,000 1,000 1,000 1,000 9 | 1,000 1,000 1,000 1,000 1,000 1,000 1,000 2,000 10 | 1,000 1,000 1,000 1,000 1,000 1,000 1,000 1,000 11 | 1,000 1,000 1,000 1,000 12 | -------------------------------------------------------------------------------- /tml/src/lanczos/unix/matrix: -------------------------------------------------------------------------------- 1 | Bellcore ADI Linguistics Data belladit 2 | # 3 | rra 12 9 28 0 4 | (10i8) (10i8) (8f10.3) (8f10.3) 5 | 1 4 10 14 17 20 21 23 26 6 | 1 2 3 3 4 5 6 7 9 2 7 | 4 5 8 1 5 8 4 6 7 10 8 | 10 11 10 11 12 9 11 12 9 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 10 | 1.000 1.000 1.000 1.000 1.000 1.000 2.000 1.000 11 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 12 | 1.000 1.000 1.000 1.000 13 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/OperationEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this template, choose Tools | Templates 3 | * and open the template in the editor. 
4 | */ 5 | 6 | package tml.vectorspace.operations; 7 | 8 | import java.util.EventObject; 9 | 10 | /** 11 | * 12 | * @author Jorge Villalon 13 | */ 14 | public class OperationEvent extends EventObject { 15 | /** 16 | * 17 | */ 18 | private static final long serialVersionUID = -6181484640186835815L; 19 | private int maximum; 20 | private int current; 21 | 22 | public int getCurrent() { 23 | return current; 24 | } 25 | 26 | public int getMaximum() { 27 | return maximum; 28 | } 29 | 30 | public OperationEvent(Object source, int max, int curr) { 31 | super(source); 32 | this.current = curr; 33 | this.maximum = max; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tml/src/lanczos/win32/matrix.orig: -------------------------------------------------------------------------------- 1 | Bellcore ADI Linguistics Data belladit 2 | # 3 | rra 12 9 28 0 4 | (10i8) (10i8) (8f10.3) (8f10.3) 5 | 1 4 10 14 17 20 21 23 26 6 | 1 2 3 3 4 5 6 7 9 2 7 | 4 5 8 1 5 8 4 6 7 10 8 | 10 11 10 11 12 9 11 12 9 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 10 | 1.000 1.000 1.000 1.000 1.000 1.000 2.000 1.000 11 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 12 | 1.000 1.000 1.000 1.000 -------------------------------------------------------------------------------- /tml/src/lanczos/win32/matrix.prob: -------------------------------------------------------------------------------- 1 | Title: using stdin 2 | # 3 | rra 9 12 28 0 4 | (10i8) (10i8) (8f10.3) (8f10.3) 5 | 1 3 5 8 10 12 14 16 18 21 6 | 23 26 29 7 | 1 2 3 4 7 8 9 1 4 1 8 | 3 8 9 2 5 2 9 2 3 4 9 | 2 5 6 7 8 2 3 5 10 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 11 | 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 12 | 1.000 1.000 1.000 2.000 1.000 1.000 1.000 1.000 13 | 1.000 1.000 1.000 1.000 14 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/PdfImporter.java: 
/**
 * Placeholder for a PDF importer. The class currently declares no members
 * and neither extends nor implements any other type.
 */
public class PdfImporter {

}
/**
 * This result applies to operations where no complex results are stored.
 * It adds no members of its own to {@link AbstractResult}.
 *
 * @author Jorge Villalon
 *
 */
public class NullResult extends AbstractResult {

}
/**
 * Marker interface for summarization operations: it extends {@link Operation}
 * without declaring any additional methods.
 *
 * NOTE(review): the file imports the Summary result class but never uses it,
 * which suggests a type argument on Operation may have been lost - confirm
 * against the original source.
 */
public interface SummarizationOperation extends Operation {
}
/***********************************************************************
 *                                                                     *
 *                          timer()                                    *
 *  User-supplied hook documented as returning elapsed cpu time        *
 *  (float).                                                           *
 *                                                                     *
 *  In this file the getrusage()-based measurement is disabled, so     *
 *  no time is actually measured: every call yields the same constant  *
 *  value, 1000.0.                                                     *
 *                                                                     *
 ***********************************************************************/
float timer(void)
{
    /* fixed stand-in for the elapsed time that is no longer computed */
    const float fixed_value = 1000.0f;

    return fixed_value;
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.corpus; 17 | 18 | /** 19 | * This class represents a corpus with all the documents in the repository 20 | * 21 | * @author Jorge Villalon 22 | * 23 | */ 24 | public class RepositoryCorpus extends Corpus { 25 | 26 | /** 27 | * Creates a {@link Corpus} with all documents in the repository 28 | */ 29 | public RepositoryCorpus() { 30 | this.luceneQuery = "type:document"; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/corpus/SearchResultsCorpus.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.corpus; 17 | 18 | /** 19 | * This class represents a general corpus where any search criteria 20 | * can be used 21 | * @author Jorge Villalon 22 | * 23 | */ 24 | public class SearchResultsCorpus extends Corpus { 25 | 26 | /** 27 | * @param query the query to search the repository 28 | */ 29 | public SearchResultsCorpus(String query) { 30 | this.luceneQuery = query; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/FactorAnalysisPlotTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.test; 5 | 6 | import org.junit.BeforeClass; 7 | import org.junit.Test; 8 | 9 | import tml.Configuration; 10 | import tml.corpus.RepositoryCorpus; 11 | import tml.corpus.CorpusParameters.DimensionalityReduction; 12 | import tml.vectorspace.operations.FactorAnalysisPlot; 13 | import static org.junit.Assert.*; 14 | 15 | /** 16 | * @author Jorge 17 | * 18 | */ 19 | public class FactorAnalysisPlotTest extends AbstractTmlIndexingTest { 20 | 21 | static FactorAnalysisPlot operation; 22 | 23 | @BeforeClass 24 | public static void setUpBeforeClass() throws Exception { 25 | AbstractTmlIndexingTest.setUpBeforeClass(); 26 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/handbookOfLSA"); 27 | RepositoryCorpus corpus = new RepositoryCorpus(); 28 | corpus.getParameters().setDimensionalityReduction(DimensionalityReduction.PCT); 29 | corpus.getParameters().setDimensionalityReductionThreshold(20); 30 | corpus.load(repository); 31 | 32 | operation = new FactorAnalysisPlot(); 33 | operation.setCorpus(corpus); 34 | operation.start(); 35 | } 36 | 37 | @Test 38 | public void checkTagClouds() { 39 | assertNotNull(operation); 40 | 41 | operation.printResults(); 42 | } 43 | } 44 | 
-------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | Importers implement text cleaning from different file formats. 27 | 28 |

Package Specification

29 | 30 |

Each importer parses a file depending on its extension, and implements a method to return the content as plain text.

31 | 32 | 33 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/TmlCleanupTask.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.storage; 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.util.TimerTask; 9 | 10 | import org.apache.log4j.Logger; 11 | 12 | import tml.storage.Repository; 13 | 14 | /** 15 | * @author Jorge Villalon 16 | * 17 | */ 18 | public class TmlCleanupTask extends TimerTask { 19 | 20 | private static Logger logger = Logger.getLogger(TmlCleanupTask.class); 21 | private Repository repository; 22 | /** 23 | * 24 | */ 25 | public TmlCleanupTask(Repository repo) { 26 | this.repository = repo; 27 | } 28 | 29 | /* (non-Javadoc) 30 | * @see java.util.TimerTask#run() 31 | */ 32 | @Override 33 | public void run() { 34 | File lock = new File("tml.cleanup.lock"); 35 | if (lock.exists()) { 36 | logger.debug("Cleanup Timer still running! 
Skipping execution."); 37 | } 38 | try { 39 | lock.createNewFile(); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | logger.error("Couldn't create annotator lock file"); 43 | return; 44 | } 45 | 46 | Thread th = repository.cleanup(); 47 | try { 48 | th.join(); 49 | } catch (InterruptedException e) { 50 | e.printStackTrace(); 51 | logger.error(e.getMessage()); 52 | } 53 | 54 | lock.delete(); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/TmlAnnotatorTask.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.storage; 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.util.TimerTask; 9 | 10 | import org.apache.log4j.Logger; 11 | 12 | import tml.storage.Repository; 13 | 14 | /** 15 | * @author Jorge Villalon 16 | * 17 | */ 18 | public class TmlAnnotatorTask extends TimerTask { 19 | 20 | private static Logger logger = Logger.getLogger(TmlAnnotatorTask.class); 21 | private Repository repository; 22 | /** 23 | * 24 | */ 25 | public TmlAnnotatorTask(Repository repo) { 26 | this.repository = repo; 27 | } 28 | 29 | /* (non-Javadoc) 30 | * @see java.util.TimerTask#run() 31 | */ 32 | @Override 33 | public void run() { 34 | File lock = new File("tml.annotator.lock"); 35 | if (lock.exists()) { 36 | logger.debug("Annotator Timer still running! 
Skipping execution."); 37 | } 38 | try { 39 | lock.createNewFile(); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | logger.error("Couldn't create annotator lock file"); 43 | return; 44 | } 45 | 46 | Thread th = repository.annotateDocuments(); 47 | try { 48 | th.join(); 49 | } catch (InterruptedException e) { 50 | e.printStackTrace(); 51 | logger.error(e.getMessage()); 52 | } 53 | 54 | lock.delete(); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | Implements a repository of documents, from which several corpora can be obtained. 27 | 28 |

Package Specification

29 | 30 |

This package implements a simple repository for document management. It uses Lucene for tokenizing, stemming and removing stop words.

31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/TagCloudsTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.test; 5 | 6 | import org.junit.BeforeClass; 7 | import org.junit.Test; 8 | 9 | import tml.Configuration; 10 | import tml.corpus.Corpus; 11 | import tml.corpus.RepositoryCorpus; 12 | import tml.vectorspace.operations.TagClouds; 13 | import static org.junit.Assert.*; 14 | 15 | /** 16 | * @author Jorge 17 | * 18 | */ 19 | public class TagCloudsTest extends AbstractTmlIndexingTest { 20 | 21 | static TagClouds operation; 22 | 23 | @BeforeClass 24 | public static void setUpBeforeClass() throws Exception { 25 | AbstractTmlIndexingTest.setUpBeforeClass(); 26 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/uppsala"); 27 | Corpus corpus = new RepositoryCorpus(); 28 | corpus.load(repository); 29 | 30 | operation = new TagClouds(); 31 | operation.setCorpus(corpus); 32 | operation.start(); 33 | } 34 | 35 | @Test 36 | public void checkTagClouds() { 37 | assertNotNull(operation); 38 | 39 | operation.printResults(); 40 | } 41 | 42 | @Test 43 | public void checkVisualization() { 44 | tml.vectorspace.operations.visualizations.TagClouds visualization = new tml.vectorspace.operations.visualizations.TagClouds(); 45 | visualization.setOperation(operation); 46 | System.out.println(visualization.getHTML()); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | General package, only used as container for the configuration of the library. 27 | 28 |

Package Specification

29 | 30 |

This package is a general container for the rest of the packages. It also contains a Configuration class that reads the default configuration from the jar file or from the filesystem.

31 | 32 | 33 | -------------------------------------------------------------------------------- /tml/src/test/resources/tml.properties: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (C) 2001, 2007 University of Sydney 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program; if not, write to the Free Software 16 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 17 | # USA 18 | # 19 | # http://www.gnu.org/licenses/gpl.txt 20 | ############################################################################### 21 | # Set root logger level to DEBUG and its only appender to A1. 22 | log4j.rootLogger=DEBUG, A1 23 | 24 | # A1 is set to be a ConsoleAppender. 25 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 26 | 27 | # A1 uses PatternLayout. 
28 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 29 | log4j.appender.A1.layout.ConversionPattern=[%-5p] %-4r [%t] %-20c{2} - %m %x %n 30 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | Operations are predefined sets of interesting information that can be obtained from a Semantic Space. 27 | 28 |

Package Specification

29 | 30 |

Operations implement algorithms to obtain patterns of interest from a Semantic Space, such as extracting the most relevant passages or terms, or to obtain a set of labelled clusters.

31 | 32 | 33 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/LastPassageResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * The content of the last passage. 
21 | * 22 | * @author Jorge Villalon 23 | * 24 | */ 25 | public class LastPassageResult extends AbstractResult { 26 | 27 | String passage; 28 | 29 | /** 30 | * @return the passage 31 | */ 32 | public String getPassage() { 33 | return passage; 34 | } 35 | 36 | /** 37 | * @param passage the passage to set 38 | */ 39 | public void setPassage(String passage) { 40 | this.passage = passage; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/TextImporter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.storage.importers; 18 | 19 | /** 20 | * TextImporter implements the simples importer of plain text, therefore 21 | * it just returns the content as it is. 
22 | * 23 | * @author Jorge Villalon 24 | * 25 | */ 26 | public class TextImporter extends AbstractImporter implements Importer { 27 | 28 | @Override 29 | public String getCleanContent(String content) { 30 | return content; 31 | } 32 | 33 | @Override 34 | protected String[] getFileExtensions() { 35 | String[] extensions = { "txt" }; 36 | return extensions; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/factorisation/SingularValueDecomposition.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.factorisation; 17 | 18 | import Jama.Matrix; 19 | 20 | public class SingularValueDecomposition extends MatrixFactorisation { 21 | 22 | private Jama.SingularValueDecomposition svd = null; 23 | 24 | @Override 25 | public void process(Matrix v) { 26 | svd = new Jama.SingularValueDecomposition(v); 27 | this.decomposition = new SpaceDecomposition(); 28 | this.decomposition.setUkdata(svd.getU().getArray()); 29 | this.decomposition.setSkdata(svd.getS().getArray()); 30 | this.decomposition.setVkdata(svd.getV().getArray()); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/DbConnectionTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.test; 5 | 6 | 7 | import java.io.File; 8 | import java.io.IOException; 9 | 10 | import org.apache.lucene.index.CorruptIndexException; 11 | import org.apache.lucene.store.LockObtainFailedException; 12 | import org.junit.BeforeClass; 13 | import org.junit.Test; 14 | 15 | import tml.Configuration; 16 | 17 | import static org.junit.Assert.*; 18 | 19 | /** 20 | * @author Jorge Villalon 21 | * 22 | */ 23 | public class DbConnectionTest extends AbstractTmlIndexingTest { 24 | 25 | /** 26 | * @throws java.lang.Exception 27 | */ 28 | @BeforeClass 29 | public static void setUpBeforeClass() throws Exception { 30 | AbstractTmlIndexingTest.setUpBeforeClass(); 31 | } 32 | 33 | @Test 34 | public void checkConnection() { 35 | assertNotNull(repository.getDbConnection()); 36 | } 37 | 38 | @Test 39 | public void addMetaData() throws LockObtainFailedException, CorruptIndexException, IOException { 40 | File[] files = new File[1]; 41 | files[0] = new File(Configuration.getTmlFolder() + "/corpora/uppsala/0100.a1.txt"); 42 | repository.addDocumentsInList(files); 43 | } 44 | 45 | @Test 46 | public 
void getNullMetaData() { 47 | String metadata = repository.getAnnotations("0100.a1", "penntree"); 48 | assertNull(metadata); 49 | metadata = repository.getAnnotations("p1d0100.a1", "penntree"); 50 | assertNull(metadata); 51 | metadata = repository.getAnnotations("s1d0100.a1", "penntree"); 52 | assertNull(metadata); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/corpus/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | Implements all the classes required for corpora management as Bags of Words, it also includes NLP for sentences. 27 | 28 |

Package Specification

29 | 30 |

This package implements the bag of words approach for documents at three levels: Document, paragraph and sentences. As grammatical information is available at the sentence level, it also includes the PennTree bank tree parse of each sentence.

/**
 * Signals a failure while applying the term weighting criteria.
 *
 * @author Jorge Villalon
 */
public class TermWeightingException extends Exception {

    private static final long serialVersionUID = -7804139372695995041L;

    /** Fixed detail message shared by both constructors. */
    private static final String MESSAGE = "Exception while calculating Term weighting scheme";

    /**
     * Creates the exception with the fixed message and no cause.
     */
    public TermWeightingException() {
        this((Exception) null);
    }

    /**
     * Creates the exception with the fixed message and the given cause.
     *
     * @param e the underlying exception, may be null
     */
    public TermWeightingException(Exception e) {
        super(MESSAGE, e);
    }
}
Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.corpus; 17 | 18 | /** 19 | * Corpus that represents the paragraphs of a {@link TextDocument} 20 | * 21 | * @author Jorge Villalon 22 | * 23 | */ 24 | public class ParagraphCorpus extends Corpus { 25 | 26 | /** 27 | * @param document the {@link TextDocument} to which the paragraphs belong 28 | * @throws Exception if the document is null 29 | */ 30 | public ParagraphCorpus(TextDocument document) throws Exception { 31 | 32 | if(document == null) 33 | throw new Exception("A paragraph corpus must belong to a document"); 34 | 35 | this.luceneQuery = "type:paragraph AND reference:" + document.getExternalId(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/annotators/AbstractAnnotator.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.annotators; 17 | 18 | import java.io.IOException; 19 | import java.util.ArrayList; 20 | 21 | public class AbstractAnnotator { 22 | 23 | private String fieldName; 24 | protected ArrayList types; 25 | 26 | public ArrayList getTypes() { 27 | return types; 28 | } 29 | public AbstractAnnotator(String fieldName, String[] types) throws IOException { 30 | this.fieldName = fieldName; 31 | this.types = new ArrayList(); 32 | for(String type : types) { 33 | this.types.add(type); 34 | } 35 | } 36 | /** 37 | * @return the fieldName 38 | */ 39 | public String getFieldName() { 40 | return fieldName; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/TermRankedResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.results; 17 | 18 | public class TermRankedResult extends AbstractResult { 19 | 20 | String term; 21 | double rank; 22 | /** 23 | * @param term the term to set 24 | */ 25 | public void setTerm(String term) { 26 | this.term = term; 27 | } 28 | /** 29 | * @return the term 30 | */ 31 | public String getTerm() { 32 | return term; 33 | } 34 | /** 35 | * @param rank the rank to set 36 | */ 37 | public void setRank(double rank) { 38 | this.rank = rank; 39 | } 40 | /** 41 | * @return the rank 42 | */ 43 | public double getRank() { 44 | return rank; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/NoDocumentsInCorpusException.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace; 18 | 19 | /** 20 | * Exception raised when no documents are found in the {@link Corpus} 21 | * @author Jorge Villalon 22 | * 23 | */ 24 | public class NoDocumentsInCorpusException extends Exception { 25 | 26 | private static final long serialVersionUID = 5607315201790740186L; 27 | 28 | /** 29 | * Creates the exception with the fixed message "No documents to build the corpus" 30 | */ 31 | public NoDocumentsInCorpusException() { 32 | super("No documents to build the corpus"); 33 | } 34 | 35 | /** 36 | * Creates the exception with the same fixed message and a cause 37 | * @param e the exception recorded as the cause 38 | */ 39 | public NoDocumentsInCorpusException(Exception e) { 40 | super("No documents to build the corpus", e); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/Importer.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.storage.importers; 18 | 19 | /** 20 | * Interface for all importers. It declares which file extensions an importer 21 | * can manage and a method to obtain the plain text 22 | * version of the content. 23 | * 24 | * @author Jorge Villalon 25 | * 26 | */ 27 | public interface Importer { 28 | /** 29 | * @param content the text to clean 30 | * @return the plain text version of the content 31 | */ 32 | public String getCleanContent(String content); 33 | 34 | /** 35 | * @param fileExtension the file extension to check 36 | * @return true if the importer can manage the extension 37 | */ 38 | public boolean isValidFileExtension(String fileExtension); 39 | } 40 | -------------------------------------------------------------------------------- /tml/tml/tml.properties: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ############################################################################### 16 | 17 | # TML specifics 18 | 19 | # Default value for tml folder 20 | tml.folder=. 21 | 22 | # Log file for tml 23 | log4j.appender.ROLLING.File=./log/tml.log 24 | 25 | # Annotators that will be active by default 26 | # tml.annotators=PennTreeAnnotator 27 | tml.annotators= 28 | 29 | # MySQL configuration for metadata (the driver and URL below are MySQL, not SQLite) 30 | tml.database.driver=com.mysql.jdbc.Driver 31 | tml.database.url.protocol=jdbc:mysql: 32 | tml.database.url.db=//localhost/tml_metadata 33 | 34 | tml.database.username=tmluser 35 | tml.database.password=password 36 | 37 | # Indexer process 38 | tml.indexer.interval=8 39 | tml.indexer.run=false 40 | 41 | # Annotator process 42 | tml.annotator.interval=10 43 | tml.annotator.run=false 44 | 45 | # Cleanup process 46 | tml.cleanup.interval=600 47 | tml.cleanup.run=false 48 | 49 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/utils/Highlighting.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.utils; 17 | 18 | import java.util.regex.Pattern; 19 | 20 | public class Highlighting { 21 | 22 | public static String highlightText(String text, String[] tokens, String[] cssClass) { 23 | String output = text.toLowerCase(); 24 | 25 | for(int i=0; i" + token + ""); 30 | } 31 | 32 | return output; 33 | } 34 | 35 | public static String htmlFormat(String txt) { 36 | txt = txt.replaceAll(" ", "  "); 37 | txt = txt.replaceAll("\r", ""); 38 | txt = txt.replaceAll("\n", "

"); 39 | return "

" + txt + "

"; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/ParagraphCoherenceIndexResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.results; 17 | 18 | /** Result that pairs a paragraph id with a numeric index value; both are plain fields exposed through the accessors below. */ public class ParagraphCoherenceIndexResult extends AbstractResult { 19 | 20 | String paragraphId; 21 | double index; 22 | /** 23 | * @return the paragraphId 24 | */ 25 | public String getParagraphId() { 26 | return paragraphId; 27 | } 28 | /** 29 | * @param paragraphId the paragraphId to set 30 | */ 31 | public void setParagraphId(String paragraphId) { 32 | this.paragraphId = paragraphId; 33 | } 34 | /** 35 | * @return the index 36 | */ 37 | public double getIndex() { 38 | return index; 39 | } 40 | /** 41 | * @param index the index to set 42 | */ 43 | public void setIndex(double index) { 44 | this.index = index; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/tml.properties: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ############################################################################### 16 | 17 | # TML specifics 18 | 19 | # Default value for tml folder 20 | tml.folder=/tml 21 | 22 | # Log file for tml 23 | log4j.appender.ROLLING.File=/tml/log/tml.log 24 | 25 | # Annotators that will be active by default 26 | # tml.annotators=PennTreeAnnotator 27 | tml.annotators= 28 | 29 | # MySQL configuration for metadata (the driver and URL below are MySQL, not SQLite) 30 | tml.database.driver=com.mysql.jdbc.Driver 31 | tml.database.url.protocol=jdbc:mysql: 32 | tml.database.url.db=//localhost/tml 33 | 34 | tml.database.username=tml 35 | tml.database.password=itsyourfault 36 | 37 | # Indexer process 38 | tml.indexer.interval=8 39 | tml.indexer.run=false 40 | 41 | # Annotator process 42 | tml.annotator.interval=10 43 | tml.annotator.run=false 44 | 45 | # Cleanup process 46 | tml.cleanup.interval=600 47 | tml.cleanup.run=false 48 | 49 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | Implements a Vector Space Model, that can be later transformed using Latent Semantic Analysis. 27 | 28 |

Package Specification

29 | 30 |

This package implements the transformation of a Corpus into a VSM; it also supports using LSA to obtain a Semantic Space.

31 |

The package is closely integrated with Weka, providing Data Mining functionalities in case a developer wants operations that are not implemented in TML.

32 |

Patterns that can be obtained from a VSM or semantic space are implemented via operations, which can be found in the operations subpackage.

33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/RepositoryEvent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this template, choose Tools | Templates 3 | * and open the template in the editor. 4 | */ 5 | 6 | package tml.storage; 7 | 8 | import java.util.EventObject; 9 | 10 | /** 11 | * This class represents an event that was fired by a Repository 12 | * and indicates the current step of the running process and the 13 | * maximum number of steps. 14 | * It also includes a descriptive name of the operation being 15 | * executed. 16 | * 17 | * @author Jorge Villalon 18 | */ 19 | public class RepositoryEvent extends EventObject { 20 | 21 | /** 22 | * 23 | */ 24 | private static final long serialVersionUID = 4688981006009818932L; 25 | /* The three initial values below are always overwritten by the only constructor. */ private String action = null; 26 | private int current = 0; 27 | private int maximum = 100; 28 | 29 | /** @return the action name stored in this event */ public String getAction() { 30 | return action; 31 | } 32 | 33 | /** @param action the action name to store */ public void setAction(String action) { 34 | this.action = action; 35 | } 36 | 37 | /** @return the current step */ public int getCurrent() { 38 | return current; 39 | } 40 | 41 | /** @param current the current step to store */ public void setCurrent(int current) { 42 | this.current = current; 43 | } 44 | 45 | /** @return the maximum number of steps */ public int getMaximum() { 46 | return maximum; 47 | } 48 | 49 | /** @param maximum the maximum number of steps to store */ public void setMaximum(int maximum) { 50 | this.maximum = maximum; 51 | } 52 | 53 | /** Creates an event with every value set. @param source the object handed to the EventObject constructor @param action the action name @param curr the current step @param max the maximum number of steps */ public RepositoryEvent(Object source, String action, int curr, int max) { 54 | super(source); 55 | this.action = action; 56 | this.current = curr; 57 | this.maximum = max; 58 | } 59 | 60 | /** @return the text "Action:" followed by the action, a space, the current step, " of " and the maximum */ @Override 61 | public String toString() { 62 | return "Action:" + this.action + " " + this.current + " of " + this.maximum; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 |

TML (Text Mining Library) is a general Text Mining library with a focus on Latent Semantic Analysis (LSA). 27 | It allows creating semantic spaces (see Deerwester, 1998) from a corpus of documents with detailed parameters. 28 | These spaces can then be used as background knowledge to calculate distances between documents (or passages) and terms of a different corpus. 29 | Typical operations are the similarity between each document in a corpus, or the distances between consecutive sentences in a document.

30 |

Please visit the website at http://kiama.ee.usyd.edu.au/tml/ to download TML and to find a quick start guide and tutorials.

31 | 32 | 33 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/factorisation/MatrixFactorisation.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace.factorisation; 17 | 18 | import Jama.Matrix; 19 | 20 | /** Abstract base for matrix factorisations: holds the factor matrices and settings as protected fields and leaves process(Matrix) to subclasses. */ public abstract class MatrixFactorisation { 21 | 22 | /** Terms matrix in the semantic space */ 23 | protected Matrix Uk = null; 24 | /** Singular values in the semantic space */ 25 | protected Matrix Sk = null; 26 | /** Documents matrix in the semantic space */ 27 | protected Matrix Vk = null; 28 | /** The number of dimensions that were kept */ 29 | protected int dimensionsKept = -1; 30 | 31 | /* Returned by getDecomposition(); never assigned inside this class. */ protected SpaceDecomposition decomposition; 32 | /* NOTE(review): presumably the number of dimensions requested; this class only stores and returns it. Confirm against subclasses. */ protected int K; 33 | 34 | /** @return the stored K value */ public int getK() { 35 | return K; 36 | } 37 | 38 | /** @param K the K value to store */ public void setK(int K) { 39 | this.K = K; 40 | } 41 | 42 | /** Runs the factorisation; implemented by subclasses. @param v the matrix to process */ public abstract void process(Matrix v); 43 | 44 | /** @return the decomposition field as currently set */ public SpaceDecomposition getDecomposition() { 45 | return this.decomposition; 46 | } 47 | } 48 | --------------------------------------------------------------------------------
/tml/src/test/java/tml/test/StemmingTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | /** 17 | * 18 | */ 19 | package tml.test; 20 | 21 | import org.junit.Test; 22 | 23 | import tml.utils.LuceneUtils; 24 | 25 | import static org.junit.Assert.*; 26 | 27 | 28 | 29 | /** 30 | * This class test that the stemming algorithm is working appropriately. 
31 | * 32 | * @author Jorge Villalon 33 | * 34 | */ 35 | public class StemmingTest { 36 | 37 | @Test 38 | public void testStemming() { 39 | String[] words = {"increase","increasing","increased","increases","dog","dogs"}; 40 | String[] stemmedWords = {"increas","increas","increas","increas","dog","dog"}; 41 | 42 | for(int i=0; i list, String word) { 43 | for(String w : list) { 44 | if(w.equals(word) || stringContained(w, word) || stringContained(word, w)) 45 | return true; 46 | } 47 | return false; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/DocumentAnnotator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.storage; 5 | 6 | import java.io.IOException; 7 | 8 | import org.apache.log4j.Logger; 9 | 10 | import tml.annotators.Annotator; 11 | 12 | /** 13 | * Runnable that annotates every document returned by the repository's getUnannotatedDocument() call. 14 | * @author jorge 15 | */ 16 | public class DocumentAnnotator implements Runnable { 17 | 18 | private static Logger logger = Logger.getLogger(DocumentAnnotator.class); 19 | private Repository repository; 20 | 21 | /** @param repo the repository whose documents are read and whose database connection receives the annotations */ public DocumentAnnotator(Repository repo) { 22 | this.repository = repo; 23 | } 24 | 25 | /* (non-Javadoc) 26 | * Reads each unannotated document's content and stores one annotation per configured annotator. 27 | * @see java.lang.Runnable#run() 28 | */ @Override 29 | public void run() { 30 | 31 | int total = 0; 32 | String[][] docs = this.repository.getDbConnection().getUnannotatedDocument(); 33 | 34 | if(docs == null) { 35 | logger.debug("No documents to annotate"); 36 | return; 37 | } 38 | 39 | for(String[] doc : docs) { 40 | /* each row carries the external id at index 0 and the type at index 1 */ String externalid = doc[0]; 41 | String type = doc[1]; 42 | String content = null; 43 | try { 44 | content = this.repository.getDocumentField(externalid, this.repository.getLuceneContentField()); 45 | } catch (IOException e) { 46 | e.printStackTrace(); 47 | logger.error("No content found in Lucene index for document " + externalid); 48 | /* NOTE(review): returning here ends the whole run, so the remaining documents are not annotated and the final count is never logged */ return; 49 | } 50 | for (Annotator annotator : this.repository.getAnnotators()) 
{ 51 | String metadata = null; 52 | if (annotator.getTypes().contains(type)) { 53 | metadata = annotator.getAnnotations(content); 54 | } else { 55 | /* annotators that do not list this type still get a row, holding this placeholder text */ metadata = "Not available"; 56 | } 57 | this.repository.getDbConnection().setAnnotation(externalid, annotator.getFieldName(), metadata); 58 | } 59 | total++; 60 | 61 | } 62 | if(total > 0) 63 | logger.info("Annotated " + total + " documents"); 64 | else 65 | logger.debug("Nothing to annotate"); 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/summarization/VectorLengthSummarization.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.summarization; 17 | 18 | import Jama.Matrix; 19 | 20 | /** Summarization operation that scores passages and terms by the Euclidean length of their vector in the corpus term-document matrix (rows are indexed by term, columns by document). */ public class VectorLengthSummarization extends AbstractSummarizationOperation { 21 | 22 | /** Sets the operation name to "VectLength". */ public VectorLengthSummarization() { 23 | this.name = "VectLength"; 24 | } 25 | 26 | /** @param doc column index of the passage @return square root of the sum of squared entries of column doc, taken over every row */ @Override 27 | protected double calculatePassageLoading(int doc) { 28 | Matrix termDoc = this.corpus.getTermDocMatrix(); 29 | double total = 0; 30 | for(int term = 0; term < termDoc.getRowDimension(); term++) { 31 | total += Math.pow(termDoc.get(term, doc),2); 32 | } 33 | return Math.sqrt(total); 34 | } 35 | 36 | /** @param term row index of the term @return square root of the sum of squared entries of row term, taken over every column */ @Override 37 | protected double calculateTermLoading(int term) { 38 | Matrix termDoc = this.corpus.getTermDocMatrix(); 39 | double total = 0; 40 | /* doc is used as the column index, so the bound is the column count; the row count skipped documents or ran out of bounds when the matrix was not square */ for(int doc = 0; doc < termDoc.getColumnDimension(); doc++) { 41 | total += Math.pow(termDoc.get(term, doc),2); 42 | } 43 | return Math.sqrt(total); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/RapidAutomaticKeywordExtractionResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2010 Stephen O'Rourke (stephen.orourke@sydney.edu.au) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.results; 17 | 18 | /** 19 | * This class represents the result of a {@link RapidAutomaticKeywordExtraction} 20 | * operation: a keyword and its weighting. Results order by weighting. 21 | * 22 | * @author Stephen O'Rourke 23 | * 24 | */ 25 | public class RapidAutomaticKeywordExtractionResult extends AbstractResult implements Comparable<RapidAutomaticKeywordExtractionResult> { 26 | private String keyword; 27 | private Double weighting; 28 | 29 | /** @return the keyword */ public String getKeyword() { 30 | return keyword; 31 | } 32 | 33 | /** @return the weighting, or null when it was never set */ public Double getWeighting() { 34 | return weighting; 35 | } 36 | 37 | /** @param keyword the keyword to set */ public void setKeyword(String keyword) { 38 | this.keyword = keyword; 39 | } 40 | 41 | /** @param weighting the weighting to set */ public void setWeighting(Double weighting) { 42 | this.weighting = weighting; 43 | } 44 | 45 | /** Compares by weighting in ascending order. The type argument on Comparable is required for this method to override compareTo. @param result the result to compare with @return the value of Double.compareTo on the two weightings; a null weighting on either side raises NullPointerException */ @Override 46 | public int compareTo(RapidAutomaticKeywordExtractionResult result) { 47 | return this.weighting.compareTo(result.weighting); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/TagCloudsResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * Result that pairs a term with its weight. 21 | * @author Jorge Villalon 22 | */ 23 | public class TagCloudsResult extends AbstractResult { 24 | 25 | String term; 26 | double weight; 27 | 28 | /** 29 | * @param term the term to store 30 | * @param weight the weight to store for the term 31 | */ 32 | public TagCloudsResult(String term, double weight) { 33 | super(); 34 | this.term = term; 35 | this.weight = weight; 36 | } 37 | 38 | /** 39 | * @param weight the weight to set 40 | */ 41 | public void setWeight(double weight) { 42 | this.weight = weight; 43 | } 44 | 45 | /** 46 | * @return the weight 47 | */ 48 | public double getWeight() { 49 | return weight; 50 | } 51 | 52 | /** 53 | * @param term the term to set 54 | */ 55 | public void setTerm(String term) { 56 | this.term = term; 57 | } 58 | 59 | /** 60 | * @return the term 61 | */ 62 | public String getTerm() { 63 | return term; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/visualizations/TagClouds.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.vectorspace.operations.visualizations; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Collections; 8 | import java.util.Comparator; 9 | import java.util.List; 10 | 11 | import tml.vectorspace.operations.results.TagCloudsResult; 12 | 13 | /** 14 | * @author Jorge 15 | * 16 | */ 17 | public class TagClouds extends AbstractVisualization { 18 | 19 | private int maxSizePixels = 24; 20 | private int maxResults = 50; 21 | 22 | public int getMaxSizePixels() { 23 | return maxSizePixels; 24 | } 25 | 26 | public void setMaxSizePixels(int maxSizePixels) { 27 | this.maxSizePixels = maxSizePixels; 28 | } 29 | 30 | @SuppressWarnings("unchecked") 31 | @Override 32 | public String getHTML() { 33 | List newResults = new ArrayList(); 34 | int
i=0; 35 | for(TagCloudsResult result : (List) operation.getResults()) { 36 | newResults.add(result); 37 | i++; 38 | if(i>maxResults) 39 | break; 40 | } 41 | Collections.sort(newResults,new Comparator() { 42 | @Override 43 | public int compare(TagCloudsResult o1, TagCloudsResult o2) { 44 | return o1.getTerm().compareTo(o2.getTerm()); 45 | } 46 | }); 47 | StringBuffer buffer = new StringBuffer(); 48 | buffer.append("
"); 49 | for(TagCloudsResult result : newResults) { 50 | buffer.append(""); 51 | buffer.append(result.getTerm()); 52 | buffer.append(" "); 53 | } 54 | buffer.append("
"); 55 | return buffer.toString(); 56 | } 57 | 58 | private int calculateSize(double weight) { 59 | double size = (double) maxSizePixels; 60 | size = size * weight; 61 | return (int) size; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/SummaryResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * Result made of three strings: an item, its value and a comment. 21 | * @author Jorge Villalon 22 | */ 23 | public class SummaryResult extends AbstractResult { 24 | 25 | String item; 26 | String value; 27 | String comment; 28 | 29 | /** 30 | * @return the comment 31 | */ 32 | public String getComment() { 33 | return comment; 34 | } 35 | /** 36 | * @return the item 37 | */ 38 | public String getItem() { 39 | return item; 40 | } 41 | /** 42 | * @return the value 43 | */ 44 | public String getValue() { 45 | return value; 46 | } 47 | /** 48 | * @param comment the comment to set 49 | */ 50 | public void setComment(String comment) { 51 | this.comment = comment; 52 | } 53 | /** 54 | * @param item the item to set 55 | */ 56 | public void setItem(String item) { 57 | this.item = item; 58 | } 59 | /** 60 | * @param value the value to set 61 | */ 62 | public void setValue(String value) { 63 | this.value = value; 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/utils/JDBCUtils.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.utils; 17 | 18 | import java.sql.Connection; 19 | import java.sql.DriverManager; 20 | import java.sql.ResultSet; 21 | import java.sql.SQLException; 22 | import java.sql.Statement; 23 | 24 | /** Thin JDBC helper that opens one Connection and one Statement in its constructor and reuses that Statement for every call. NOTE(review): no close method is defined here, so this class never closes the Connection or the Statement. */ public class JDBCUtils { 25 | 26 | private Connection m_conn; 27 | private Statement m_stmt; 28 | 29 | /** Loads the driver class, opens the connection and creates the shared statement. @param driver fully qualified JDBC driver class name @param url JDBC connection URL @param username database user @param password database password @throws Exception any failure from Class.forName, getConnection or createStatement; the catch block only rethrows it */ public JDBCUtils(String driver, String url, String username, String password) throws Exception 30 | { 31 | try { 32 | Class.forName(driver); 33 | m_conn = DriverManager.getConnection(url, username, password); 34 | m_stmt = m_conn.createStatement(); 35 | } catch (Exception e) { 36 | throw e; 37 | } 38 | } 39 | /** Runs a query on the shared statement. @param sql the SQL text, executed exactly as given @return the ResultSet, or null when an SQLException occurs (its stack trace is printed) */ public ResultSet sendQuery(String sql) { 40 | try { 41 | ResultSet m_rs = m_stmt.executeQuery(sql); 42 | return m_rs; 43 | } catch (SQLException e) { 44 | e.printStackTrace(); 45 | return null; 46 | } 47 | } 48 | /** Runs an update on the shared statement. @param sql the SQL text, executed exactly as given @return the value returned by executeUpdate, or -1 when an SQLException occurs (its stack trace is printed) */ public int sendUpdate(String sql) { 49 | try { 50 | return m_stmt.executeUpdate(sql); 51 | } catch (SQLException e) { 52 | e.printStackTrace(); 53 | return -1; 54 | } 55 | } 56 | 57 | 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/PassageSimilarityResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.results; 17 | 18 | /** Result holding two document strings (documentA and documentB) together with a similarity value. */ public class PassageSimilarityResult extends AbstractResult { 19 | 20 | String documentA; 21 | String documentB; 22 | double similarity; 23 | 24 | /** 25 | * @return the documentA 26 | */ 27 | public String getDocumentA() { 28 | return documentA; 29 | } 30 | /** 31 | * @return the documentB 32 | */ 33 | public String getDocumentB() { 34 | return documentB; 35 | } 36 | /** 37 | * @return the similarity 38 | */ 39 | public double getSimilarity() { 40 | return similarity; 41 | } 42 | /** 43 | * @param documentA the documentA to set 44 | */ 45 | public void setDocumentA(String documentA) { 46 | this.documentA = documentA; 47 | } 48 | /** 49 | * @param documentB the documentB to set 50 | */ 51 | public void setDocumentB(String documentB) { 52 | this.documentB = documentB; 53 | } 54 | /** 55 | * @param similarity the similarity to set 56 | */ 57 | public void setSimilarity(double similarity) { 58 | this.similarity = similarity; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/corpus/SentenceCorpus.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you 
may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.corpus; 17 | 18 | import java.io.IOException; 19 | 20 | import tml.storage.Repository; 21 | import tml.vectorspace.NoDocumentsInCorpusException; 22 | import tml.vectorspace.NotEnoughTermsInCorpusException; 23 | import tml.vectorspace.TermWeightingException; 24 | 25 | 26 | /** 27 | * Class representing a corpus formed with the sentences of a document 28 | * @author Jorge Villalon 29 | * 30 | */ 31 | public class SentenceCorpus extends Corpus { 32 | 33 | /** 34 | * @param document the document to which the sentences belong 35 | * @throws Exception if the document is null 36 | */ 37 | public SentenceCorpus(TextDocument document) throws Exception { 38 | if(document == null) 39 | throw new Exception("A sentence corpus must belong to a document"); 40 | 41 | this.luceneQuery = "type:sentence AND reference:p*d" + document.getExternalId(); 42 | } 43 | 44 | @Override 45 | public void load(Repository storage) 46 | throws NotEnoughTermsInCorpusException, IOException, 47 | NoDocumentsInCorpusException, TermWeightingException { 48 | super.load(storage); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /tml/src/main/java/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | TML (Text Mining Library) is a general purpose Text Mining library which purpose is to 
support the development of educational applications by providing text mining (TM) functionality (for a quick start, please begin with the Repository class). 27 | 28 |

TML design principles

29 | 30 |

The Storage and Corpus packages implement storing documents in a repository and searching the repository to form a corpus. The Vectorspace package implements the transformation of a corpus into a VSM representation, and the use of this model with data mining algorithms. The Utils package contains routines that are required for specific processes but are not directly related to the TM process (e.g. grammar parsing and matrix operations). Finally, a Configuration class allows TML to process Java properties files to read default parameters for its operation.

31 |

Please read each package's documentation for examples.

32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/utils/LuceneUtils.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.utils; 17 | 18 | import java.io.IOException; 19 | import java.io.StringReader; 20 | 21 | import org.apache.log4j.Logger; 22 | import org.apache.lucene.analysis.Token; 23 | import org.apache.lucene.analysis.TokenStream; 24 | import org.apache.lucene.analysis.snowball.SnowballFilter; 25 | import org.apache.lucene.analysis.standard.StandardTokenizer; 26 | import org.apache.lucene.util.Version; 27 | 28 | public class LuceneUtils { 29 | 30 | private static Logger logger = Logger.getLogger(LuceneUtils.class); 31 | 32 | @SuppressWarnings("deprecation") 33 | public static String stemWords(String words) { 34 | TokenStream stream = new StandardTokenizer(Version.LUCENE_29, new StringReader(words)); 35 | SnowballFilter filter = new SnowballFilter(stream, "English"); 36 | Token token = new Token(); 37 | StringBuffer stemmed = new StringBuffer(); 38 | try { 39 | while((token = filter.next(token)) != null) { 40 | 
stemmed.append(token.term()); 41 | stemmed.append(" "); 42 | } 43 | } catch (IOException e) { 44 | e.printStackTrace(); 45 | logger.error(e); 46 | } 47 | return stemmed.toString().trim(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/HtmlImporter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.storage.importers; 18 | 19 | import org.apache.log4j.Logger; 20 | import org.htmlparser.Parser; 21 | import org.htmlparser.beans.StringBean; 22 | import org.htmlparser.util.ParserException; 23 | 24 | /** 25 | * This importer uses org.htmlpraser to obtain plain text from an HTML file. 
26 | * 27 | * @author Jorge Villalon 28 | * 29 | */ 30 | public class HtmlImporter extends AbstractImporter implements Importer { 31 | 32 | private static Logger logger = Logger.getLogger(HtmlImporter.class); 33 | 34 | @Override 35 | public String getCleanContent(String content) { 36 | 37 | String clean = null; 38 | try { 39 | Parser parser = new Parser(); 40 | parser.setInputHTML(content); 41 | StringBean bean = new StringBean(); 42 | parser.visitAllNodesWith(bean); 43 | clean = bean.getStrings(); 44 | } catch (ParserException e) { 45 | logger.error(e); 46 | } 47 | return clean; 48 | } 49 | 50 | @Override 51 | protected String[] getFileExtensions() { 52 | String[] extensions = new String[3]; 53 | extensions[0] = "xhtml"; 54 | extensions[1] = "html"; 55 | extensions[2] = "htm"; 56 | return extensions; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/NonNegativeMatrixFactorizationTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.test; 17 | 18 | import org.junit.BeforeClass; 19 | import org.junit.Test; 20 | 21 | import tml.Configuration; 22 | import tml.corpus.TextDocument; 23 | import tml.vectorspace.factorisation.NonnegativeMatrixFactorisationED; 24 | 25 | 26 | import Jama.Matrix; 27 | 28 | public class NonNegativeMatrixFactorizationTest extends AbstractTmlIndexingTest { 29 | 30 | private static TextDocument document; 31 | 32 | @BeforeClass 33 | public static void setUpBeforeClass() throws Exception { 34 | AbstractTmlIndexingTest.setUpBeforeClass(); 35 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/uppsala"); 36 | 37 | document = repository.getTextDocument("0100.a1"); 38 | document.load(repository); 39 | } 40 | 41 | @Test 42 | public void testMatrices() { 43 | Matrix m = document.getSentenceCorpus().getTermDocMatrix(); 44 | m.print(10, 5); 45 | 46 | NonnegativeMatrixFactorisationED f = new NonnegativeMatrixFactorisationED(); 47 | f.setK(5); 48 | f.process(m); 49 | 50 | new Matrix(f.getDecomposition().getUkdata()).print(10, 5); 51 | new Matrix(f.getDecomposition().getSkdata()).print(10, 5); 52 | new Matrix(f.getDecomposition().getVkdata()).print(10, 5); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/test/AbstractTmlIndexingTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | /** 17 | * 18 | */ 19 | package tml.test; 20 | 21 | 22 | import java.io.File; 23 | import java.util.Properties; 24 | 25 | import org.apache.log4j.Logger; 26 | import org.junit.BeforeClass; 27 | 28 | import tml.Configuration; 29 | import tml.storage.Repository; 30 | 31 | 32 | /** 33 | * This class implements a base class for all tests that require indexing all the documents 34 | * within a specific folder. 35 | * 36 | * @author Jorge Villalon 37 | * 38 | */ 39 | public abstract class AbstractTmlIndexingTest { 40 | 41 | protected static Logger logger = Logger.getLogger(AbstractTmlIndexingTest.class); 42 | 43 | protected static Repository repository; 44 | protected static String repositoryFolder = null; 45 | protected static String documentsFolder; 46 | protected static File[] filesToAdd = null; 47 | protected static Properties prop; 48 | 49 | /** 50 | * @throws java.lang.Exception 51 | */ 52 | @BeforeClass 53 | public static void setUpBeforeClass() throws Exception { 54 | prop = Configuration.getTmlProperties(true); 55 | Repository.cleanStorage(Configuration.getTmlFolder() + "/test/lucene"); 56 | repository = new Repository(Configuration.getTmlFolder() + "/test/lucene"); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/LastPassage.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | 22 | import tml.vectorspace.operations.results.LastPassageResult; 23 | 24 | 25 | /** 26 | * Extracts the last passage of the corpus, given the linearity expected in the 27 | * index. 
28 | * 29 | * @author Jorge Villalon 30 | * 31 | */ 32 | public class LastPassage extends AbstractOperation { 33 | 34 | /** 35 | * 36 | */ 37 | public LastPassage() { 38 | this.name = "Last passage"; 39 | this.requiresSemanticSpace = false; 40 | } 41 | 42 | @Override 43 | public void start() throws Exception { 44 | super.start(); 45 | this.results = new ArrayList(); 46 | try { 47 | String externalId = this.corpus.getPassages()[this.corpus.getPassages().length-1]; 48 | String content = this.repository.getDocumentField(externalId, this.repository.getLuceneContentField()); 49 | LastPassageResult result = new LastPassageResult(); 50 | result.setPassage(content); 51 | this.results.add(result); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | logger.error(e); 55 | } 56 | super.end(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/PassageDistancesResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * Results that represent the distances between two passages. 21 | * 22 | * @author Jorge Villalon 23 | * 24 | */ 25 | public class PassageDistancesResult extends AbstractResult { 26 | int documentAId; 27 | int documentBId; 28 | double distance; 29 | /** 30 | * @return the documentAId 31 | */ 32 | public int getDocumentAId() { 33 | return documentAId; 34 | } 35 | /** 36 | * @param documentAId the documentAId to set 37 | */ 38 | public void setDocumentAId(int documentAId) { 39 | this.documentAId = documentAId; 40 | } 41 | /** 42 | * @return the documentBId 43 | */ 44 | public int getDocumentBId() { 45 | return documentBId; 46 | } 47 | /** 48 | * @param documentBId the documentBId to set 49 | */ 50 | public void setDocumentBId(int documentBId) { 51 | this.documentBId = documentBId; 52 | } 53 | /** 54 | * @return the distance 55 | */ 56 | public double getDistance() { 57 | return distance; 58 | } 59 | /** 60 | * @param distance the distance to set 61 | */ 62 | public void setDistance(double distance) { 63 | this.distance = distance; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tml/src/lanczos/unix/makefile: -------------------------------------------------------------------------------- 1 | ####################################################################### 2 | # SVDPACKC (Ver 1.0) Makefile # 3 | ####################################################################### 4 | # # 5 | # las1: Single-Vector Lanczos SVD via 2-Cyclic Eigensystems # 6 | # las2: Single-Vector Lanczos SVD via A'A Eigensystems # 7 | # bls1: Block Lanczos SVD via 2-Cyclic Eigensystems # 8 | # bls2: Block Lanczos SVD via A'A Eigensystems # 9 | # sis1: Subspace Iteration SVD via 2_Cyclic Eigensystems # 10 | # sis2: Subspace Iteration SVD via A'A Eigensystems # 11 | # tms1: Trace 
# Toolchain and flags shared by every SVDPACKC driver.
CC = gcc
CFLAGS= -O -c
LIB= -lm
TIMER= timersun.o

# "all" and "clean" name actions, not files.
.PHONY: all clean

all: las1 las2 bls1 bls2 sis1 sis2 tms1 tms2

# Header dependencies of the individual objects.
las1.o: las1.h
las2.o: las2.h

bls1.o: bls1.h
bls2.o: bls2.h

sis1.o: sisg.h sisc.h
sis2.o: sisg.h sisc.h

tms1.o: tmsg.h tmsc.h
tms2.o: tmsg.h tmsc.h

# Suffix rule compiling a .c file into its .o file. A suffix rule must not
# list prerequisites, otherwise make does not treat it as a suffix rule.
.c.o:
	${CC} ${CFLAGS} $*.c

# Each driver links its own object with the timer object.
las1: las1.o ${TIMER}
	${CC} -o $@ las1.o ${TIMER} ${LIB}

las2: las2.o ${TIMER}
	${CC} -o $@ las2.o ${TIMER} ${LIB}

bls1: bls1.o ${TIMER}
	${CC} -o $@ bls1.o ${TIMER} ${LIB}

bls2: bls2.o ${TIMER}
	${CC} -o $@ bls2.o ${TIMER} ${LIB}

sis1: sis1.o ${TIMER}
	${CC} -o $@ sis1.o ${TIMER} ${LIB}

sis2: sis2.o ${TIMER}
	${CC} -o $@ sis2.o ${TIMER} ${LIB}

tms1: tms1.o ${TIMER}
	${CC} -o $@ tms1.o ${TIMER} ${LIB}

tms2: tms2.o ${TIMER}
	${CC} -o $@ tms2.o ${TIMER} ${LIB}

# The removal command must be a recipe line (tab-indented, below the target);
# written on the target line, "rm" and the files become prerequisites.
# -f keeps "make clean" from failing when some files were never built.
clean:
	rm -f \
	las1.o las2.o bls1.o bls2.o sis1.o sis2.o tms1.o tms2.o \
	timersun.o timermac.o las1 las2 bls1 bls2 sis1 sis2 tms1 tms2
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * The result of a {@link LexiconAnalysis} operation. 21 | * @author Jorge Villalon 22 | * 23 | */ 24 | public class LexiconAnalysisResult extends AbstractResult { 25 | String document; 26 | int terms; 27 | int newTerms; 28 | 29 | /** 30 | * @return the document 31 | */ 32 | public String getDocument() { 33 | return document; 34 | } 35 | 36 | /** 37 | * @param document the document to set 38 | */ 39 | public void setDocument(String document) { 40 | this.document = document; 41 | } 42 | 43 | /** 44 | * @return the terms 45 | */ 46 | public int getTerms() { 47 | return terms; 48 | } 49 | 50 | /** 51 | * @param terms the terms to set 52 | */ 53 | public void setTerms(int terms) { 54 | this.terms = terms; 55 | } 56 | 57 | /** 58 | * @return the newTerms 59 | */ 60 | public int getNewTerms() { 61 | return newTerms; 62 | } 63 | 64 | /** 65 | * @param newTerms the newTerms to set 66 | */ 67 | public void setNewTerms(int newTerms) { 68 | this.newTerms = newTerms; 69 | } 70 | 71 | @Override 72 | public String toString() { 73 | return "Document: " + this.getDocument() + " Terms: " + this.terms 74 | + " Accumulated: " + this.newTerms; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/LanczosTest.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.test; 17 | 18 | import org.junit.BeforeClass; 19 | import org.junit.Test; 20 | 21 | import tml.Configuration; 22 | import tml.corpus.SearchResultsCorpus; 23 | import tml.corpus.CorpusParameters.DimensionalityReduction; 24 | import tml.corpus.CorpusParameters.TermSelection; 25 | import tml.vectorspace.TermWeighting.GlobalWeight; 26 | import tml.vectorspace.TermWeighting.LocalWeight; 27 | 28 | 29 | 30 | public class LanczosTest extends AbstractTmlIndexingTest { 31 | 32 | @BeforeClass 33 | public static void setUpBeforeClass() throws Exception { 34 | AbstractTmlIndexingTest.setUpBeforeClass(); 35 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/introLSA"); 36 | } 37 | 38 | @Test 39 | public void timeBigCorpus() throws Exception { 40 | SearchResultsCorpus corpus = new SearchResultsCorpus("type:document"); 41 | corpus.getParameters().setTermSelectionCriterion(TermSelection.DF); 42 | corpus.getParameters().setTermSelectionThreshold(2); 43 | corpus.getParameters().setDimensionalityReduction(DimensionalityReduction.NUM); 44 | corpus.getParameters().setDimensionalityReductionThreshold(2); 45 | 
corpus.getParameters().setTermWeightGlobal(GlobalWeight.None); 46 | corpus.getParameters().setTermWeightLocal(LocalWeight.TF); 47 | corpus.load(repository); 48 | corpus.getParameters().setLanczosSVD(true); 49 | corpus.getSemanticSpace().calculate(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/Summary.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Plain holder for four parallel summary arrays: the ranks and loads of
 * passages and of terms. The arrays are stored and returned by reference
 * (no copies are made).
 */
public class Summary {

    int[] passagesRank;
    int[] termsRank;
    double[] passagesLoads;
    double[] termsLoads;

    /**
     * @return the passagesRank
     */
    public int[] getPassagesRank() {
        return this.passagesRank;
    }

    /**
     * @return the termsRank
     */
    public int[] getTermsRank() {
        return this.termsRank;
    }

    /**
     * @return the passagesLoads
     */
    public double[] getPassagesLoads() {
        return this.passagesLoads;
    }

    /**
     * @return the termsLoads
     */
    public double[] getTermsLoads() {
        return this.termsLoads;
    }

    /**
     * @param passagesRank the passagesRank to set
     */
    public void setPassagesRank(int[] passagesRank) {
        this.passagesRank = passagesRank;
    }

    /**
     * @param termsRank the termsRank to set
     */
    public void setTermsRank(int[] termsRank) {
        this.termsRank = termsRank;
    }

    /**
     * @param passagesLoads the passagesLoads to set
     */
    public void setPassagesLoads(double[] passagesLoads) {
        this.passagesLoads = passagesLoads;
    }

    /**
     * @param termsLoads the termsLoads to set
     */
    public void setTermsLoads(double[] termsLoads) {
        this.termsLoads = termsLoads;
    }
}
-------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import tml.vectorspace.operations.results.TermRankedResult; 23 | import tml.vectorspace.operations.results.TermsExtractionSummarizationResult; 24 | 25 | 26 | /** 27 | * Concept Extraction operation based on CMM. 
28 | * @author Jorge Villalon 29 | * 30 | */ 31 | public class ConceptExtraction extends TermExtractionSummarization { 32 | 33 | List newResults = 34 | new ArrayList(); 35 | 36 | public ConceptExtraction() { 37 | this.name = "Concept extraction"; 38 | } 39 | 40 | @Override 41 | public void start() throws Exception { 42 | this.maxResults = 35; 43 | super.start(); 44 | 45 | logger.info("Originally " + this.results.size() + " results"); 46 | 47 | CompoundNounsSummarized op = new CompoundNounsSummarized(); 48 | op.setCorpus(corpus); 49 | op.start(); 50 | 51 | // TODO: Iterate through compound nouns 52 | for (TermRankedResult result : op.getResults()) { 53 | String noun = result.getTerm(); 54 | if (noun.trim().length() == 0) 55 | continue; 56 | TermsExtractionSummarizationResult newResult = new TermsExtractionSummarizationResult(); 57 | newResult.setTerm(noun); 58 | newResults.add(newResult); 59 | } 60 | 61 | this.results.clear(); 62 | this.results.addAll(newResults); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/SimpleCorpusTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (C) 2001, 2007 University of Sydney 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 2 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 17 | * USA 18 | * 19 | * http://www.gnu.org/licenses/gpl.txt 20 | *******************************************************************************/ 21 | 22 | package tml.test; 23 | 24 | import org.junit.Test; 25 | 26 | import tml.Configuration; 27 | import tml.corpus.SimpleCorpus; 28 | import static org.junit.Assert.*; 29 | 30 | 31 | /** 32 | * This test creates a simple corpus that loads a set of documents and then it 33 | * can be used directly to create a {@link SemanticSpace}. 34 | * 35 | * @author Jorge Villalon 36 | * @see SimpleCorpus 37 | */ 38 | public class SimpleCorpusTest extends AbstractTmlIndexingTest { 39 | 40 | /** 41 | * @throws Exception 42 | */ 43 | @Test 44 | public void CreateSimpleCorpus() throws Exception { 45 | SimpleCorpus corpus = new SimpleCorpus(Configuration.getTmlFolder() + "/corpora/introLSA", prop.getProperty("tml.lucene.indexpath")); 46 | 47 | for (String term : corpus.getTerms()) 48 | System.out.print(term + " "); 49 | System.out.println(); 50 | for (String doc : corpus.getDocuments()) 51 | System.out.print(doc + " "); 52 | System.out.println(); 53 | double[][] m = corpus.getMatrix(); 54 | for (int i = 0; i < corpus.getTerms().length; i++) { 55 | for (int j = 0; j < corpus.getDocuments().length; j++) { 56 | System.out.print(m[i][j] + " "); 57 | } 58 | System.out.println(); 59 | } 60 | System.out.println(); 61 | assertNotNull(corpus); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/PassageClusteringLingoResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 
/**
 * Result holding a cluster number, the phrase attached to that cluster and
 * the list of documents stored with it.
 *
 * @author Jorge Villalon
 *
 */
public class PassageClusteringLingoResult extends AbstractResult {

    // Cluster number.
    int cluster;
    // Phrase attached to the cluster; also used by toString().
    String clusterPhrase;
    // Documents of the cluster; created lazily by getDocuments().
    // NOTE(review): the element type is not visible in this view of the file - confirm.
    List documents;

    /**
     * @return the cluster
     */
    public int getCluster() {
        return cluster;
    }

    /**
     * @param cluster the cluster to set
     */
    public void setCluster(int cluster) {
        this.cluster = cluster;
    }

    /**
     * @return the clusterPhrase
     */
    public String getClusterPhrase() {
        return clusterPhrase;
    }

    /**
     * @param clusterPhrase the clusterPhrase to set
     */
    public void setClusterPhrase(String clusterPhrase) {
        this.clusterPhrase = clusterPhrase;
    }

    /**
     * Returns the document list, creating an empty one on first access so
     * the result is never null.
     *
     * @return the documents
     */
    public List getDocuments() {
        if(documents == null)
            documents = new ArrayList();
        return documents;
    }

    /**
     * @param documents the documents to set
     */
    public void setDocuments(List documents) {
        this.documents = documents;
    }

    /**
     * @return the cluster phrase followed by the number of documents in
     *         square brackets
     */
    @Override
    public String toString() {
        return this.clusterPhrase + " [" + this.getDocuments().size() + "]";
    }
}
26 | * 27 | * @author Jorge Villalon 28 | * 29 | */ 30 | public class Summary extends AbstractOperation { 31 | 32 | /** 33 | * 34 | */ 35 | public Summary() { 36 | this.name = "Summary"; 37 | } 38 | 39 | @Override 40 | public void start() throws Exception { 41 | super.start(); 42 | 43 | this.results = new ArrayList(); 44 | SummaryResult result = new SummaryResult(); 45 | result.setItem("Documents"); 46 | result.setValue(Integer.toString(this.corpus.getPassages().length)); 47 | result.setComment("Number of documents in the corpus"); 48 | results.add(result); 49 | result = new SummaryResult(); 50 | result.setItem("Terms"); 51 | result.setValue(Integer.toString(this.corpus.getTerms().length)); 52 | result.setComment("Number of terms in the corpus"); 53 | results.add(result); 54 | result = new SummaryResult(); 55 | result.setItem("Term selection criteria"); 56 | result.setValue(this.corpus.getParameters().getTermSelectionCriterion() + " [" 57 | + this.corpus.getParameters().getTermSelectionThreshold() + "]"); // "Value" 58 | result 59 | .setComment("The selection criteria used to create the dictionary"); 60 | results.add(result); 61 | 62 | super.end(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/FactorAnalysisPlot.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import tml.vectorspace.operations.results.FactorAnalysisPlotResult; 20 | import Jama.Matrix; 21 | 22 | /** 23 | * This operation simply presents the content of the reconstructed term/doc 24 | * matrix with a column for documents and the first row showing the terms 25 | * 26 | * @author Jorge Villalon 27 | * 28 | */ 29 | public class FactorAnalysisPlot extends AbstractOperation { 30 | 31 | public FactorAnalysisPlot() { 32 | this.name = "Factor analysis"; 33 | this.requiresSemanticSpace = true; 34 | } 35 | 36 | @Override 37 | public void start() throws Exception { 38 | super.start(); 39 | Matrix u = this.corpus.getSemanticSpace().getUk(); 40 | Matrix v = this.corpus.getSemanticSpace().getVk(); 41 | 42 | for (int i = 0; i < u.getRowDimension(); i++) { 43 | FactorAnalysisPlotResult result = new FactorAnalysisPlotResult(); 44 | result.setName(this.corpus.getTerms()[i]); 45 | result.setX(u.get(i, 0)); 46 | result.setY(u.get(i, 1)); 47 | results.add(result); 48 | } 49 | for (int i = u.getRowDimension(); i < u.getRowDimension() 50 | + v.getRowDimension(); i++) { 51 | FactorAnalysisPlotResult result = new FactorAnalysisPlotResult(); 52 | result.setName(this.corpus.getPassages()[i - u.getRowDimension()]); 53 | result.setX(v.get(i - u.getRowDimension(), 0)); 54 | result.setY(v.get(i - u.getRowDimension(), 1)); 55 | results.add(result); 56 | } 57 | super.end(); 58 | } 59 | } 60 | 
-------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/LexiconAnalysis.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import tml.corpus.Corpus.PassageFreqs; 23 | import tml.vectorspace.operations.results.LexiconAnalysisResult; 24 | 25 | 26 | /** 27 | * LexiconAnalysis returns the accumulated lexicon per passage used in the document. It 28 | * is important to consider that stopwords are removed and words that are kept are stemmed, 29 | * therefore this doesn't correspond to the actual total number of different words. 
30 | * 31 | * @author Jorge Villalon 32 | * 33 | */ 34 | public class LexiconAnalysis extends AbstractOperation { 35 | 36 | /** 37 | * @param corpus 38 | */ 39 | public LexiconAnalysis() { 40 | this.name = "Lexicon analysis"; 41 | } 42 | 43 | @Override 44 | public void start() throws Exception { 45 | super.start(); 46 | 47 | this.results = new ArrayList(); 48 | 49 | List list = new ArrayList(); 50 | 51 | for (int i=0; i 2 | 3 | 4 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/IndexingHtmlTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright (C) 2001, 2007 University of Sydney 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 2 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 17 | * USA 18 | * 19 | * http://www.gnu.org/licenses/gpl.txt 20 | *******************************************************************************/ 21 | 22 | package tml.test; 23 | 24 | import java.io.IOException; 25 | 26 | import org.junit.BeforeClass; 27 | import org.junit.Test; 28 | 29 | import tml.Configuration; 30 | import tml.corpus.TextDocument; 31 | import tml.corpus.CorpusParameters.DimensionalityReduction; 32 | import tml.corpus.CorpusParameters.TermSelection; 33 | import static org.junit.Assert.*; 34 | 35 | 36 | /** 37 | * @author Jorge Villalon 38 | * 39 | */ 40 | public class IndexingHtmlTest extends AbstractTmlIndexingTest { 41 | 42 | private static String TESTS_DOCUMENTS_FOLDER = null; 43 | 44 | @BeforeClass 45 | public static void setUpBeforeClass() throws Exception { 46 | AbstractTmlIndexingTest.setUpBeforeClass(); 47 | TESTS_DOCUMENTS_FOLDER = Configuration.getTmlFolder() + "/corpora/html"; 48 | repository.addDocumentsInFolder(TESTS_DOCUMENTS_FOLDER); 49 | } 50 | 51 | @Test 52 | public void readPage() throws IOException { 53 | TextDocument doc = repository.getTextDocument("Automobile"); 54 | assertNotNull(doc); 55 | } 56 | 57 | @Test 58 | public void loadPageCorpus() throws Exception { 59 | TextDocument doc = repository.getTextDocument("Automobile"); 60 | doc.getParameters().setTermSelectionCriterion(TermSelection.DF); 61 | doc.getParameters().setTermSelectionThreshold(2); 62 | doc.getParameters().setDimensionalityReduction(DimensionalityReduction.NO); 63 | doc.load(repository); 64 | assertNotNull(doc.getSentenceCorpus()); 65 | assertNotNull(doc.getParagraphCorpus()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- 
/tml/src/main/java/tml/vectorspace/operations/results/RelationshipExtractionResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * @author Jorge Villalon 21 | * 22 | */ 23 | public class RelationshipExtractionResult extends AbstractResult { 24 | 25 | String conceptA; 26 | String conceptB; 27 | String linkingWord; 28 | 29 | private boolean directed; 30 | 31 | /** 32 | * @return the first concept in the relationship 33 | */ 34 | public String getConceptA() { 35 | return conceptA; 36 | } 37 | 38 | /** 39 | * @return the second concept in the relationship 40 | */ 41 | public String getConceptB() { 42 | return conceptB; 43 | } 44 | 45 | /** 46 | * @return the linking word for the relationship 47 | */ 48 | public String getLinkingWord() { 49 | return linkingWord; 50 | } 51 | 52 | /** 53 | * @return if the relationship is directed (from A to B) or non-directed 54 | */ 55 | public boolean isDirected() { 56 | return directed; 57 | } 58 | 59 | /** 60 | * @param conceptA 61 | * the first concept 62 | */ 63 | public void setConceptA(String conceptA) 
{ 64 | this.conceptA = conceptA; 65 | } 66 | 67 | /** 68 | * @param conceptB 69 | * the second concept 70 | */ 71 | public void setConceptB(String conceptB) { 72 | this.conceptB = conceptB; 73 | } 74 | 75 | /** 76 | * @param directed 77 | * if A points to B 78 | */ 79 | public void setDirected(boolean directed) { 80 | this.directed = directed; 81 | } 82 | 83 | /** 84 | * @param linkingWord 85 | * the linking word/phrase 86 | */ 87 | public void setLinkingWord(String linkingWord) { 88 | this.linkingWord = linkingWord; 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/annotators/Annotator.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | /** 17 | * 18 | */ 19 | package tml.annotators; 20 | 21 | import java.util.ArrayList; 22 | 23 | /** 24 | * Common interface for all annotators. Each annotator will be called 25 | * from the {@link Repository} to analyze each sentence and then 26 | * store the annotated text in a Lucene field while indexing. 
/**
 * Common interface for all annotators. Each annotator will be called
 * from the {@link Repository} to analyze each sentence and then
 * store the annotated text in a Lucene field while indexing.
 *
 * @author Jorge Villalon
 *
 */
public interface Annotator {

    /**
     * Returns the XML annotated version of a text, e.g. for
     * "Rafa is in the US" the pieces of text the annotator recognises
     * ("Rafa", "US") come back wrapped in XML elements.
     *
     * NOTE(review): the concrete markup of the original example was lost
     * from this comment; restore it from the implementation.
     *
     * TODO: Analyze if UIMA provides a better annotation schema
     *
     * @param text the text to be annotated
     * @return the XML
     */
    public String getAnnotations(String text);

    /**
     * The Lucene field name where these annotations are
     * going to be stored.
     *
     * @return the Lucene field name
     */
    public String getFieldName();

    /**
     * The schema by which these annotations can be verified.
     *
     * @return null if no schema is attached
     */
    public Object getSchema();

    /**
     * Returns the pieces of text (words or phrases) in the text that
     * are annotated with a particular label.
     *
     * @param annotationLabel the label to search
     * @return a list of text. Null if no text is found.
     */
    public String[] getAnnotatedText(String annotationLabel);

    /**
     * This method initialises any static attributes required for the annotator to run
     */
    public void init();

    /**
     * NOTE(review): undocumented in the original; presumably the annotation
     * types (labels) this annotator can produce - confirm against the
     * implementations, including the element type of the list.
     *
     * @return the list of types
     */
    public ArrayList getTypes();
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.Comparator; 22 | 23 | import tml.vectorspace.operations.results.TagCloudsResult; 24 | 25 | 26 | /** 27 | * TagClouds returns the list of terms in the {@link Corpus} weighted by 28 | * the term weighting scheme used in the {@link SemanticSpace}. 29 | * 30 | * @author Jorge Villalon 31 | * 32 | */ 33 | public class TagClouds extends AbstractOperation { 34 | 35 | /** 36 | * 37 | */ 38 | public TagClouds() { 39 | this.name = "Tagclouds"; 40 | this.requiresSemanticSpace = false; 41 | } 42 | 43 | @Override 44 | public void start() throws Exception { 45 | super.start(); 46 | 47 | this.results = new ArrayList(); 48 | 49 | double max = 0; 50 | for (int termIndex = 0; termIndex < corpus.getTerms().length; termIndex++) { 51 | String term = corpus.getTerms()[termIndex]; 52 | double weight = corpus.getTermStats()[termIndex].sum; 53 | TagCloudsResult result = new TagCloudsResult(term, weight); 54 | if(weight > max) 55 | max = weight; 56 | this.results.add(result); 57 | } 58 | 59 | if(max == 0) 60 | max = 1; 61 | 62 | for (TagCloudsResult result : this.results) { 63 | result.setWeight(result.getWeight()/max); 64 | } 65 | 66 | Collections.sort(this.results, 67 | new Comparator() { 68 | 69 | @Override 70 | public int compare(TagCloudsResult arg0, 71 | TagCloudsResult arg1) { 72 | int weight0 = (int) (arg0.getWeight() * 
100); 73 | int weight1 = (int) (arg1.getWeight() * 100); 74 | return weight1 - weight0; 75 | } 76 | }); 77 | 78 | super.end(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/RelationshipExtraction.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.Collections; 22 | import java.util.List; 23 | 24 | import tml.annotators.PennTreeAnnotator; 25 | import tml.utils.StanfordUtils; 26 | import tml.vectorspace.operations.results.RelationshipExtractionResult; 27 | 28 | import edu.stanford.nlp.trees.Tree; 29 | 30 | 31 | /** 32 | * Relationship extraction aims to extract the labeled relationships from a set 33 | * of concepts 34 | * 35 | * @author Jorge Villalon 36 | * 37 | */ 38 | public class RelationshipExtraction extends 39 | AbstractOperation { 40 | 41 | @Override 42 | public void start() throws Exception { 43 | super.start(); 44 | 45 | List rels = new ArrayList(); 46 | for (String passageId : this.corpus.getPassages()) { 47 | Tree pennTree = null; 48 | try { 49 | pennTree = StanfordUtils.getTreeFromString(passageId, repository.getDocumentField(passageId, PennTreeAnnotator.FIELD_NAME)); 50 | } catch (IOException e) { 51 | e.printStackTrace(); 52 | logger.error(e); 53 | return; 54 | } 55 | List verbs = StanfordUtils.extractVerbs(pennTree); 56 | if(verbs != null) 57 | for (String verb : verbs) { 58 | verb = verb.trim().toLowerCase(); 59 | if (rels.contains(verb)) 60 | continue; 61 | if (verb.length() == 0) 62 | continue; 63 | rels.add(verb); 64 | } 65 | } 66 | 67 | Collections.sort(rels); 68 | 69 | for (String verb : rels) { 70 | RelationshipExtractionResult result = new RelationshipExtractionResult(); 71 | result.setLinkingWord(verb); 72 | this.results.add(result); 73 | } 74 | super.end(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/PassageExtractionSummarizationResult.java: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations.results; 18 | 19 | /** 20 | * Represents a passage (document, paragraph or sentence), its load 21 | * and the corresponding eigenvector by which it was selected. 
22 | * 23 | * @author Jorge Villalon 24 | * 25 | */ 26 | public class PassageExtractionSummarizationResult extends AbstractResult { 27 | int eigenVectorIndex; 28 | 29 | double load; 30 | 31 | String textPassageContent; 32 | 33 | int textPassageId; 34 | 35 | /** 36 | * @return the eigenVectorIndex 37 | */ 38 | public int getEigenVectorIndex() { 39 | return eigenVectorIndex; 40 | } 41 | 42 | /** 43 | * @param eigenVectorIndex the eigenVectorIndex to set 44 | */ 45 | public void setEigenVectorIndex(int eigenVectorIndex) { 46 | this.eigenVectorIndex = eigenVectorIndex; 47 | } 48 | 49 | /** 50 | * @return the load 51 | */ 52 | public double getLoad() { 53 | return load; 54 | } 55 | 56 | /** 57 | * @param load the load to set 58 | */ 59 | public void setLoad(double load) { 60 | this.load = load; 61 | } 62 | 63 | /** 64 | * @return the textPassageContent 65 | */ 66 | public String getTextPassageContent() { 67 | return textPassageContent; 68 | } 69 | 70 | /** 71 | * @param textPassageContent the textPassageContent to set 72 | */ 73 | public void setTextPassageContent(String textPassageContent) { 74 | this.textPassageContent = textPassageContent; 75 | } 76 | 77 | /** 78 | * @return the textPassageId 79 | */ 80 | public int getTextPassageId() { 81 | return textPassageId; 82 | } 83 | 84 | /** 85 | * @param textPassageId the textPassageId to set 86 | */ 87 | public void setTextPassageId(int textPassageId) { 88 | this.textPassageId = textPassageId; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/utils/DBUtils.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/**
 * Small JDBC helper that keeps one connection and one statement open and
 * runs plain SQL strings through them.
 *
 * Failures are reported through return values (<code>false</code>,
 * <code>null</code>, <code>-1</code>) after printing the stack trace, never
 * through exceptions, except for the driver lookup.
 *
 * NOTE(review): the SQL text is executed as given. Callers must not build it
 * by concatenating untrusted input.
 */
public class DBUtils {

    private Connection m_conn;
    private Statement m_stmt;
    private String url;
    private String username;
    private String password;

    /**
     * Stores the connection settings and loads the JDBC driver class.
     *
     * @param driver fully qualified class name of the JDBC driver
     * @param url JDBC url of the database
     * @param username user to connect with
     * @param password password of the user
     * @throws ClassNotFoundException if the driver class cannot be loaded
     */
    public DBUtils(String driver, String url, String username, String password) throws ClassNotFoundException
    {
        this.url = url;
        this.username = username;
        this.password = password;
        this.setDriver(driver);
    }

    /**
     * Opens the connection and creates the statement used by
     * {@link #sendQuery(String, String)} and {@link #sendUpdate(String)}.
     * A connection left open by a previous call is closed first.
     *
     * @return true if the connection and the statement are ready, false
     *         otherwise
     */
    public boolean setConnection() {
        // Do not leak a connection opened by an earlier call.
        closeConnection();
        try {
            m_conn = DriverManager.getConnection(url, username, password);
            m_stmt = m_conn.createStatement();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            // Release a connection that was opened before the failure.
            closeConnection();
        }
        return false;
    }

    /**
     * Loads the JDBC driver class.
     *
     * @param driver fully qualified class name of the JDBC driver
     * @return always true
     * @throws ClassNotFoundException if the driver class cannot be loaded
     */
    public boolean setDriver(String driver) throws ClassNotFoundException {
        Class.forName(driver);
        return true;
    }

    /**
     * Runs a query and collects one column of every returned row.
     *
     * @param sql the query to execute
     * @param fieldname the column whose values are collected
     * @return the column values in row order, or null if there is no open
     *         connection or the query fails
     */
    public ArrayList<String> sendQuery(String sql, String fieldname) {
        if (m_stmt == null) {
            System.err.println("DBUtils: no open connection, call setConnection() first");
            return null;
        }
        ResultSet rows = null;
        try {
            ArrayList<String> values = new ArrayList<String>();
            rows = m_stmt.executeQuery(sql);
            while (rows.next()) {
                values.add(rows.getString(fieldname));
            }
            return values;
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Close only the result set. The statement is shared with later
            // calls and is closed in closeConnection().
            if (rows != null) {
                try {
                    rows.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Runs an update statement.
     *
     * @param sql the statement to execute
     * @return the value returned by the JDBC statement, or -1 if there is
     *         no open connection or the statement fails
     */
    public int sendUpdate(String sql) {
        if (m_stmt == null) {
            System.err.println("DBUtils: no open connection, call setConnection() first");
            return -1;
        }
        try {
            return m_stmt.executeUpdate(sql);
        } catch (SQLException e) {
            e.printStackTrace();
            return -1;
        }
    }

    /**
     * Closes the statement and the connection. Does nothing if they were
     * never opened or are already closed.
     */
    public void closeConnection()
    {
        if (m_stmt != null) {
            try {
                m_stmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            m_stmt = null;
        }
        if (m_conn != null) {
            try {
                m_conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            m_conn = null;
        }
    }

}
15 | *******************************************************************************/ 16 | package tml.vectorspace.operations; 17 | 18 | import java.io.IOException; 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.Comparator; 22 | import java.util.List; 23 | 24 | import tml.annotators.PennTreeAnnotator; 25 | import tml.utils.StanfordUtils; 26 | import tml.vectorspace.operations.results.TermRankedResult; 27 | 28 | import edu.stanford.nlp.trees.Tree; 29 | 30 | public class CompoundNounsSummarized extends AbstractOperation implements 31 | Operation { 32 | 33 | public CompoundNounsSummarized() { 34 | this.name = "Compound nounds summarized"; 35 | } 36 | 37 | @Override 38 | public void start() throws Exception { 39 | super.start(); 40 | 41 | List nouns = new ArrayList(); 42 | for(String passageId : corpus.getPassages()) { 43 | String annotation = null; 44 | try { 45 | annotation = this.repository.getDocumentField(passageId, PennTreeAnnotator.FIELD_NAME); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | logger.error(e); 49 | } 50 | if(annotation != null) { 51 | Tree pennTree = StanfordUtils.getTreeFromString(passageId, annotation); 52 | List allNouns = StanfordUtils.extractNouns(pennTree); 53 | if(allNouns != null) 54 | for(String noun : allNouns) { 55 | noun = noun.toLowerCase(); 56 | if(!nouns.contains(noun)) { 57 | nouns.add(noun); 58 | TermRankedResult result = new TermRankedResult(); 59 | result.setTerm(noun.toLowerCase()); 60 | result.setRank(0); 61 | this.results.add(result); 62 | } 63 | } 64 | } 65 | } 66 | 67 | Collections.sort(this.results, new Comparator() { 68 | 69 | @Override 70 | public int compare(TermRankedResult o1, TermRankedResult o2) { 71 | return o1.getTerm().compareTo(o2.getTerm()); 72 | } 73 | 74 | }); 75 | 76 | super.end(); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /tml/www/doc/stylesheet.css: 
/* Javadoc style sheet */

/* Define colors, fonts and other style attributes here to override the defaults */

/* Page background color */
body {
    background-color: #FFFFFF;
    background-image: url(resources/bkg_gradient.gif);
    background-repeat: repeat-x;
    margin: 0 auto;
    font-family: 'Lucida Grande', Geneva, Verdana, Arial, sans-serif;
    font-size: 12px;
    padding: 0em 2em;
    color: #333;
}

/* Common elements */

font {
    font-family: inherit;
    font-size: inherit;
    color: inherit;
    font-weight: inherit;
}

hr { display: none; }

a:link { color: #0066cc; }
a:visited { color: #8b5caf; }
a:hover { color: #6699cc; }

/* Headings */
h1 {
    /* An earlier "font-size: 145%" was always overridden by the 2em below and has been dropped. */
    background-image: url(resources/h1_hdr.png);
    background-repeat: no-repeat;
    border-top: 1px dotted #CCCCCC;
    line-height: 1.2em;
    color: #182737;
    font-size: 2em;
    padding: 1.5em;
    margin-top: 0px;
    text-align: left;
}

/* Default Table elements and colors */

th, table { border-collapse: collapse; border-color: #E6E7E8; }

.TableHeadingColor {
    background: #000000 url(resources/bkg_blkheader.png) repeat-x scroll left top;
    color: #FFFFFF;
    font-size: 12px;
    font-weight: bold;
    height: 31px;
    text-align: left;
    padding: 1.5em;
}

.TableHeadingColor th {
    padding-left: 10px;
}

.TableSubHeadingColor { background: #EEEEFF } /* Light mauve */
.TableRowColor { background: #FFFFFF; border-color: #E6E7E8; }
.TableRowColor td { line-height: 175%; padding-left: 10px; }

/* Font used in left-hand frame lists */
.FrameTitleFont { font-size: 125%; font-family: Helvetica, Arial, sans-serif; font-weight: bold; margin-top: 1em; display: block; }
.FrameHeadingFont { font-size: 125%; font-family: 'Lucida Grande', Geneva, Verdana, Arial, sans-serif; font-weight: bold; margin-top: 1em; display: block; }
.FrameItemFont { font-size: 100%; font-family: Helvetica, Arial, sans-serif }

/* Navigation bar fonts and colors */

.NavBarCell1 {
    background-color: #ffffff;
    background-image: url(resources/bkgheader.png);
    /* "repeat-x scroll left top" is shorthand syntax and is not a valid
       background-repeat value, so the whole declaration was ignored.
       It is split into the longhand properties it was meant to set. */
    background-repeat: repeat-x;
    background-attachment: scroll;
    background-position: left top;
    line-height: 2em;
    padding-left: 6px;
    padding-right: 6px;
}

/* Declared twice in the original (white and #FFFFFF); merged into one rule. */
.NavBarFont1 {
    color: #FFFFFF;
}
.NavBarCell1 a {
    color: white;
}

.NavBarCell1Rev { background-color: #FFFFFF; padding-left: 6px; padding-right: 6px; }
.NavBarFont1Rev { color: #243446; }

.NavBarCell2 { background-color: #FFFFFF; }
.NavBarCell3 { background-color: #FFFFFF; }
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 17 | * USA 18 | * 19 | * http://www.gnu.org/licenses/gpl.txt 20 | *******************************************************************************/ 21 | 22 | package tml.test; 23 | 24 | import java.io.File; 25 | 26 | import org.junit.BeforeClass; 27 | import org.junit.Test; 28 | 29 | import tml.Configuration; 30 | import tml.corpus.TextDocument; 31 | 32 | import static org.junit.Assert.*; 33 | 34 | public class IndexingPlainTextTest extends AbstractTmlIndexingTest { 35 | 36 | @BeforeClass 37 | public static void setUpBeforeClass() throws Exception { 38 | AbstractTmlIndexingTest.setUpBeforeClass(); 39 | File[] fileList = { 40 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0100.a1.txt"), 41 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0101.a1.txt"), 42 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0102.a1.txt")}; 43 | repository.addDocumentsInList(fileList); 44 | } 45 | 46 | @Test 47 | public void numbersDiagnostic01() throws Exception { 48 | TextDocument document = repository.getTextDocument("0100.a1"); 49 | document.load(repository); 50 | assertEquals(30, document.getSentenceCorpus().getPassages().length); 51 | assertEquals(9, document.getParagraphCorpus().getPassages().length); 52 | } 53 | 54 | @Test 55 | public void numbersDiagnostic02() throws Exception { 56 | TextDocument document = repository.getTextDocument("0101.a1"); 57 | document.load(repository); 58 | assertEquals(41, document.getSentenceCorpus().getPassages().length); 59 | assertEquals(9, document.getParagraphCorpus().getPassages().length); 60 | } 61 | 62 | @Test 63 | public void numbersDiagnostic36() throws Exception { 64 | TextDocument document = repository.getTextDocument("0102.a1"); 65 | document.load(repository); 66 | assertEquals(49, 
document.getSentenceCorpus().getPassages().length); 67 | assertEquals(11, document.getParagraphCorpus().getPassages().length); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/importers/AbstractImporter.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.storage.importers; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import org.apache.log4j.Logger; 23 | 24 | /** 25 | * Abstract class for all importers to extend from. It implements the logger 26 | * a list of file extensions and a static factory to obtain the right 27 | * importer for a given extension 28 | * 29 | * @author Jorge Villalon 30 | * 31 | */ 32 | public abstract class AbstractImporter { 33 | 34 | protected static Logger logger = Logger.getLogger(AbstractImporter.class); 35 | 36 | protected abstract String[] getFileExtensions(); 37 | 38 | protected List fileExtensions; 39 | 40 | /** 41 | * Creates a new instance of an {@link AbstractImporter}. 
As this class 42 | * is an abstract class, this can be called only by the constructor 43 | * of a sub-class 44 | */ 45 | public AbstractImporter() { 46 | this.fileExtensions = new ArrayList(); 47 | for (String extension : getFileExtensions()) { 48 | this.fileExtensions.add(extension); 49 | } 50 | } 51 | 52 | /** 53 | * @param fileExtension the extension of a filename (e.g. txt, pdf, doc) 54 | * @return true if the importer can manage the extension 55 | */ 56 | public boolean isValidFileExtension(String fileExtension) { 57 | for (String extension : this.fileExtensions) { 58 | if (extension.equals(fileExtension)) 59 | return true; 60 | } 61 | return false; 62 | } 63 | 64 | /** 65 | * @param fileExtension the file extension to validate 66 | * @return an importer to manage files of the given extension 67 | */ 68 | public static Importer createImporter(String fileExtension) { 69 | Importer importer = null; 70 | 71 | importer = new TextImporter(); 72 | if (importer.isValidFileExtension(fileExtension)) 73 | return importer; 74 | 75 | importer = new HtmlImporter(); 76 | if (importer.isValidFileExtension(fileExtension)) 77 | return importer; 78 | 79 | return null; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/ParagraphCoherenceIndex.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace.operations; 17 | 18 | import java.io.IOException; 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | import tml.vectorspace.operations.results.PassageDistancesResult; 23 | 24 | 25 | public class ParagraphCoherenceIndex extends PassageDistances { 26 | 27 | public ParagraphCoherenceIndex() { 28 | this.name = "Paragraph coherence index"; 29 | } 30 | 31 | @Override 32 | public void start() throws Exception { 33 | super.start(); 34 | 35 | List newResults = new ArrayList(); 36 | String lastParagraphId = null; 37 | double average = 0; 38 | int total = 0; 39 | int currentParagraphIndex = 0; 40 | try { 41 | for(int i=0; i) newResults; 71 | 72 | super.end(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/SVD.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.FileOutputStream; 21 | import java.io.IOException; 22 | import java.io.ObjectInputStream; 23 | import java.io.ObjectOutputStream; 24 | import java.io.Serializable; 25 | 26 | public class SVD implements Serializable { 27 | 28 | /** Serialization ID */ 29 | private static final long serialVersionUID = -1733583945325917544L; 30 | 31 | /** Terms matrix in the semantic space */ 32 | private double[][] Ukdata = null; 33 | /** Singular values in the semantic space */ 34 | private double[][] Skdata = null; 35 | /** Documents matrix in the semantic space */ 36 | private double[][] Vkdata = null; 37 | /** 38 | * @return the ukdata 39 | */ 40 | public double[][] getUkdata() { 41 | return Ukdata; 42 | } 43 | /** 44 | * @param ukdata the ukdata to set 45 | */ 46 | public void setUkdata(double[][] ukdata) { 47 | Ukdata = ukdata; 48 | } 49 | /** 50 | * @return the skdata 51 | */ 52 | public double[][] getSkdata() { 53 | return Skdata; 54 | } 55 | /** 56 | * @param skdata the skdata to set 57 | */ 58 | public void setSkdata(double[][] skdata) { 59 | Skdata = skdata; 60 | } 61 | /** 62 | * @return the vkdata 63 | */ 64 | public double[][] getVkdata() { 65 | return Vkdata; 66 | } 67 | /** 68 | * @param vkdata the vkdata to set 69 | */ 70 | public void setVkdata(double[][] vkdata) { 71 | Vkdata = vkdata; 72 | } 73 | 74 | public void 
saveSVD(File file) throws IOException { 75 | FileOutputStream stream = new FileOutputStream(file); 76 | ObjectOutputStream objSt = new ObjectOutputStream(stream); 77 | objSt.writeObject(this); 78 | objSt.close(); 79 | } 80 | 81 | public static SVD readSVD(File file) throws IOException, ClassNotFoundException { 82 | FileInputStream stream = new FileInputStream(file); 83 | ObjectInputStream objSt = new ObjectInputStream(stream); 84 | SVD svd = (SVD) objSt.readObject(); 85 | objSt.close(); 86 | return svd; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /tml/src/test/java/tml/test/ReadabilityTest.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2010 Stephen O'Rourke (stephen.orourke@sydney.edu.au) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | 17 | package tml.test; 18 | 19 | import static org.junit.Assert.*; 20 | 21 | import org.junit.BeforeClass; 22 | import org.junit.Test; 23 | 24 | import tml.corpus.TextDocument; 25 | import tml.storage.importers.TextImporter; 26 | import tml.vectorspace.TermWeighting; 27 | import tml.vectorspace.operations.Readability; 28 | import tml.vectorspace.operations.results.ReadabilityResult; 29 | 30 | /** 31 | * This class tests the {@link Readability} operation. 32 | * 33 | * @author Stephen O'Rourke 34 | * 35 | */ 36 | public class ReadabilityTest extends AbstractTmlIndexingTest { 37 | 38 | private static TextDocument document; 39 | 40 | @BeforeClass 41 | public static void setUpBeforeClass() throws Exception { 42 | AbstractTmlIndexingTest.setUpBeforeClass(); 43 | String content = "The cat sat on the mat. On the mat the cat sat.\nThe feline reclined on the axminster."; 44 | repository.addDocument("1", content, "Title", "N/A", new TextImporter()); 45 | 46 | document = repository.getTextDocument("1"); 47 | document.getParameters().setTermWeightLocal(TermWeighting.LocalWeight.TF); 48 | document.getParameters().setTermWeightGlobal(TermWeighting.GlobalWeight.None); 49 | document.load(repository); 50 | } 51 | 52 | @Test 53 | public void shouldCalculateReadability() throws Exception { 54 | Readability operation = new Readability(); 55 | operation.setCorpus(document.getParagraphCorpus()); 56 | operation.start(); 57 | 58 | assertEquals(operation.getResultsNumber(), 2); 59 | 60 | ReadabilityResult result1 = operation.getResults().get(0); 61 | assertEquals(result1.getDiffGradeLevel(), 9.83, 0.005); 62 | assertEquals(result1.getDiffReadingEase(), 70.5, 0.005); 63 | assertEquals(result1.getFleshKincaidGradeLevel(), -1.45, 0.005); 64 | assertEquals(result1.getFleshReadingEase(), 116.19, 0.05); 65 | 66 | ReadabilityResult result2 = operation.getResults().get(1); 67 | 
assertEquals(result2.getDiffGradeLevel(), 0.0, 0.0); 68 | assertEquals(result2.getDiffReadingEase(), 0.0, 0.0); 69 | assertEquals(result2.getFleshKincaidGradeLevel(), 8.38, 0.005); 70 | assertEquals(result2.getFleshReadingEase(), 45.69, 0.05); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/factorisation/SpaceDecomposition.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.factorisation; 17 | 18 | import java.io.File; 19 | import java.io.FileInputStream; 20 | import java.io.FileOutputStream; 21 | import java.io.IOException; 22 | import java.io.ObjectInputStream; 23 | import java.io.ObjectOutputStream; 24 | import java.io.Serializable; 25 | 26 | public class SpaceDecomposition implements Serializable { 27 | 28 | /** Serialization ID */ 29 | private static final long serialVersionUID = -1733583945325917544L; 30 | 31 | /** Terms matrix in the semantic space */ 32 | private double[][] Ukdata = null; 33 | /** Singular values in the semantic space */ 34 | private double[][] Skdata = null; 35 | /** Documents matrix in the semantic space */ 36 | private double[][] Vkdata = null; 37 | /** 38 | * @return the ukdata 39 | */ 40 | public double[][] getUkdata() { 41 | return Ukdata; 42 | } 43 | /** 44 | * @param ukdata the ukdata to set 45 | */ 46 | public void setUkdata(double[][] ukdata) { 47 | Ukdata = ukdata; 48 | } 49 | /** 50 | * @return the skdata 51 | */ 52 | public double[][] getSkdata() { 53 | return Skdata; 54 | } 55 | /** 56 | * @param skdata the skdata to set 57 | */ 58 | public void setSkdata(double[][] skdata) { 59 | Skdata = skdata; 60 | } 61 | /** 62 | * @return the vkdata 63 | */ 64 | public double[][] getVkdata() { 65 | return Vkdata; 66 | } 67 | /** 68 | * @param vkdata the vkdata to set 69 | */ 70 | public void setVkdata(double[][] vkdata) { 71 | Vkdata = vkdata; 72 | } 73 | 74 | public void saveSVD(File file) throws IOException { 75 | FileOutputStream stream = new FileOutputStream(file); 76 | ObjectOutputStream objSt = new ObjectOutputStream(stream); 77 | objSt.writeObject(this); 78 | objSt.close(); 79 | } 80 | 81 | public static SpaceDecomposition readSVD(File file) throws IOException, ClassNotFoundException { 82 | FileInputStream stream = new FileInputStream(file); 83 | ObjectInputStream objSt = new 
ObjectInputStream(stream); 84 | SpaceDecomposition svd = (SpaceDecomposition) objSt.readObject(); 85 | objSt.close(); 86 | return svd; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/storage/DocumentCleanup.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package tml.storage; 5 | 6 | import java.util.List; 7 | 8 | import org.apache.log4j.Logger; 9 | import org.apache.lucene.document.Document; 10 | 11 | import tml.corpus.CorpusParameters; 12 | import tml.corpus.TextDocument; 13 | import tml.corpus.CorpusParameters.DimensionalityReduction; 14 | import tml.corpus.CorpusParameters.TermSelection; 15 | import tml.vectorspace.TermWeighting.GlobalWeight; 16 | import tml.vectorspace.TermWeighting.LocalWeight; 17 | 18 | /** 19 | * @author Jorge Villalon 20 | * 21 | */ 22 | public class DocumentCleanup implements Runnable { 23 | 24 | private static Logger logger = Logger.getLogger(DocumentCleanup.class); 25 | private Repository repository; 26 | private CorpusParameters params; 27 | 28 | public DocumentCleanup(Repository repo) { 29 | this.repository = repo; 30 | this.params = new CorpusParameters(); 31 | this.params.setDimensionalityReduction(DimensionalityReduction.NO); 32 | this.params.setDimensionalityReductionThreshold(0); 33 | this.params.setLanczosSVD(false); 34 | this.params.setNormalizeDocuments(false); 35 | this.params.setTermSelectionCriterion(TermSelection.DF); 36 | this.params.setTermSelectionThreshold(0); 37 | this.params.setTermWeightGlobal(GlobalWeight.None); 38 | this.params.setTermWeightLocal(LocalWeight.TF); 39 | } 40 | 41 | /* (non-Javadoc) 42 | * @see java.lang.Runnable#run() 43 | */ 44 | @Override 45 | public void run() { 46 | logger.debug("Document cleanup started"); 47 | 48 | int total = 0; 49 | List docs; 50 | try { 51 | docs = this.repository.getAllTextDocuments(); 52 | } catch (Exception e) { 53 | 
logger.error(e.getMessage()); 54 | return; 55 | } 56 | 57 | if(docs == null) { 58 | logger.debug("No documents to cleanup"); 59 | return; 60 | } 61 | 62 | for(TextDocument doc : docs) { 63 | try { 64 | String[][] subs = this.repository.getDbConnection().getSubDocuments(doc.getExternalId()); 65 | if(subs.length <= 1) { 66 | logger.debug("Inserting document in the database:" + doc.getExternalId()); 67 | Document document = repository.getIndexReader().document(doc.getLuceneId()); 68 | this.repository.getDbConnection().insertDocument(repository, document); 69 | doc.setParameters(this.params); 70 | doc.load(repository); 71 | for(int id : doc.getSentenceCorpus().getPassagesLuceneIds()) { 72 | Document sentence = repository.getIndexReader().document(id); 73 | this.repository.getDbConnection().insertDocument(repository, sentence); 74 | } 75 | for(int id : doc.getParagraphCorpus().getPassagesLuceneIds()) { 76 | Document sentence = repository.getIndexReader().document(id); 77 | this.repository.getDbConnection().insertDocument(repository, sentence); 78 | } 79 | total++; 80 | } 81 | } catch (Exception e) { 82 | logger.error(e.getMessage()); 83 | continue; 84 | } 85 | } 86 | 87 | if(total > 0) 88 | logger.info("Cleaned " + total + " documents"); 89 | else 90 | logger.debug("Nothing to clean!"); 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/PassageExtractionSummarization.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | 17 | package tml.vectorspace.operations; 18 | 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.TreeMap; 22 | 23 | import tml.corpus.TextDocument; 24 | import tml.vectorspace.operations.results.PassageExtractionSummarizationResult; 25 | 26 | 27 | import Jama.Matrix; 28 | 29 | /** 30 | * @author Jorge Villalon 31 | * 32 | */ 33 | public class PassageExtractionSummarization extends AbstractOperation { 34 | 35 | private double loadThreshold = 0.5; 36 | 37 | /** 38 | * 39 | */ 40 | public PassageExtractionSummarization() { 41 | this.name = "Passage extraction"; 42 | } 43 | 44 | /** 45 | * @return the threshold by which a text passage will be kept as result 46 | */ 47 | public double getLoadThreshold() { 48 | return loadThreshold; 49 | } 50 | 51 | /** 52 | * @param loadThreshold 53 | */ 54 | public void setLoadThreshold(double loadThreshold) { 55 | this.loadThreshold = loadThreshold; 56 | } 57 | 58 | @Override 59 | public void start() throws Exception { 60 | 61 | super.start(); 62 | 63 | this.results = new ArrayList(); 64 | 65 | Matrix eigenVectors = this.corpus.getSemanticSpace() 66 | .getVk(); 67 | 68 | for (int i = 0; i < eigenVectors.getColumnDimension(); i++) { 69 | TreeMap v = new TreeMap(); 70 | for (int j = 0; j < eigenVectors.getRowDimension(); j++) { 71 | v.put(Math.abs(eigenVectors.get(j, i)), j); 72 | } 73 | double d = v.lastKey(); 74 | int q = v.get(d); 75 | 
PassageExtractionSummarizationResult result = new PassageExtractionSummarizationResult(); 76 | result.setEigenVectorIndex(i); 77 | result.setLoad(d); 78 | try { 79 | TextDocument doc = this.repository.getTextDocument(this.corpus.getPassages()[q]); 80 | result.setTextPassageContent(doc.getContent()); 81 | result.setTextPassageId(q); 82 | this.results.add(result); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | logger.error(e); 86 | } 87 | if (this.results.size() >= this.maxResults) 88 | break; 89 | } 90 | 91 | super.end(); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/results/TermsExtractionSummarizationResult.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.results; 17 | 18 | /** 19 | * This class represents the result of a {@link TermExtractionSummarization} operation. It 20 | * represents a {@link Term} in the {@link Corpus}, with its corresponding 21 | * eigenvector and loading. 
22 | * 23 | * @author Jorge Villalon 24 | * 25 | */ 26 | public class TermsExtractionSummarizationResult extends AbstractResult { 27 | String term; 28 | int termId; 29 | double load; 30 | double variance; 31 | int eigenVectorIndex; 32 | 33 | /** 34 | * @return the position of the eigenvector (relative importance) 35 | */ 36 | public int getEigenVectorIndex() { 37 | return eigenVectorIndex; 38 | } 39 | 40 | /** 41 | * @param eigenVectorIndex the position of the eigenvector 42 | */ 43 | public void setEigenVectorIndex(int eigenVectorIndex) { 44 | this.eigenVectorIndex = eigenVectorIndex; 45 | } 46 | 47 | /** 48 | * @return the load of the term in the eigenvector 49 | */ 50 | public double getLoad() { 51 | return load; 52 | } 53 | 54 | /** 55 | * @param load the load of the term in the eigenvector 56 | */ 57 | public void setLoad(double load) { 58 | this.load = load; 59 | } 60 | 61 | /** 62 | * @return the textual representation of the term 63 | */ 64 | public String getTerm() { 65 | return term; 66 | } 67 | 68 | /** 69 | * @param sentence the textual representation of the term 70 | */ 71 | public void setTerm(String sentence) { 72 | this.term = sentence; 73 | } 74 | 75 | /** 76 | * @return the id of the term 77 | */ 78 | public int getTermId() { 79 | return termId; 80 | } 81 | 82 | /** 83 | * @param termId the id of the term 84 | */ 85 | public void setTermId(int termId) { 86 | this.termId = termId; 87 | } 88 | 89 | /** 90 | * @return the variance corresponding to the eigenvector 91 | */ 92 | public double getVariance() { 93 | return variance; 94 | } 95 | 96 | /** 97 | * @param variance the variance corresponding to the eigenvector 98 | */ 99 | public void setVariance(double variance) { 100 | this.variance = variance; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /tml/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | tml 6 | tml-core 7 | 3.0.0 8 | 
TML - Text Mining Library 9 | 10 | 11 | 12 | maven-compiler-plugin 13 | 2.0.2 14 | 15 | 1.6 16 | 1.6 17 | 18 | 19 | 20 | 21 | 22 | ${basedir}/src/main/java 23 | 24 | **/* 25 | 26 | 27 | 28 | 29 | 30 | 31 | nz.ac.waikato.cs 32 | weka 33 | 3.5.6 34 | 35 | 36 | stanford 37 | stanford-parser 38 | 1.6.1 39 | 40 | 41 | edu.mit.jwi 42 | jwi 43 | 2.1.5 44 | 45 | 46 | jama 47 | jama 48 | 1.0.2 49 | 50 | 51 | log4j 52 | log4j 53 | 1.2.14 54 | 55 | 56 | org.xerial 57 | sqlite-jdbc 58 | 3.6.20 59 | 60 | 61 | commons-cli 62 | commons-cli 63 | 1.2 64 | 65 | 66 | org.apache.lucene 67 | lucene-core 68 | 2.4.1 69 | 70 | 71 | org.apache.lucene 72 | lucene-analyzers 73 | 2.4.1 74 | 75 | 76 | org.apache.lucene 77 | lucene-snowball 78 | 2.4.1 79 | 80 | 81 | junit 82 | junit 83 | 4.7 84 | 85 | 86 | commons-logging 87 | commons-logging 88 | 1.1.1 89 | jar 90 | compile 91 | 92 | 93 | org.htmlparser 94 | htmlparser 95 | 1.6 96 | jar 97 | compile 98 | 99 | 100 | -------------------------------------------------------------------------------- /tml/src/main/java/tml/vectorspace/operations/summarization/LatentSemanticAnalysisSummarization.java: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | *******************************************************************************/ 16 | package tml.vectorspace.operations.summarization; 17 | 18 | import tml.corpus.Corpus; 19 | import tml.vectorspace.NotEnoughTermsInCorpusException; 20 | import Jama.Matrix; 21 | 22 | /** 23 | * 24 | * LSA based summarization using Steiberger's formula from: 25 | * INPROCEEDINGS{ 26 | * author = {Josef Steinberger and Karel Jezek}, 27 | * title = {Using Latent Semantic Analysis in Text Summarization and Summary Evaluation}, 28 | * booktitle = {Proceedings of the 7th International Conference ISIM}, 29 | * year = {2004} 30 | * } 31 | * 32 | * @author Jorge Villalon 33 | * 34 | */ 35 | public class LatentSemanticAnalysisSummarization extends 36 | AbstractSummarizationOperation implements SummarizationOperation { 37 | 38 | private Matrix Vk = null; 39 | private Matrix Sk = null; 40 | private Matrix Uk = null; 41 | 42 | public LatentSemanticAnalysisSummarization() { 43 | this.name = "LSA"; 44 | } 45 | 46 | @Override 47 | public void setCorpus(Corpus corpus) { 48 | super.setCorpus(corpus); 49 | 50 | if(corpus == null) 51 | return; 52 | 53 | if(!this.corpus.getSemanticSpace().isCalculated()) { 54 | try { 55 | this.corpus.getSemanticSpace().calculate(); 56 | } catch (NotEnoughTermsInCorpusException e) { 57 | logger.error(e); 58 | super.setCorpus(null); 59 | return; 60 | } 61 | } 62 | 63 | // Reminder! Vk is transposed in SVD so Vk is docs by dimensions 64 | this.Vk = this.corpus.getSemanticSpace().getVk().copy(); 65 | this.Uk = this.corpus.getSemanticSpace().getUk().copy(); 66 | 67 | // The variance corresponds to the squared eigenvalues, so we square S 68 | this.Sk = this.corpus.getSemanticSpace().getSk().copy(); 69 | this.Sk = this.Sk.times(this.Sk); 70 | } 71 | 72 | @Override 73 | protected double calculatePassageLoading(int doc) { 74 | double total = 0; 75 | for(int dim =0; dim> ncol), * 8 | * * 9 | * so that {u, sqrt(lambda), v} is a singular triplet of A. 
/**************************************************************
 * global variables and common areas used by las2 and its     *
 * procedures.                                                *
 *                                                            *
 * This header uses FILE and NULL, so <stdio.h> must be       *
 * included before it. The variables below are definitions,   *
 * not extern declarations.                                   *
 **************************************************************/

#define LMTNW   600000  /* max. size of working area allowed  */
#define NMAX    3000    /* bound on ncol, order of A          */
#define NZMAX   100000  /* bound on number of nonzeros in a   */

long    ierr,           /* error flag                         */
        j,              /* number of lanczos steps taken      */
        neig,           /* number of ritz values stabilized   */
        nsig,           /* number of accepted ritz values     *
                         * based on kappa (relative accuracy) */
        ncol,           /* number of columns of A             */
        nrow,           /* number of rows of A                */
        mxvcount = 0;

/**************************************************************
 * pointers to areas holding input matrix which is stored in  *
 * harwell-boeing format.                                     *
 **************************************************************/
long    *pointr = NULL, /* pointer to column start array      */
        *rowind = NULL; /* pointer to row indices array       */
double  *value = NULL;  /* pointer to nonzero values array    */

double  rnm,            /* norm of the next residual vector   */
        anorm,
        tol,
        eps,            /* positive machine epsilon           */
        eps1,           /* roundoff estimate for dot product  *
                         * of two unit vector                 */
        reps,
        eps34;

double  *xv1 = NULL,    /* temp arrays needed for computing   */
        *xv2 = NULL,    /* singular vectors                   */
        *ztemp = NULL,

        *a = NULL;      /* pointer to area used by user-      *
                         * supplied procedure store and holds *
                         * lanczos vectors                    */

FILE    *fp_out1 = NULL;/* output file pointers               */
long    fp_out2;

char    *error[10] = {  /* error messages used by function    *
                         * check_parameters; slots 0 and 9    *
                         * carry no message                   */
    NULL,
    " SORRY, YOUR MATRIX IS TOO BIG ",
    " ***** ENDL MUST BE LESS THAN ENDR *****",
    " ***** MAXPRS CANNOT EXCEED LANMAX *****",
    " ***** N = NROW + NCOL MUST BE GREATER THAN ZERO *****",
    " ***** LANMAX (NUMBER OF LANCZOS STEPS) IS INVALID *****",
    " ***** MAXPRS (NUMBER OF EIGENPAIRS DESIRED) IS INVALID *****",
    " ***** 6*N+4*LANMAX+1 + LANMAX*LANMAX CANNOT EXCEED NW *****",
    " ***** 6*N+4*LANMAX+1 CANNOT EXCEED NW *****",
    NULL};