This package is a general container for the rest of the packages. It also contains a Configuration class that reads the default configuration from the jar file or from the filesystem.
31 |
32 |
33 |
--------------------------------------------------------------------------------
/tml/src/test/resources/tml.properties:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Copyright (C) 2001, 2007 University of Sydney
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program; if not, write to the Free Software
16 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
17 | # USA
18 | #
19 | # http://www.gnu.org/licenses/gpl.txt
20 | ###############################################################################
21 | # Set root logger level to DEBUG and its only appender to A1.
22 | log4j.rootLogger=DEBUG, A1
23 |
24 | # A1 is set to be a ConsoleAppender.
25 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
26 |
27 | # A1 uses PatternLayout.
28 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
29 | log4j.appender.A1.layout.ConversionPattern=[%-5p] %-4r [%t] %-20c{2} - %m %x %n
30 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 | Operations are predefined sets of interesting information that can be obtained from a Semantic Space.
27 |
28 |
Package Specification
29 |
30 |
Operations implement algorithms to obtain patterns of interest from a Semantic Space, such as extracting the most relevant passages or terms, or to obtain a set of labelled clusters.
31 |
32 |
33 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/LastPassageResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * The content of the last passage.
21 | *
22 | * @author Jorge Villalon
23 | *
24 | */
25 | public class LastPassageResult extends AbstractResult {
26 |
27 | String passage;
28 |
29 | /**
30 | * @return the passage
31 | */
32 | public String getPassage() {
33 | return passage;
34 | }
35 |
36 | /**
37 | * @param passage the passage to set
38 | */
39 | public void setPassage(String passage) {
40 | this.passage = passage;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/importers/TextImporter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.storage.importers;
18 |
19 | /**
20 | * TextImporter implements the simples importer of plain text, therefore
21 | * it just returns the content as it is.
22 | *
23 | * @author Jorge Villalon
24 | *
25 | */
26 | public class TextImporter extends AbstractImporter implements Importer {
27 |
28 | @Override
29 | public String getCleanContent(String content) {
30 | return content;
31 | }
32 |
33 | @Override
34 | protected String[] getFileExtensions() {
35 | String[] extensions = { "txt" };
36 | return extensions;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/factorisation/SingularValueDecomposition.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.factorisation;
17 |
18 | import Jama.Matrix;
19 |
20 | public class SingularValueDecomposition extends MatrixFactorisation {
21 |
22 | private Jama.SingularValueDecomposition svd = null;
23 |
24 | @Override
25 | public void process(Matrix v) {
26 | svd = new Jama.SingularValueDecomposition(v);
27 | this.decomposition = new SpaceDecomposition();
28 | this.decomposition.setUkdata(svd.getU().getArray());
29 | this.decomposition.setSkdata(svd.getS().getArray());
30 | this.decomposition.setVkdata(svd.getV().getArray());
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/DbConnectionTest.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package tml.test;
5 |
6 |
7 | import java.io.File;
8 | import java.io.IOException;
9 |
10 | import org.apache.lucene.index.CorruptIndexException;
11 | import org.apache.lucene.store.LockObtainFailedException;
12 | import org.junit.BeforeClass;
13 | import org.junit.Test;
14 |
15 | import tml.Configuration;
16 |
17 | import static org.junit.Assert.*;
18 |
19 | /**
20 | * @author Jorge Villalon
21 | *
22 | */
23 | public class DbConnectionTest extends AbstractTmlIndexingTest {
24 |
25 | /**
26 | * @throws java.lang.Exception
27 | */
28 | @BeforeClass
29 | public static void setUpBeforeClass() throws Exception {
30 | AbstractTmlIndexingTest.setUpBeforeClass();
31 | }
32 |
33 | @Test
34 | public void checkConnection() {
35 | assertNotNull(repository.getDbConnection());
36 | }
37 |
38 | @Test
39 | public void addMetaData() throws LockObtainFailedException, CorruptIndexException, IOException {
40 | File[] files = new File[1];
41 | files[0] = new File(Configuration.getTmlFolder() + "/corpora/uppsala/0100.a1.txt");
42 | repository.addDocumentsInList(files);
43 | }
44 |
45 | @Test
46 | public void getNullMetaData() {
47 | String metadata = repository.getAnnotations("0100.a1", "penntree");
48 | assertNull(metadata);
49 | metadata = repository.getAnnotations("p1d0100.a1", "penntree");
50 | assertNull(metadata);
51 | metadata = repository.getAnnotations("s1d0100.a1", "penntree");
52 | assertNull(metadata);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/corpus/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 | Implements all the classes required for managing corpora as bags of words; it also includes NLP for sentences.
27 |
28 |
Package Specification
29 |
30 |
This package implements the bag of words approach for documents at three levels: document, paragraph and sentence. As grammatical information is available at the sentence level, it also includes the Penn Treebank parse tree of each sentence.
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/TermWeightingException.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace;
18 |
/**
 * Signals that something went wrong while the term weighting criteria were
 * being applied.
 *
 * @author Jorge Villalon
 */
public class TermWeightingException extends Exception {

    private static final long serialVersionUID = -7804139372695995041L;

    /** Detail message shared by every instance of this exception. */
    private static final String MESSAGE = "Exception while calculating Term weighting scheme";

    /**
     * Creates the exception with the standard message and a {@code null} cause.
     */
    public TermWeightingException() {
        this(null);
    }

    /**
     * Creates the exception with the standard message, wrapping the original problem.
     *
     * @param e the exception that caused the failure, may be {@code null}
     */
    public TermWeightingException(Exception e) {
        super(MESSAGE, e);
    }
}
45 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/corpus/ParagraphCorpus.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.corpus;
17 |
18 | /**
19 | * Corpus that represents the paragraphs of a {@link TextDocument}
20 | *
21 | * @author Jorge Villalon
22 | *
23 | */
24 | public class ParagraphCorpus extends Corpus {
25 |
26 | /**
27 | * @param document the {@link TextDocument} to which the paragraphs belong
28 | * @throws Exception if the document is null
29 | */
30 | public ParagraphCorpus(TextDocument document) throws Exception {
31 |
32 | if(document == null)
33 | throw new Exception("A paragraph corpus must belong to a document");
34 |
35 | this.luceneQuery = "type:paragraph AND reference:" + document.getExternalId();
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/annotators/AbstractAnnotator.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.annotators;
17 |
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 |
/**
 * Base class for annotators. It keeps the name of the field the annotator is
 * associated with and the list of types it was configured with.
 */
public class AbstractAnnotator {

    /** Name of the field given at construction time; never changes afterwards. */
    private final String fieldName;

    /** Types given at construction time, in the order they were supplied. */
    protected ArrayList<String> types;

    /**
     * Creates an annotator for the given field and types.
     *
     * @param fieldName the name of the field
     * @param types the types to register; {@code null} is treated as no types
     * @throws IOException never thrown here; kept in the signature so that
     *         existing subclasses and callers keep compiling
     */
    public AbstractAnnotator(String fieldName, String[] types) throws IOException {
        this.fieldName = fieldName;
        this.types = new ArrayList<String>();
        // A missing array means "no types" instead of a NullPointerException.
        if (types != null) {
            for (String type : types) {
                this.types.add(type);
            }
        }
    }

    /**
     * @return the live list of types held by this annotator (not a copy)
     */
    public ArrayList<String> getTypes() {
        return types;
    }

    /**
     * @return the fieldName
     */
    public String getFieldName() {
        return fieldName;
    }
}
43 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/TermRankedResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | public class TermRankedResult extends AbstractResult {
19 |
20 | String term;
21 | double rank;
22 | /**
23 | * @param term the term to set
24 | */
25 | public void setTerm(String term) {
26 | this.term = term;
27 | }
28 | /**
29 | * @return the term
30 | */
31 | public String getTerm() {
32 | return term;
33 | }
34 | /**
35 | * @param rank the rank to set
36 | */
37 | public void setRank(double rank) {
38 | this.rank = rank;
39 | }
40 | /**
41 | * @return the rank
42 | */
43 | public double getRank() {
44 | return rank;
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/NoDocumentsInCorpusException.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace;
18 |
/**
 * Exception raised when no documents are found to build a corpus.
 *
 * @author Jorge Villalon
 */
public class NoDocumentsInCorpusException extends Exception {

    private static final long serialVersionUID = 5607315201790740186L;

    /** Detail message shared by every instance of this exception. */
    private static final String MESSAGE = "No documents to build the corpus";

    /**
     * Creates the exception with the standard message and no cause.
     */
    public NoDocumentsInCorpusException() {
        super(MESSAGE);
    }

    /**
     * Creates the exception with the standard message and the given cause.
     *
     * @param e the exception that led to this one
     */
    public NoDocumentsInCorpusException(Exception e) {
        super(MESSAGE, e);
    }
}
43 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/importers/Importer.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.storage.importers;
18 |
/**
 * Contract shared by all importers. An importer declares which files it can
 * handle, identified by their extension, and knows how to turn the raw
 * content of such a file into plain text.
 *
 * @author Jorge Villalon
 */
public interface Importer {

    /**
     * Converts raw content into its plain text version.
     *
     * @param content the text to clean
     * @return the plain text version of the content
     */
    String getCleanContent(String content);

    /**
     * Tells whether files with the given extension are supported.
     *
     * @param fileExtension the file extension to check
     * @return true if the importer can manage the extension
     */
    boolean isValidFileExtension(String fileExtension);
}
40 |
--------------------------------------------------------------------------------
/tml/tml/tml.properties:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ###############################################################################
16 |
17 | # TML specifics
18 |
19 | # Default value for tml folder
20 | tml.folder=.
21 |
22 | # Log file for tml
23 | log4j.appender.ROLLING.File=./log/tml.log
24 |
25 | # Annotators that will be active by default
26 | # tml.annotators=PennTreeAnnotator
27 | tml.annotators=
28 |
29 | # MySQL configuration for metadata
30 | tml.database.driver=com.mysql.jdbc.Driver
31 | tml.database.url.protocol=jdbc:mysql:
32 | tml.database.url.db=//localhost/tml_metadata
33 |
34 | tml.database.username=tmluser
35 | tml.database.password=password
36 |
37 | # Indexer process
38 | tml.indexer.interval=8
39 | tml.indexer.run=false
40 |
41 | # Annotator process
42 | tml.annotator.interval=10
43 | tml.annotator.run=false
44 |
45 | # Cleanup process
46 | tml.cleanup.interval=600
47 | tml.cleanup.run=false
48 |
49 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/utils/Highlighting.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.utils;
17 |
18 | import java.util.regex.Pattern;
19 |
20 | public class Highlighting {
21 |
22 | public static String highlightText(String text, String[] tokens, String[] cssClass) {
23 | String output = text.toLowerCase();
24 |
25 | for(int i=0; i" + token + "");
30 | }
31 |
32 | return output;
33 | }
34 |
35 | public static String htmlFormat(String txt) {
36 | txt = txt.replaceAll(" ", " ");
37 | txt = txt.replaceAll("\r", "");
38 | txt = txt.replaceAll("\n", "
");
39 | return "
" + txt + "
";
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/ParagraphCoherenceIndexResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | public class ParagraphCoherenceIndexResult extends AbstractResult {
19 |
20 | String paragraphId;
21 | double index;
22 | /**
23 | * @return the paragraphId
24 | */
25 | public String getParagraphId() {
26 | return paragraphId;
27 | }
28 | /**
29 | * @param paragraphId the paragraphId to set
30 | */
31 | public void setParagraphId(String paragraphId) {
32 | this.paragraphId = paragraphId;
33 | }
34 | /**
35 | * @return the index
36 | */
37 | public double getIndex() {
38 | return index;
39 | }
40 | /**
41 | * @param index the index to set
42 | */
43 | public void setIndex(double index) {
44 | this.index = index;
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/tml.properties:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ###############################################################################
16 |
17 | # TML specifics
18 |
19 | # Default value for tml folder
20 | tml.folder=/tml
21 |
22 | # Log file for tml
23 | log4j.appender.ROLLING.File=/tml/log/tml.log
24 |
25 | # Annotators that will be active by default
26 | # tml.annotators=PennTreeAnnotator
27 | tml.annotators=
28 |
29 | # MySQL configuration for metadata
30 | tml.database.driver=com.mysql.jdbc.Driver
31 | tml.database.url.protocol=jdbc:mysql:
32 | tml.database.url.db=//localhost/tml
33 |
34 | tml.database.username=tml
35 | tml.database.password=itsyourfault
36 |
37 | # Indexer process
38 | tml.indexer.interval=8
39 | tml.indexer.run=false
40 |
41 | # Annotator process
42 | tml.annotator.interval=10
43 | tml.annotator.run=false
44 |
45 | # Cleanup process
46 | tml.cleanup.interval=600
47 | tml.cleanup.run=false
48 |
49 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 | Implements a Vector Space Model, that can be later transformed using Latent Semantic Analysis.
27 |
28 |
Package Specification
29 |
30 |
This package implements the transformation of a Corpus into a VSM; it also makes it possible to use LSA to obtain a Semantic Space.
31 |
The package is closely integrated with Weka, providing Data Mining functionalities in case a developer wants operations that are not implemented in TML.
32 |
Patterns that can be obtained from a VSM or semantic space are implemented via operations, that can be found in the operations subpackage.
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/RepositoryEvent.java:
--------------------------------------------------------------------------------
1 | /*
2 | * To change this template, choose Tools | Templates
3 | * and open the template in the editor.
4 | */
5 |
6 | package tml.storage;
7 |
8 | import java.util.EventObject;
9 |
/**
 * Event fired by a Repository. It reports the current step of the running
 * process, the maximum number of steps, and a descriptive name of the
 * operation being executed.
 *
 * @author Jorge Villalon
 */
public class RepositoryEvent extends EventObject {

    private static final long serialVersionUID = 4688981006009818932L;

    /** Descriptive name of the operation being executed. */
    private String action;

    /** Current step of the running process. */
    private int current;

    /** Maximum number of steps of the running process. */
    private int maximum;

    /**
     * Creates a fully populated event.
     *
     * @param source the object that fired the event
     * @param action descriptive name of the operation
     * @param curr the current step
     * @param max the maximum number of steps
     */
    public RepositoryEvent(Object source, String action, int curr, int max) {
        super(source);
        this.action = action;
        this.current = curr;
        this.maximum = max;
    }

    /**
     * @return the descriptive name of the operation
     */
    public String getAction() {
        return this.action;
    }

    /**
     * @param action the descriptive name of the operation
     */
    public void setAction(String action) {
        this.action = action;
    }

    /**
     * @return the current step
     */
    public int getCurrent() {
        return this.current;
    }

    /**
     * @param current the current step
     */
    public void setCurrent(int current) {
        this.current = current;
    }

    /**
     * @return the maximum number of steps
     */
    public int getMaximum() {
        return this.maximum;
    }

    /**
     * @param maximum the maximum number of steps
     */
    public void setMaximum(int maximum) {
        this.maximum = maximum;
    }

    /**
     * @return the action followed by the progress, e.g. {@code Action:name 3 of 10}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Action:");
        text.append(this.action)
            .append(" ")
            .append(this.current)
            .append(" of ")
            .append(this.maximum);
        return text.toString();
    }
}
65 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/overview.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 |
TML (Text Mining Library) is a general Text Mining library with a focus on Latent Semantic Analysis (LSA).
27 | It allows you to create semantic spaces (see Deerwester, 1998) from a corpus of documents with detailed parameters.
28 | These spaces can then be used as background knowledge to calculate distances between documents (or passages) and terms of a different corpus.
29 | Typical operations are the similarity between each document in a corpus, or the distances between consecutive sentences in a document.
30 |
Please visit the website at http://kiama.ee.usyd.edu.au/tml/ to download TML and to find a quick start guide and tutorials.
31 |
32 |
33 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/factorisation/MatrixFactorisation.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.factorisation;
17 |
18 | import Jama.Matrix;
19 |
20 | public abstract class MatrixFactorisation {
21 |
22 | /** Terms matrix in the semantic space */
23 | protected Matrix Uk = null;
24 | /** Singular values in the semantic space */
25 | protected Matrix Sk = null;
26 | /** Documents matrix in the semantic space */
27 | protected Matrix Vk = null;
28 | /** The number of dimensions that were kept */
29 | protected int dimensionsKept = -1;
30 |
31 | protected SpaceDecomposition decomposition;
32 | protected int K;
33 |
34 | public int getK() {
35 | return K;
36 | }
37 |
38 | public void setK(int K) {
39 | this.K = K;
40 | }
41 |
42 | public abstract void process(Matrix v);
43 |
44 | public SpaceDecomposition getDecomposition() {
45 | return this.decomposition;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/StemmingTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | /**
17 | *
18 | */
19 | package tml.test;
20 |
21 | import org.junit.Test;
22 |
23 | import tml.utils.LuceneUtils;
24 |
25 | import static org.junit.Assert.*;
26 |
27 |
28 |
29 | /**
30 | * This class test that the stemming algorithm is working appropriately.
31 | *
32 | * @author Jorge Villalon
33 | *
34 | */
35 | public class StemmingTest {
36 |
37 | @Test
38 | public void testStemming() {
39 | String[] words = {"increase","increasing","increased","increases","dog","dogs"};
40 | String[] stemmedWords = {"increas","increas","increas","increas","dog","dog"};
41 |
42 | for(int i=0; i list, String word) {
43 | for(String w : list) {
44 | if(w.equals(word) || stringContained(w, word) || stringContained(word, w))
45 | return true;
46 | }
47 | return false;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/DocumentAnnotator.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package tml.storage;
5 |
6 | import java.io.IOException;
7 |
8 | import org.apache.log4j.Logger;
9 |
10 | import tml.annotators.Annotator;
11 |
12 | /**
13 | * @author jorge
14 | *
15 | */
16 | public class DocumentAnnotator implements Runnable {
17 |
18 | private static Logger logger = Logger.getLogger(DocumentAnnotator.class);
19 | private Repository repository;
20 |
21 | public DocumentAnnotator(Repository repo) {
22 | this.repository = repo;
23 | }
24 |
25 | /* (non-Javadoc)
26 | * @see java.lang.Runnable#run()
27 | */
28 | @Override
29 | public void run() {
30 |
31 | int total = 0;
32 | String[][] docs = this.repository.getDbConnection().getUnannotatedDocument();
33 |
34 | if(docs == null) {
35 | logger.debug("No documents to annotate");
36 | return;
37 | }
38 |
39 | for(String[] doc : docs) {
40 | String externalid = doc[0];
41 | String type = doc[1];
42 | String content = null;
43 | try {
44 | content = this.repository.getDocumentField(externalid, this.repository.getLuceneContentField());
45 | } catch (IOException e) {
46 | e.printStackTrace();
47 | logger.error("No content found in Lucene index for document " + externalid);
48 | return;
49 | }
50 | for (Annotator annotator : this.repository.getAnnotators()) {
51 | String metadata = null;
52 | if (annotator.getTypes().contains(type)) {
53 | metadata = annotator.getAnnotations(content);
54 | } else {
55 | metadata = "Not available";
56 | }
57 | this.repository.getDbConnection().setAnnotation(externalid, annotator.getFieldName(), metadata);
58 | }
59 | total++;
60 |
61 | }
62 | if(total > 0)
63 | logger.info("Annotated " + total + " documents");
64 | else
65 | logger.debug("Nothing to annotate");
66 | }
67 |
68 | }
69 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/summarization/VectorLengthSummarization.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.summarization;
17 |
18 | import Jama.Matrix;
19 |
20 | public class VectorLengthSummarization extends AbstractSummarizationOperation {
21 |
22 | public VectorLengthSummarization() {
23 | this.name = "VectLength";
24 | }
25 |
26 | @Override
27 | protected double calculatePassageLoading(int doc) {
28 | Matrix termDoc = this.corpus.getTermDocMatrix();
29 | double total = 0;
30 | for(int term = 0; term < termDoc.getRowDimension(); term++) {
31 | total += Math.pow(termDoc.get(term, doc),2);
32 | }
33 | return Math.sqrt(total);
34 | }
35 |
36 | @Override
37 | protected double calculateTermLoading(int term) {
38 | Matrix termDoc = this.corpus.getTermDocMatrix();
39 | double total = 0;
40 | for(int doc = 0; doc < termDoc.getRowDimension(); doc++) {
41 | total += Math.pow(termDoc.get(term, doc),2);
42 | }
43 | return Math.sqrt(total);
44 | }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/RapidAutomaticKeywordExtractionResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2010 Stephen O'Rourke (stephen.orourke@sydney.edu.au)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | /**
19 | * This class represents the result of a {@link RapidAutomaticKeywordExtraction}
20 | * operation.
21 | *
22 | * @author Stephen O'Rourke
23 | *
24 | */
25 | public class RapidAutomaticKeywordExtractionResult extends AbstractResult implements Comparable {
26 | private String keyword;
27 | private Double weighting;
28 |
29 | public String getKeyword() {
30 | return keyword;
31 | }
32 |
33 | public Double getWeighting() {
34 | return weighting;
35 | }
36 |
37 | public void setKeyword(String keyword) {
38 | this.keyword = keyword;
39 | }
40 |
41 | public void setWeighting(Double weighting) {
42 | this.weighting = weighting;
43 | }
44 |
45 | @Override
46 | public int compareTo(RapidAutomaticKeywordExtractionResult result) {
47 | return this.weighting.compareTo(result.weighting);
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/TagCloudsResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * @author Jorge Villalon
21 | *
22 | */
23 | public class TagCloudsResult extends AbstractResult {
24 |
25 | String term;
26 | double weight;
27 |
28 | /**
29 | * @param term
30 | * @param weight
31 | */
32 | public TagCloudsResult(String term, double weight) {
33 | super();
34 | this.term = term;
35 | this.weight = weight;
36 | }
37 |
38 | /**
39 | * @param weight the weight to set
40 | */
41 | public void setWeight(double weight) {
42 | this.weight = weight;
43 | }
44 |
45 | /**
46 | * @return the weight
47 | */
48 | public double getWeight() {
49 | return weight;
50 | }
51 |
52 | /**
53 | * @param term the term to set
54 | */
55 | public void setTerm(String term) {
56 | this.term = term;
57 | }
58 |
59 | /**
60 | * @return the term
61 | */
62 | public String getTerm() {
63 | return term;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/visualizations/TagClouds.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package tml.vectorspace.operations.visualizations;
5 |
6 | import java.util.ArrayList;
7 | import java.util.Collections;
8 | import java.util.Comparator;
9 | import java.util.List;
10 |
11 | import tml.vectorspace.operations.results.TagCloudsResult;
12 |
13 | /**
14 | * @author Jorge
15 | *
16 | */
17 | public class TagClouds extends AbstractVisualization {
18 |
19 | private int maxSizePixels = 24;
20 | private int maxResults = 50;
21 |
22 | public int getMaxSizePixels() {
23 | return maxSizePixels;
24 | }
25 |
26 | public void setMaxSizePixels(int maxSizePixels) {
27 | this.maxSizePixels = maxSizePixels;
28 | }
29 |
30 | @SuppressWarnings("unchecked")
31 | @Override
32 | public String getHTML() {
33 | List newResults = new ArrayList();
34 | int i=0;
35 | for(TagCloudsResult result : (List) operation.getResults()) {
36 | newResults.add(result);
37 | i++;
38 | if(i>maxResults)
39 | break;
40 | }
41 | Collections.sort(newResults,new Comparator() {
42 | @Override
43 | public int compare(TagCloudsResult o1, TagCloudsResult o2) {
44 | return o1.getTerm().compareTo(o2.getTerm());
45 | }
46 | });
47 | StringBuffer buffer = new StringBuffer();
48 | buffer.append("
");
55 | return buffer.toString();
56 | }
57 |
58 | private int calculateSize(double weight) {
59 | double size = (double) maxSizePixels;
60 | size = size * weight;
61 | return (int) size;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/SummaryResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * @author Jorge Villalon
21 | *
22 | */
23 | public class SummaryResult extends AbstractResult {
24 |
25 | String item;
26 | String value;
27 | String comment;
28 |
29 | /**
30 | * @return the comment
31 | */
32 | public String getComment() {
33 | return comment;
34 | }
35 | /**
36 | * @return the item
37 | */
38 | public String getItem() {
39 | return item;
40 | }
41 | /**
42 | * @return the value
43 | */
44 | public String getValue() {
45 | return value;
46 | }
47 | /**
48 | * @param comment the comment to set
49 | */
50 | public void setComment(String comment) {
51 | this.comment = comment;
52 | }
53 | /**
54 | * @param item the item to set
55 | */
56 | public void setItem(String item) {
57 | this.item = item;
58 | }
59 | /**
60 | * @param value the value to set
61 | */
62 | public void setValue(String value) {
63 | this.value = value;
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/utils/JDBCUtils.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.utils;
17 |
18 | import java.sql.Connection;
19 | import java.sql.DriverManager;
20 | import java.sql.ResultSet;
21 | import java.sql.SQLException;
22 | import java.sql.Statement;
23 |
/**
 * Thin helper around a single JDBC {@link Connection} and one reusable
 * {@link Statement}.
 *
 * <p>Instances hold database resources; call {@link #close()} when done.</p>
 */
public class JDBCUtils {

	private final Connection connection;
	private final Statement statement;

	/**
	 * Loads the driver, opens a connection and creates the shared statement.
	 *
	 * @param driver fully qualified class name of the JDBC driver
	 * @param url JDBC connection URL
	 * @param username database user
	 * @param password database password
	 * @throws Exception if the driver class cannot be loaded
	 *         ({@link ClassNotFoundException}) or the connection or statement
	 *         cannot be created ({@link SQLException})
	 */
	public JDBCUtils(String driver, String url, String username, String password) throws Exception
	{
		Class.forName(driver);
		Connection conn = DriverManager.getConnection(url, username, password);
		Statement stmt;
		try {
			stmt = conn.createStatement();
		} catch (SQLException e) {
			// Do not leak the connection when the statement cannot be created.
			try {
				conn.close();
			} catch (SQLException ignored) {
				// The original failure is the one worth reporting.
			}
			throw e;
		}
		this.connection = conn;
		this.statement = stmt;
	}

	/**
	 * Executes a query on the shared statement.
	 *
	 * <p>The SQL text is executed as given, so callers must not build it from
	 * untrusted input. Because one statement is shared, running another query
	 * or update closes the result set returned by an earlier call.</p>
	 *
	 * @param sql the query to execute
	 * @return the result set, or {@code null} if the query failed
	 */
	public ResultSet sendQuery(String sql) {
		try {
			return statement.executeQuery(sql);
		} catch (SQLException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Executes an update on the shared statement.
	 *
	 * <p>The SQL text is executed as given, so callers must not build it from
	 * untrusted input.</p>
	 *
	 * @param sql the statement to execute
	 * @return the update count, or {@code -1} if the update failed
	 */
	public int sendUpdate(String sql) {
		try {
			return statement.executeUpdate(sql);
		} catch (SQLException e) {
			e.printStackTrace();
			return -1;
		}
	}

	/**
	 * Releases the statement and the connection. The connection is closed even
	 * if closing the statement fails.
	 *
	 * @throws SQLException if closing either resource fails
	 */
	public void close() throws SQLException {
		try {
			statement.close();
		} finally {
			connection.close();
		}
	}

}
61 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/PassageSimilarityResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | public class PassageSimilarityResult extends AbstractResult {
19 |
20 | String documentA;
21 | String documentB;
22 | double similarity;
23 |
24 | /**
25 | * @return the documentA
26 | */
27 | public String getDocumentA() {
28 | return documentA;
29 | }
30 | /**
31 | * @return the documentB
32 | */
33 | public String getDocumentB() {
34 | return documentB;
35 | }
36 | /**
37 | * @return the similarity
38 | */
39 | public double getSimilarity() {
40 | return similarity;
41 | }
42 | /**
43 | * @param documentA the documentA to set
44 | */
45 | public void setDocumentA(String documentA) {
46 | this.documentA = documentA;
47 | }
48 | /**
49 | * @param documentB the documentB to set
50 | */
51 | public void setDocumentB(String documentB) {
52 | this.documentB = documentB;
53 | }
54 | /**
55 | * @param similarity the similarity to set
56 | */
57 | public void setSimilarity(double similarity) {
58 | this.similarity = similarity;
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/corpus/SentenceCorpus.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.corpus;
17 |
18 | import java.io.IOException;
19 |
20 | import tml.storage.Repository;
21 | import tml.vectorspace.NoDocumentsInCorpusException;
22 | import tml.vectorspace.NotEnoughTermsInCorpusException;
23 | import tml.vectorspace.TermWeightingException;
24 |
25 |
26 | /**
27 | * Class representing a corpus formed with the sentences of a document
28 | * @author Jorge Villalon
29 | *
30 | */
31 | public class SentenceCorpus extends Corpus {
32 |
33 | /**
34 | * @param document the document to which the sentences belong
35 | * @throws Exception if the document is null
36 | */
37 | public SentenceCorpus(TextDocument document) throws Exception {
38 | if(document == null)
39 | throw new Exception("A sentence corpus must belong to a document");
40 |
41 | this.luceneQuery = "type:sentence AND reference:p*d" + document.getExternalId();
42 | }
43 |
44 | @Override
45 | public void load(Repository storage)
46 | throws NotEnoughTermsInCorpusException, IOException,
47 | NoDocumentsInCorpusException, TermWeightingException {
48 | super.load(storage);
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/tml/src/main/java/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 | TML (Text Mining Library) is a general-purpose text mining library whose purpose is to support the development of educational applications by providing text mining (TM) functionality. For a quick start, please begin with Repository.
27 |
28 |
TML design principles
29 |
30 |
The Storage and Corpus packages implement the storing of documents in a repository, and searching the repository to form a corpus. The Vectorspace package implements the transformation of a corpus into a VSM representation, and the use of this model with data mining algorithms. The Utils package contains routines that are required for specific processes but are not directly related to the TM process (e.g. grammar parsing and matrix operations). Finally, a Configuration class allows TML to process Java properties files to read default parameters for its operation.
31 |
Please read each package documentation for examples.
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/utils/LuceneUtils.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.utils;
17 |
18 | import java.io.IOException;
19 | import java.io.StringReader;
20 |
21 | import org.apache.log4j.Logger;
22 | import org.apache.lucene.analysis.Token;
23 | import org.apache.lucene.analysis.TokenStream;
24 | import org.apache.lucene.analysis.snowball.SnowballFilter;
25 | import org.apache.lucene.analysis.standard.StandardTokenizer;
26 | import org.apache.lucene.util.Version;
27 |
28 | public class LuceneUtils {
29 |
30 | private static Logger logger = Logger.getLogger(LuceneUtils.class);
31 |
32 | @SuppressWarnings("deprecation")
33 | public static String stemWords(String words) {
34 | TokenStream stream = new StandardTokenizer(Version.LUCENE_29, new StringReader(words));
35 | SnowballFilter filter = new SnowballFilter(stream, "English");
36 | Token token = new Token();
37 | StringBuffer stemmed = new StringBuffer();
38 | try {
39 | while((token = filter.next(token)) != null) {
40 | stemmed.append(token.term());
41 | stemmed.append(" ");
42 | }
43 | } catch (IOException e) {
44 | e.printStackTrace();
45 | logger.error(e);
46 | }
47 | return stemmed.toString().trim();
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/importers/HtmlImporter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.storage.importers;
18 |
19 | import org.apache.log4j.Logger;
20 | import org.htmlparser.Parser;
21 | import org.htmlparser.beans.StringBean;
22 | import org.htmlparser.util.ParserException;
23 |
24 | /**
25 | * This importer uses org.htmlpraser to obtain plain text from an HTML file.
26 | *
27 | * @author Jorge Villalon
28 | *
29 | */
30 | public class HtmlImporter extends AbstractImporter implements Importer {
31 |
32 | private static Logger logger = Logger.getLogger(HtmlImporter.class);
33 |
34 | @Override
35 | public String getCleanContent(String content) {
36 |
37 | String clean = null;
38 | try {
39 | Parser parser = new Parser();
40 | parser.setInputHTML(content);
41 | StringBean bean = new StringBean();
42 | parser.visitAllNodesWith(bean);
43 | clean = bean.getStrings();
44 | } catch (ParserException e) {
45 | logger.error(e);
46 | }
47 | return clean;
48 | }
49 |
50 | @Override
51 | protected String[] getFileExtensions() {
52 | String[] extensions = new String[3];
53 | extensions[0] = "xhtml";
54 | extensions[1] = "html";
55 | extensions[2] = "htm";
56 | return extensions;
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/NonNegativeMatrixFactorizationTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.test;
17 |
18 | import org.junit.BeforeClass;
19 | import org.junit.Test;
20 |
21 | import tml.Configuration;
22 | import tml.corpus.TextDocument;
23 | import tml.vectorspace.factorisation.NonnegativeMatrixFactorisationED;
24 |
25 |
26 | import Jama.Matrix;
27 |
28 | public class NonNegativeMatrixFactorizationTest extends AbstractTmlIndexingTest {
29 |
30 | private static TextDocument document;
31 |
32 | @BeforeClass
33 | public static void setUpBeforeClass() throws Exception {
34 | AbstractTmlIndexingTest.setUpBeforeClass();
35 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/uppsala");
36 |
37 | document = repository.getTextDocument("0100.a1");
38 | document.load(repository);
39 | }
40 |
41 | @Test
42 | public void testMatrices() {
43 | Matrix m = document.getSentenceCorpus().getTermDocMatrix();
44 | m.print(10, 5);
45 |
46 | NonnegativeMatrixFactorisationED f = new NonnegativeMatrixFactorisationED();
47 | f.setK(5);
48 | f.process(m);
49 |
50 | new Matrix(f.getDecomposition().getUkdata()).print(10, 5);
51 | new Matrix(f.getDecomposition().getSkdata()).print(10, 5);
52 | new Matrix(f.getDecomposition().getVkdata()).print(10, 5);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/test/AbstractTmlIndexingTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | /**
17 | *
18 | */
19 | package tml.test;
20 |
21 |
22 | import java.io.File;
23 | import java.util.Properties;
24 |
25 | import org.apache.log4j.Logger;
26 | import org.junit.BeforeClass;
27 |
28 | import tml.Configuration;
29 | import tml.storage.Repository;
30 |
31 |
32 | /**
33 | * This class implements a base class for all tests that require indexing all the documents
34 | * within a specific folder.
35 | *
36 | * @author Jorge Villalon
37 | *
38 | */
39 | public abstract class AbstractTmlIndexingTest {
40 |
41 | protected static Logger logger = Logger.getLogger(AbstractTmlIndexingTest.class);
42 |
43 | protected static Repository repository;
44 | protected static String repositoryFolder = null;
45 | protected static String documentsFolder;
46 | protected static File[] filesToAdd = null;
47 | protected static Properties prop;
48 |
49 | /**
50 | * @throws java.lang.Exception
51 | */
52 | @BeforeClass
53 | public static void setUpBeforeClass() throws Exception {
54 | prop = Configuration.getTmlProperties(true);
55 | Repository.cleanStorage(Configuration.getTmlFolder() + "/test/lucene");
56 | repository = new Repository(Configuration.getTmlFolder() + "/test/lucene");
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/LastPassage.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.io.IOException;
20 | import java.util.ArrayList;
21 |
22 | import tml.vectorspace.operations.results.LastPassageResult;
23 |
24 |
25 | /**
26 | * Extracts the last passage of the corpus, given the linearity expected in the
27 | * index.
28 | *
29 | * @author Jorge Villalon
30 | *
31 | */
32 | public class LastPassage extends AbstractOperation {
33 |
34 | /**
35 | *
36 | */
37 | public LastPassage() {
38 | this.name = "Last passage";
39 | this.requiresSemanticSpace = false;
40 | }
41 |
42 | @Override
43 | public void start() throws Exception {
44 | super.start();
45 | this.results = new ArrayList();
46 | try {
47 | String externalId = this.corpus.getPassages()[this.corpus.getPassages().length-1];
48 | String content = this.repository.getDocumentField(externalId, this.repository.getLuceneContentField());
49 | LastPassageResult result = new LastPassageResult();
50 | result.setPassage(content);
51 | this.results.add(result);
52 | } catch (IOException e) {
53 | e.printStackTrace();
54 | logger.error(e);
55 | }
56 | super.end();
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/PassageDistancesResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * Results that represent the distances between two passages.
21 | *
22 | * @author Jorge Villalon
23 | *
24 | */
25 | public class PassageDistancesResult extends AbstractResult {
26 | int documentAId;
27 | int documentBId;
28 | double distance;
29 | /**
30 | * @return the documentAId
31 | */
32 | public int getDocumentAId() {
33 | return documentAId;
34 | }
35 | /**
36 | * @param documentAId the documentAId to set
37 | */
38 | public void setDocumentAId(int documentAId) {
39 | this.documentAId = documentAId;
40 | }
41 | /**
42 | * @return the documentBId
43 | */
44 | public int getDocumentBId() {
45 | return documentBId;
46 | }
47 | /**
48 | * @param documentBId the documentBId to set
49 | */
50 | public void setDocumentBId(int documentBId) {
51 | this.documentBId = documentBId;
52 | }
53 | /**
54 | * @return the distance
55 | */
56 | public double getDistance() {
57 | return distance;
58 | }
59 | /**
60 | * @param distance the distance to set
61 | */
62 | public void setDistance(double distance) {
63 | this.distance = distance;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/tml/src/lanczos/unix/makefile:
--------------------------------------------------------------------------------
1 | #######################################################################
2 | # SVDPACKC (Ver 1.0) Makefile #
3 | #######################################################################
4 | # #
5 | # las1: Single-Vector Lanczos SVD via 2-Cyclic Eigensystems #
6 | # las2: Single-Vector Lanczos SVD via A'A Eigensystems #
7 | # bls1: Block Lanczos SVD via 2-Cyclic Eigensystems #
8 | # bls2: Block Lanczos SVD via A'A Eigensystems #
9 | # sis1: Subspace Iteration SVD via 2_Cyclic Eigensystems #
10 | # sis2: Subspace Iteration SVD via A'A Eigensystems #
11 | # tms1: Trace Minimization SVD via 2_Cyclic Eigensystems #
12 | # tms2: Trace Minimization SVD via A'A Eigensystems #
13 | # #
14 | #######################################################################
15 |
16 | CC = gcc
17 | CFLAGS= -O -c
18 | LIB= -lm
19 | TIMER= timersun.o
20 |
21 | all: las1 las2 bls1 bls2 sis1 sis2 tms1 tms2
22 |
23 | las1.o: las1.h
24 | las2.o: las2.h
25 |
26 | bls1.o: bls1.h
27 | bls2.o: bls2.h
28 |
29 | sis1.o: sisg.h sisc.h
30 | sis2.o: sisg.h sisc.h
31 |
32 | tms1.o: tmsg.h tmsc.h
33 | tms2.o: tmsg.h tmsc.h
34 |
35 | .c.o: $*.c
36 | ${CC} ${CFLAGS} $*.c
37 |
38 | las1: las1.o ${TIMER}
39 | ${CC} -o $@ las1.o ${TIMER} ${LIB}
40 |
41 | las2: las2.o ${TIMER}
42 | ${CC} -o $@ las2.o ${TIMER} ${LIB}
43 |
44 | bls1: bls1.o ${TIMER}
45 | ${CC} -o $@ bls1.o ${TIMER} ${LIB}
46 |
47 | bls2: bls2.o ${TIMER}
48 | ${CC} -o $@ bls2.o ${TIMER} ${LIB}
49 |
50 | sis1: sis1.o ${TIMER}
51 | ${CC} -o $@ sis1.o ${TIMER} ${LIB}
52 |
53 | sis2: sis2.o ${TIMER}
54 | ${CC} -o $@ sis2.o ${TIMER} ${LIB}
55 |
56 | tms1: tms1.o ${TIMER}
57 | ${CC} -o $@ tms1.o ${TIMER} ${LIB}
58 |
59 | tms2: tms2.o ${TIMER}
60 | ${CC} -o $@ tms2.o ${TIMER} ${LIB}
61 |
62 | # Remove every object file and executable. The command must live in the
63 | # recipe (not on the target line, where it is parsed as prerequisites), and
64 | # -f keeps rm from failing on files that were never built.
65 | .PHONY: clean
66 | clean:
67 | 	rm -f \
68 | 	las1.o las2.o bls1.o bls2.o sis1.o sis2.o tms1.o tms2.o \
69 | 	timersun.o timermac.o las1 las2 bls1 bls2 sis1 sis2 tms1 tms2
70 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/LexiconAnalysisResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * The result of a {@link tml.vectorspace.operations.LexiconAnalysis} operation.
21 | * @author Jorge Villalon
22 | *
23 | */
24 | public class LexiconAnalysisResult extends AbstractResult {
25 | String document;
26 | int terms;
27 | int newTerms;
28 |
29 | /**
30 | * @return the document
31 | */
32 | public String getDocument() {
33 | return document;
34 | }
35 |
36 | /**
37 | * @param document the document to set
38 | */
39 | public void setDocument(String document) {
40 | this.document = document;
41 | }
42 |
43 | /**
44 | * @return the terms
45 | */
46 | public int getTerms() {
47 | return terms;
48 | }
49 |
50 | /**
51 | * @param terms the terms to set
52 | */
53 | public void setTerms(int terms) {
54 | this.terms = terms;
55 | }
56 |
57 | /**
58 | * @return the newTerms
59 | */
60 | public int getNewTerms() {
61 | return newTerms;
62 | }
63 |
64 | /**
65 | * @param newTerms the newTerms to set
66 | */
67 | public void setNewTerms(int newTerms) {
68 | this.newTerms = newTerms;
69 | }
70 |
71 | @Override
72 | public String toString() {
73 | return "Document: " + this.getDocument() + " Terms: " + this.terms
74 | + " Accumulated: " + this.newTerms;
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/LanczosTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.test;
17 |
18 | import org.junit.BeforeClass;
19 | import org.junit.Test;
20 |
21 | import tml.Configuration;
22 | import tml.corpus.SearchResultsCorpus;
23 | import tml.corpus.CorpusParameters.DimensionalityReduction;
24 | import tml.corpus.CorpusParameters.TermSelection;
25 | import tml.vectorspace.TermWeighting.GlobalWeight;
26 | import tml.vectorspace.TermWeighting.LocalWeight;
27 |
28 |
29 |
30 | public class LanczosTest extends AbstractTmlIndexingTest {
31 |
32 | @BeforeClass
33 | public static void setUpBeforeClass() throws Exception {
34 | AbstractTmlIndexingTest.setUpBeforeClass();
35 | repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/introLSA");
36 | }
37 |
38 | @Test
39 | public void timeBigCorpus() throws Exception {
40 | SearchResultsCorpus corpus = new SearchResultsCorpus("type:document");
41 | corpus.getParameters().setTermSelectionCriterion(TermSelection.DF);
42 | corpus.getParameters().setTermSelectionThreshold(2);
43 | corpus.getParameters().setDimensionalityReduction(DimensionalityReduction.NUM);
44 | corpus.getParameters().setDimensionalityReductionThreshold(2);
45 | corpus.getParameters().setTermWeightGlobal(GlobalWeight.None);
46 | corpus.getParameters().setTermWeightLocal(LocalWeight.TF);
47 | corpus.load(repository);
48 | corpus.getParameters().setLanczosSVD(true);
49 | corpus.getSemanticSpace().calculate();
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/Summary.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | public class Summary {
19 |
20 | int[] passagesRank;
21 | int[] termsRank;
22 | double[] passagesLoads;
23 | double[] termsLoads;
24 |
25 | /**
26 | * @return the passagesRank
27 | */
28 | public int[] getPassagesRank() {
29 | return passagesRank;
30 | }
31 | /**
32 | * @param passagesRank the passagesRank to set
33 | */
34 | public void setPassagesRank(int[] passagesRank) {
35 | this.passagesRank = passagesRank;
36 | }
37 | /**
38 | * @return the termsRank
39 | */
40 | public int[] getTermsRank() {
41 | return termsRank;
42 | }
43 | /**
44 | * @param termsRank the termsRank to set
45 | */
46 | public void setTermsRank(int[] termsRank) {
47 | this.termsRank = termsRank;
48 | }
49 | /**
50 | * @return the passagesLoads
51 | */
52 | public double[] getPassagesLoads() {
53 | return passagesLoads;
54 | }
55 | /**
56 | * @param passagesLoads the passagesLoads to set
57 | */
58 | public void setPassagesLoads(double[] passagesLoads) {
59 | this.passagesLoads = passagesLoads;
60 | }
61 | /**
62 | * @return the termsLoads
63 | */
64 | public double[] getTermsLoads() {
65 | return termsLoads;
66 | }
67 | /**
68 | * @param termsLoads the termsLoads to set
69 | */
70 | public void setTermsLoads(double[] termsLoads) {
71 | this.termsLoads = termsLoads;
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/tml.conceptmap.rules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | conj_and
6 |
7 |
9 | prep_of
10 |
11 |
12 | amod
13 | nn
14 | number
15 | num
16 |
18 |
19 |
20 | neg
21 |
22 |
23 | advmod
24 | aux
25 | auxpass
26 |
27 |
28 | det
29 |
30 |
55 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/ConceptExtraction.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import tml.vectorspace.operations.results.TermRankedResult;
23 | import tml.vectorspace.operations.results.TermsExtractionSummarizationResult;
24 |
25 |
26 | /**
27 |  * Concept Extraction operation based on CMM.
28 |  * <p>
29 |  * Runs the parent term extraction, then replaces its results with one result
30 |  * per non-blank term produced by a {@link CompoundNounsSummarized} operation
31 |  * on the same corpus.
32 |  *
33 |  * @author Jorge Villalon
34 |  *
35 |  */
36 | public class ConceptExtraction extends TermExtractionSummarization {
37 |
38 |     /** Results built by the most recent call to {@link #start()}. */
39 |     List newResults =
40 |         new ArrayList();
41 |
42 |     /** Creates the operation and sets its name. */
43 |     public ConceptExtraction() {
44 |         this.name = "Concept extraction";
45 |     }
46 |
47 |     /**
48 |      * Runs the parent extraction limited to 35 results, logs how many results
49 |      * it produced, and then replaces them with the compound nouns found in
50 |      * the corpus. Terms that are empty after trimming are skipped.
51 |      *
52 |      * @throws Exception propagated from the parent or the compound nouns operation
53 |      */
54 |     @Override
55 |     public void start() throws Exception {
56 |         this.maxResults = 35;
57 |         super.start();
58 |
59 |         logger.info("Originally " + this.results.size() + " results");
60 |
61 |         CompoundNounsSummarized op = new CompoundNounsSummarized();
62 |         op.setCorpus(corpus);
63 |         op.start();
64 |
65 |         // The list is a field, so drop what a previous run left behind; otherwise
66 |         // every additional call to start() would append duplicate concepts.
67 |         newResults.clear();
68 |         for (TermRankedResult result : op.getResults()) {
69 |             String noun = result.getTerm();
70 |             if (noun.trim().length() == 0)
71 |                 continue;
72 |             TermsExtractionSummarizationResult newResult = new TermsExtractionSummarizationResult();
73 |             newResult.setTerm(noun);
74 |             newResults.add(newResult);
75 |         }
76 |
77 |         this.results.clear();
78 |         this.results.addAll(newResults);
79 |     }
80 | }
81 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/SimpleCorpusTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (C) 2001, 2007 University of Sydney
3 | *
4 | * This program is free software; you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation; either version 2 of the License, or
7 | * (at your option) any later version.
8 | *
9 | * This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
17 | * USA
18 | *
19 | * http://www.gnu.org/licenses/gpl.txt
20 | *******************************************************************************/
21 |
22 | package tml.test;
23 |
24 | import org.junit.Test;
25 |
26 | import tml.Configuration;
27 | import tml.corpus.SimpleCorpus;
28 | import static org.junit.Assert.*;
29 |
30 |
31 | /**
32 | * This test creates a simple corpus that loads a set of documents and then it
33 | * can be used directly to create a {@link SemanticSpace}.
34 | *
35 | * @author Jorge Villalon
36 | * @see SimpleCorpus
37 | */
38 | public class SimpleCorpusTest extends AbstractTmlIndexingTest {
39 |
40 | /**
41 | * @throws Exception
42 | */
43 | @Test
44 | public void CreateSimpleCorpus() throws Exception {
45 | SimpleCorpus corpus = new SimpleCorpus(Configuration.getTmlFolder() + "/corpora/introLSA", prop.getProperty("tml.lucene.indexpath"));
46 |
47 | for (String term : corpus.getTerms())
48 | System.out.print(term + " ");
49 | System.out.println();
50 | for (String doc : corpus.getDocuments())
51 | System.out.print(doc + " ");
52 | System.out.println();
53 | double[][] m = corpus.getMatrix();
54 | for (int i = 0; i < corpus.getTerms().length; i++) {
55 | for (int j = 0; j < corpus.getDocuments().length; j++) {
56 | System.out.print(m[i][j] + " ");
57 | }
58 | System.out.println();
59 | }
60 | System.out.println();
61 | assertNotNull(corpus);
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/PassageClusteringLingoResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | /**
23 | * @author Jorge Villalon
24 | *
25 | */
26 | public class PassageClusteringLingoResult extends AbstractResult {
27 |
28 | int cluster;
29 | String clusterPhrase;
30 | List documents;
31 |
32 | /**
33 | * @return the cluster
34 | */
35 | public int getCluster() {
36 | return cluster;
37 | }
38 |
39 | /**
40 | * @param cluster the cluster to set
41 | */
42 | public void setCluster(int cluster) {
43 | this.cluster = cluster;
44 | }
45 |
46 | /**
47 | * @return the clusterPhrase
48 | */
49 | public String getClusterPhrase() {
50 | return clusterPhrase;
51 | }
52 |
53 | /**
54 | * @param clusterPhrase the clusterPhrase to set
55 | */
56 | public void setClusterPhrase(String clusterPhrase) {
57 | this.clusterPhrase = clusterPhrase;
58 | }
59 |
60 | /**
61 | * @return the documents
62 | */
63 | public List getDocuments() {
64 | if(documents == null)
65 | documents = new ArrayList();
66 | return documents;
67 | }
68 |
69 | /**
70 | * @param documents the documents to set
71 | */
72 | public void setDocuments(List documents) {
73 | this.documents = documents;
74 | }
75 |
76 | @Override
77 | public String toString() {
78 | return this.clusterPhrase + " [" + this.getDocuments().size() + "]";
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/Summary.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.util.ArrayList;
20 |
21 | import tml.vectorspace.operations.results.SummaryResult;
22 |
23 |
24 | /**
25 | * This operation returns several descriptive statistics on the corpus.
26 | *
27 | * @author Jorge Villalon
28 | *
29 | */
30 | public class Summary extends AbstractOperation {
31 |
32 | /**
33 | *
34 | */
35 | public Summary() {
36 | this.name = "Summary";
37 | }
38 |
39 | @Override
40 | public void start() throws Exception {
41 | super.start();
42 |
43 | this.results = new ArrayList();
44 | SummaryResult result = new SummaryResult();
45 | result.setItem("Documents");
46 | result.setValue(Integer.toString(this.corpus.getPassages().length));
47 | result.setComment("Number of documents in the corpus");
48 | results.add(result);
49 | result = new SummaryResult();
50 | result.setItem("Terms");
51 | result.setValue(Integer.toString(this.corpus.getTerms().length));
52 | result.setComment("Number of terms in the corpus");
53 | results.add(result);
54 | result = new SummaryResult();
55 | result.setItem("Term selection criteria");
56 | result.setValue(this.corpus.getParameters().getTermSelectionCriterion() + " ["
57 | + this.corpus.getParameters().getTermSelectionThreshold() + "]"); // "Value"
58 | result
59 | .setComment("The selection criteria used to create the dictionary");
60 | results.add(result);
61 |
62 | super.end();
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/FactorAnalysisPlot.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import tml.vectorspace.operations.results.FactorAnalysisPlotResult;
20 | import Jama.Matrix;
21 |
22 | /**
23 |  * This operation produces the points of a two-dimensional plot of the semantic
24 |  * space: one point per term, read from the first two columns of the Uk matrix,
25 |  * followed by one point per passage, read from the first two columns of the
26 |  * Vk matrix.
27 |  *
28 |  * @author Jorge Villalon
29 |  *
30 |  */
31 | public class FactorAnalysisPlot extends AbstractOperation {
32 |
33 |     /** Creates the operation and flags it as requiring a semantic space. */
34 |     public FactorAnalysisPlot() {
35 |         this.name = "Factor analysis";
36 |         this.requiresSemanticSpace = true;
37 |     }
38 |
39 |     /**
40 |      * Adds one {@link FactorAnalysisPlotResult} per term and then one per
41 |      * passage, using column 0 as X and column 1 as Y.
42 |      *
43 |      * @throws IllegalStateException if Uk or Vk has fewer than two columns
44 |      * @throws Exception propagated from the parent operation
45 |      */
46 |     @Override
47 |     public void start() throws Exception {
48 |         super.start();
49 |         Matrix u = this.corpus.getSemanticSpace().getUk();
50 |         Matrix v = this.corpus.getSemanticSpace().getVk();
51 |
52 |         // X and Y come from columns 0 and 1; fail with a clear message instead of
53 |         // an index error when the space was reduced to a single dimension.
54 |         if (u.getColumnDimension() < 2 || v.getColumnDimension() < 2) {
55 |             throw new IllegalStateException(
56 |                     "Factor analysis plot requires a semantic space with at least two dimensions");
57 |         }
58 |
59 |         // NOTE(review): results is not created here; presumably the parent class
60 |         // initialises it - confirm.
61 |         for (int i = 0; i < u.getRowDimension(); i++) {
62 |             FactorAnalysisPlotResult result = new FactorAnalysisPlotResult();
63 |             result.setName(this.corpus.getTerms()[i]);
64 |             result.setX(u.get(i, 0));
65 |             result.setY(u.get(i, 1));
66 |             results.add(result);
67 |         }
68 |         for (int j = 0; j < v.getRowDimension(); j++) {
69 |             FactorAnalysisPlotResult result = new FactorAnalysisPlotResult();
70 |             result.setName(this.corpus.getPassages()[j]);
71 |             result.setX(v.get(j, 0));
72 |             result.setY(v.get(j, 1));
73 |             results.add(result);
74 |         }
75 |         super.end();
76 |     }
77 | }
78 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/LexiconAnalysis.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import tml.corpus.Corpus.PassageFreqs;
23 | import tml.vectorspace.operations.results.LexiconAnalysisResult;
24 |
25 |
26 | /**
27 | * LexiconAnalysis returns the accumulated lexicon per passage used in the document. It
28 | * is important to consider that stopwords are removed and words that are kept are stemmed,
29 | * therefore this doesn't correspond to the actual total number of different words.
30 | *
31 | * @author Jorge Villalon
32 | *
33 | */
34 | public class LexiconAnalysis extends AbstractOperation {
35 |
36 | /**
37 | * Creates the lexicon analysis operation; it takes no parameters.
38 | */
39 | public LexiconAnalysis() {
40 | this.name = "Lexicon analysis";
41 | }
42 |
43 | @Override
44 | public void start() throws Exception {
45 | super.start();
46 |
47 | this.results = new ArrayList();
48 |
49 | List list = new ArrayList();
50 |
51 | for (int i=0; i
2 |
3 |
4 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/IndexingHtmlTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (C) 2001, 2007 University of Sydney
3 | *
4 | * This program is free software; you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation; either version 2 of the License, or
7 | * (at your option) any later version.
8 | *
9 | * This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
17 | * USA
18 | *
19 | * http://www.gnu.org/licenses/gpl.txt
20 | *******************************************************************************/
21 |
22 | package tml.test;
23 |
24 | import java.io.IOException;
25 |
26 | import org.junit.BeforeClass;
27 | import org.junit.Test;
28 |
29 | import tml.Configuration;
30 | import tml.corpus.TextDocument;
31 | import tml.corpus.CorpusParameters.DimensionalityReduction;
32 | import tml.corpus.CorpusParameters.TermSelection;
33 | import static org.junit.Assert.*;
34 |
35 |
36 | /**
37 | * @author Jorge Villalon
38 | *
39 | */
40 | public class IndexingHtmlTest extends AbstractTmlIndexingTest {
41 |
42 | private static String TESTS_DOCUMENTS_FOLDER = null;
43 |
44 | @BeforeClass
45 | public static void setUpBeforeClass() throws Exception {
46 | AbstractTmlIndexingTest.setUpBeforeClass();
47 | TESTS_DOCUMENTS_FOLDER = Configuration.getTmlFolder() + "/corpora/html";
48 | repository.addDocumentsInFolder(TESTS_DOCUMENTS_FOLDER);
49 | }
50 |
51 | @Test
52 | public void readPage() throws IOException {
53 | TextDocument doc = repository.getTextDocument("Automobile");
54 | assertNotNull(doc);
55 | }
56 |
57 | @Test
58 | public void loadPageCorpus() throws Exception {
59 | TextDocument doc = repository.getTextDocument("Automobile");
60 | doc.getParameters().setTermSelectionCriterion(TermSelection.DF);
61 | doc.getParameters().setTermSelectionThreshold(2);
62 | doc.getParameters().setDimensionalityReduction(DimensionalityReduction.NO);
63 | doc.load(repository);
64 | assertNotNull(doc.getSentenceCorpus());
65 | assertNotNull(doc.getParagraphCorpus());
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/RelationshipExtractionResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * @author Jorge Villalon
21 | *
22 | */
23 | public class RelationshipExtractionResult extends AbstractResult {
24 |
25 | String conceptA;
26 | String conceptB;
27 | String linkingWord;
28 |
29 | private boolean directed;
30 |
31 | /**
32 | * @return the first concept in the relationship
33 | */
34 | public String getConceptA() {
35 | return conceptA;
36 | }
37 |
38 | /**
39 | * @return the second concept in the relationship
40 | */
41 | public String getConceptB() {
42 | return conceptB;
43 | }
44 |
45 | /**
46 | * @return the linking word for the relationship
47 | */
48 | public String getLinkingWord() {
49 | return linkingWord;
50 | }
51 |
52 | /**
53 | * @return if the relationship is directed (from A to B) or non-directed
54 | */
55 | public boolean isDirected() {
56 | return directed;
57 | }
58 |
59 | /**
60 | * @param conceptA
61 | * the first concept
62 | */
63 | public void setConceptA(String conceptA) {
64 | this.conceptA = conceptA;
65 | }
66 |
67 | /**
68 | * @param conceptB
69 | * the second concept
70 | */
71 | public void setConceptB(String conceptB) {
72 | this.conceptB = conceptB;
73 | }
74 |
75 | /**
76 | * @param directed
77 | * if A points to B
78 | */
79 | public void setDirected(boolean directed) {
80 | this.directed = directed;
81 | }
82 |
83 | /**
84 | * @param linkingWord
85 | * the linking word/phrase
86 | */
87 | public void setLinkingWord(String linkingWord) {
88 | this.linkingWord = linkingWord;
89 | }
90 |
91 | }
92 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/annotators/Annotator.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | /**
17 | *
18 | */
19 | package tml.annotators;
20 |
21 | import java.util.ArrayList;
22 |
/**
 * Contract shared by every annotator. According to the original design
 * notes, the repository calls each annotator on every sentence and stores
 * the annotated text in a Lucene field while indexing.
 *
 * @author Jorge Villalon
 */
public interface Annotator {

    /**
     * Produces the XML-annotated version of a text. For instance, given
     * "Rafa is in the US", the returned string is the same sentence with
     * the recognised pieces ("Rafa" and "US") wrapped in XML tags.
     *
     * TODO: Analyze if UIMA provides a better annotation schema
     *
     * @param text the text to be annotated
     * @return the annotated text as XML
     */
    String getAnnotations(String text);

    /**
     * Name of the Lucene field in which these annotations are stored.
     *
     * @return the Lucene field name
     */
    String getFieldName();

    /**
     * Schema against which these annotations can be verified.
     *
     * @return the schema, or null if no schema is attached
     */
    Object getSchema();

    /**
     * Looks up the pieces of text (words or phrases) that carry a given
     * annotation label.
     *
     * @param annotationLabel the label to search for
     * @return the matching pieces of text, or null if none is found
     */
    String[] getAnnotatedText(String annotationLabel);

    /**
     * Initialises any static attributes the annotator needs before it can
     * run.
     */
    void init();

    /**
     * NOTE(review): undocumented in the original; presumably the list of
     * annotation types this annotator produces. Confirm the element type
     * against the implementations.
     *
     * @return the list of types
     */
    ArrayList getTypes();
}
77 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/TagClouds.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.util.ArrayList;
20 | import java.util.Collections;
21 | import java.util.Comparator;
22 |
23 | import tml.vectorspace.operations.results.TagCloudsResult;
24 |
25 |
26 | /**
27 | * TagClouds returns the list of terms in the {@link Corpus} weighted by
28 | * the term weighting scheme used in the {@link SemanticSpace}.
29 | *
30 | * @author Jorge Villalon
31 | *
32 | */
33 | public class TagClouds extends AbstractOperation {
34 |
35 | /**
36 | *
37 | */
38 | public TagClouds() {
39 | this.name = "Tagclouds";
40 | this.requiresSemanticSpace = false;
41 | }
42 |
43 | @Override
44 | public void start() throws Exception {
45 | super.start();
46 |
47 | this.results = new ArrayList();
48 |
49 | double max = 0;
50 | for (int termIndex = 0; termIndex < corpus.getTerms().length; termIndex++) {
51 | String term = corpus.getTerms()[termIndex];
52 | double weight = corpus.getTermStats()[termIndex].sum;
53 | TagCloudsResult result = new TagCloudsResult(term, weight);
54 | if(weight > max)
55 | max = weight;
56 | this.results.add(result);
57 | }
58 |
59 | if(max == 0)
60 | max = 1;
61 |
62 | for (TagCloudsResult result : this.results) {
63 | result.setWeight(result.getWeight()/max);
64 | }
65 |
66 | Collections.sort(this.results,
67 | new Comparator() {
68 |
69 | @Override
70 | public int compare(TagCloudsResult arg0,
71 | TagCloudsResult arg1) {
72 | int weight0 = (int) (arg0.getWeight() * 100);
73 | int weight1 = (int) (arg1.getWeight() * 100);
74 | return weight1 - weight0;
75 | }
76 | });
77 |
78 | super.end();
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/RelationshipExtraction.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.io.IOException;
20 | import java.util.ArrayList;
21 | import java.util.Collections;
22 | import java.util.List;
23 |
24 | import tml.annotators.PennTreeAnnotator;
25 | import tml.utils.StanfordUtils;
26 | import tml.vectorspace.operations.results.RelationshipExtractionResult;
27 |
28 | import edu.stanford.nlp.trees.Tree;
29 |
30 |
31 | /**
32 | * Relationship extraction aims to extract the labeled relationships from a set
33 | * of concepts
34 | *
35 | * @author Jorge Villalon
36 | *
37 | */
38 | public class RelationshipExtraction extends
39 | AbstractOperation {
40 |
41 | @Override
42 | public void start() throws Exception {
43 | super.start();
44 |
45 | List rels = new ArrayList();
46 | for (String passageId : this.corpus.getPassages()) {
47 | Tree pennTree = null;
48 | try {
49 | pennTree = StanfordUtils.getTreeFromString(passageId, repository.getDocumentField(passageId, PennTreeAnnotator.FIELD_NAME));
50 | } catch (IOException e) {
51 | e.printStackTrace();
52 | logger.error(e);
53 | return;
54 | }
55 | List verbs = StanfordUtils.extractVerbs(pennTree);
56 | if(verbs != null)
57 | for (String verb : verbs) {
58 | verb = verb.trim().toLowerCase();
59 | if (rels.contains(verb))
60 | continue;
61 | if (verb.length() == 0)
62 | continue;
63 | rels.add(verb);
64 | }
65 | }
66 |
67 | Collections.sort(rels);
68 |
69 | for (String verb : rels) {
70 | RelationshipExtractionResult result = new RelationshipExtractionResult();
71 | result.setLinkingWord(verb);
72 | this.results.add(result);
73 | }
74 | super.end();
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/PassageExtractionSummarizationResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations.results;
18 |
19 | /**
20 | * Represents a passage (document, paragraph or sentence), its load
21 | * and the corresponding eigenvector by which it was selected.
22 | *
23 | * @author Jorge Villalon
24 | *
25 | */
26 | public class PassageExtractionSummarizationResult extends AbstractResult {
27 | int eigenVectorIndex;
28 |
29 | double load;
30 |
31 | String textPassageContent;
32 |
33 | int textPassageId;
34 |
35 | /**
36 | * @return the eigenVectorIndex
37 | */
38 | public int getEigenVectorIndex() {
39 | return eigenVectorIndex;
40 | }
41 |
42 | /**
43 | * @param eigenVectorIndex the eigenVectorIndex to set
44 | */
45 | public void setEigenVectorIndex(int eigenVectorIndex) {
46 | this.eigenVectorIndex = eigenVectorIndex;
47 | }
48 |
49 | /**
50 | * @return the load
51 | */
52 | public double getLoad() {
53 | return load;
54 | }
55 |
56 | /**
57 | * @param load the load to set
58 | */
59 | public void setLoad(double load) {
60 | this.load = load;
61 | }
62 |
63 | /**
64 | * @return the textPassageContent
65 | */
66 | public String getTextPassageContent() {
67 | return textPassageContent;
68 | }
69 |
70 | /**
71 | * @param textPassageContent the textPassageContent to set
72 | */
73 | public void setTextPassageContent(String textPassageContent) {
74 | this.textPassageContent = textPassageContent;
75 | }
76 |
77 | /**
78 | * @return the textPassageId
79 | */
80 | public int getTextPassageId() {
81 | return textPassageId;
82 | }
83 |
84 | /**
85 | * @param textPassageId the textPassageId to set
86 | */
87 | public void setTextPassageId(int textPassageId) {
88 | this.textPassageId = textPassageId;
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/utils/DBUtils.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.utils;
17 |
18 | import java.sql.Connection;
19 | import java.sql.DriverManager;
20 | import java.sql.ResultSet;
21 | import java.sql.SQLException;
22 | import java.sql.Statement;
23 | import java.util.ArrayList;
24 |
/**
 * Small JDBC helper that loads a driver, opens a single connection and runs
 * plain SQL statements on it.
 *
 * SQL text is executed verbatim: callers must never concatenate untrusted
 * input into the statements they pass to this class.
 */
public class DBUtils {

    private Connection m_conn;
    private String url;
    private String username;
    private String password;

    /**
     * Stores the connection settings and loads the JDBC driver class.
     *
     * @param driver fully qualified name of the JDBC driver class
     * @param url the JDBC url
     * @param username the database user
     * @param password the database password
     * @throws ClassNotFoundException if the driver class cannot be loaded
     */
    public DBUtils(String driver, String url, String username, String password) throws ClassNotFoundException
    {
        this.url = url;
        this.username = username;
        this.password = password;
        this.setDriver(driver);
    }

    /**
     * Opens a connection with the stored settings. A previously opened
     * connection is closed only after the new one has been obtained.
     *
     * @return true if the connection was opened, false otherwise
     */
    public boolean setConnection() {
        Connection fresh;
        try {
            fresh = DriverManager.getConnection(url, username, password);
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        closeConnection();
        m_conn = fresh;
        return true;
    }

    /**
     * Loads the JDBC driver class.
     *
     * @param driver fully qualified name of the JDBC driver class
     * @return always true when the class is found
     * @throws ClassNotFoundException if the driver class cannot be loaded
     */
    public boolean setDriver(String driver) throws ClassNotFoundException {
        Class.forName(driver);
        return true;
    }

    /**
     * Runs a query and collects one column of every returned row.
     *
     * Each call uses its own statement, so the helper stays usable for
     * further queries and updates afterwards.
     *
     * @param sql the query to execute
     * @param fieldname the column whose values are collected
     * @return the column values in row order, or null if the query fails
     * @throws IllegalStateException if no connection is open
     */
    public ArrayList<String> sendQuery(String sql, String fieldname) {
        requireConnection();
        Statement stmt = null;
        ResultSet rs = null;
        try {
            stmt = m_conn.createStatement();
            rs = stmt.executeQuery(sql);
            ArrayList<String> values = new ArrayList<String>();
            while (rs.next()) {
                values.add(rs.getString(fieldname));
            }
            return values;
        } catch (SQLException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(rs);
            closeQuietly(stmt);
        }
    }

    /**
     * Runs an INSERT, UPDATE, DELETE or DDL statement.
     *
     * @param sql the statement to execute
     * @return the row count reported by the driver, or -1 if it fails
     * @throws IllegalStateException if no connection is open
     */
    public int sendUpdate(String sql) {
        requireConnection();
        Statement stmt = null;
        try {
            stmt = m_conn.createStatement();
            return stmt.executeUpdate(sql);
        } catch (SQLException e) {
            e.printStackTrace();
            return -1;
        } finally {
            closeQuietly(stmt);
        }
    }

    /**
     * Closes the current connection, if any. Calling it without an open
     * connection is a no-op.
     */
    public void closeConnection()
    {
        if (m_conn == null) {
            return;
        }
        try {
            m_conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            m_conn = null;
        }
    }

    /** Fails fast with a clear message when no connection has been opened. */
    private void requireConnection() {
        if (m_conn == null) {
            throw new IllegalStateException("No open connection; call setConnection() first");
        }
    }

    /** Closes a statement, reporting but not propagating any failure. */
    private static void closeQuietly(Statement stmt) {
        if (stmt == null) {
            return;
        }
        try {
            stmt.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a result set, reporting but not propagating any failure. */
    private static void closeQuietly(ResultSet rs) {
        if (rs == null) {
            return;
        }
        try {
            rs.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
89 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/CompoundNounsSummarized.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations;
17 |
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.Collections;
21 | import java.util.Comparator;
22 | import java.util.List;
23 |
24 | import tml.annotators.PennTreeAnnotator;
25 | import tml.utils.StanfordUtils;
26 | import tml.vectorspace.operations.results.TermRankedResult;
27 |
28 | import edu.stanford.nlp.trees.Tree;
29 |
30 | public class CompoundNounsSummarized extends AbstractOperation implements
31 | Operation {
32 |
33 | public CompoundNounsSummarized() {
34 | this.name = "Compound nounds summarized";
35 | }
36 |
37 | @Override
38 | public void start() throws Exception {
39 | super.start();
40 |
41 | List nouns = new ArrayList();
42 | for(String passageId : corpus.getPassages()) {
43 | String annotation = null;
44 | try {
45 | annotation = this.repository.getDocumentField(passageId, PennTreeAnnotator.FIELD_NAME);
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | logger.error(e);
49 | }
50 | if(annotation != null) {
51 | Tree pennTree = StanfordUtils.getTreeFromString(passageId, annotation);
52 | List allNouns = StanfordUtils.extractNouns(pennTree);
53 | if(allNouns != null)
54 | for(String noun : allNouns) {
55 | noun = noun.toLowerCase();
56 | if(!nouns.contains(noun)) {
57 | nouns.add(noun);
58 | TermRankedResult result = new TermRankedResult();
59 | result.setTerm(noun.toLowerCase());
60 | result.setRank(0);
61 | this.results.add(result);
62 | }
63 | }
64 | }
65 | }
66 |
67 | Collections.sort(this.results, new Comparator() {
68 |
69 | @Override
70 | public int compare(TermRankedResult o1, TermRankedResult o2) {
71 | return o1.getTerm().compareTo(o2.getTerm());
72 | }
73 |
74 | });
75 |
76 | super.end();
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/tml/www/doc/stylesheet.css:
--------------------------------------------------------------------------------
/* Javadoc style sheet */

/* Define colors, fonts and other style attributes here to override the defaults */

/* Page background color */
body {
    background-color: #FFFFFF;
    background-image: url(resources/bkg_gradient.gif);
    background-repeat: repeat-x;
    margin: 0 auto;
    font-family: 'Lucida Grande', Geneva, Verdana, Arial, sans-serif;
    font-size: 12px;
    padding: 0em 2em;
    color: #333;
}

/* Common elements */

font {
    font-family: inherit;
    font-size: inherit;
    color: inherit;
    font-weight: inherit;
}

hr { display: none; }

a:link { color: #0066cc; }
a:visited { color: #8b5caf; }
a:hover { color: #6699cc; }


/* Headings */
h1 {
    background-image: url(resources/h1_hdr.png);
    background-repeat: no-repeat;
    border-top: 1px dotted #CCCCCC;
    line-height: 1.2em;
    color: #182737;
    font-size: 2em;
    padding: 1.5em;
    margin-top: 0px;
    text-align: left;
}


/* Default Table elements and colors */

th, table { border-collapse: collapse; border-color: #E6E7E8; }


.TableHeadingColor {
    background: #000000 url(resources/bkg_blkheader.png) repeat-x scroll left top;
    color: #FFFFFF;
    font-size: 12px;
    font-weight: bold;
    height: 31px;
    text-align: left;
    padding: 1.5em;
}

.TableHeadingColor th {
    padding-left: 10px;
}


.TableSubHeadingColor { background: #EEEEFF } /* Light mauve */
.TableRowColor { background: #FFFFFF; border-color: #E6E7E8; }
.TableRowColor td { line-height: 175%; padding-left: 10px; }

/* Font used in left-hand frame lists */
.FrameTitleFont { font-size: 125%; font-family: Helvetica, Arial, sans-serif; font-weight: bold; margin-top: 1em; display: block; }
.FrameHeadingFont { font-size: 125%; font-family: 'Lucida Grande', Geneva, Verdana, Arial, sans-serif; font-weight: bold; margin-top: 1em; display: block; }
.FrameItemFont { font-size: 100%; font-family: Helvetica, Arial, sans-serif }

/* Navigation bar fonts and colors */

.NavBarCell1 {
    background-color: #ffffff;
    background-image: url(resources/bkgheader.png);
    /* Only the repeat keyword is valid here; attachment and position keep
       their defaults (scroll, left top). */
    background-repeat: repeat-x;
    line-height: 2em;
    padding-left: 6px;
    padding-right: 6px;
}

.NavBarCell1 a {
    color: white;
}

.NavBarCell1Rev { background-color: #FFFFFF; padding-left: 6px; padding-right: 6px; }
.NavBarFont1 { color: #FFFFFF; }
.NavBarFont1Rev { color: #243446; }

.NavBarCell2 { background-color: #FFFFFF; }
.NavBarCell3 { background-color: #FFFFFF; }
101 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/IndexingPlainTextTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright (C) 2001, 2007 University of Sydney
3 | *
4 | * This program is free software; you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation; either version 2 of the License, or
7 | * (at your option) any later version.
8 | *
9 | * This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
17 | * USA
18 | *
19 | * http://www.gnu.org/licenses/gpl.txt
20 | *******************************************************************************/
21 |
22 | package tml.test;
23 |
24 | import java.io.File;
25 |
26 | import org.junit.BeforeClass;
27 | import org.junit.Test;
28 |
29 | import tml.Configuration;
30 | import tml.corpus.TextDocument;
31 |
32 | import static org.junit.Assert.*;
33 |
34 | public class IndexingPlainTextTest extends AbstractTmlIndexingTest {
35 |
36 | @BeforeClass
37 | public static void setUpBeforeClass() throws Exception {
38 | AbstractTmlIndexingTest.setUpBeforeClass();
39 | File[] fileList = {
40 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0100.a1.txt"),
41 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0101.a1.txt"),
42 | new File(Configuration.getTmlFolder() + "/corpora/uppsala/0102.a1.txt")};
43 | repository.addDocumentsInList(fileList);
44 | }
45 |
46 | @Test
47 | public void numbersDiagnostic01() throws Exception {
48 | TextDocument document = repository.getTextDocument("0100.a1");
49 | document.load(repository);
50 | assertEquals(30, document.getSentenceCorpus().getPassages().length);
51 | assertEquals(9, document.getParagraphCorpus().getPassages().length);
52 | }
53 |
54 | @Test
55 | public void numbersDiagnostic02() throws Exception {
56 | TextDocument document = repository.getTextDocument("0101.a1");
57 | document.load(repository);
58 | assertEquals(41, document.getSentenceCorpus().getPassages().length);
59 | assertEquals(9, document.getParagraphCorpus().getPassages().length);
60 | }
61 |
62 | @Test
63 | public void numbersDiagnostic36() throws Exception {
64 | TextDocument document = repository.getTextDocument("0102.a1");
65 | document.load(repository);
66 | assertEquals(49, document.getSentenceCorpus().getPassages().length);
67 | assertEquals(11, document.getParagraphCorpus().getPassages().length);
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/importers/AbstractImporter.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.storage.importers;
18 |
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import org.apache.log4j.Logger;
23 |
24 | /**
25 | * Abstract class for all importers to extend from. It implements the logger
26 | * a list of file extensions and a static factory to obtain the right
27 | * importer for a given extension
28 | *
29 | * @author Jorge Villalon
30 | *
31 | */
32 | public abstract class AbstractImporter {
33 |
34 | protected static Logger logger = Logger.getLogger(AbstractImporter.class);
35 |
36 | protected abstract String[] getFileExtensions();
37 |
38 | protected List fileExtensions;
39 |
40 | /**
41 | * Creates a new instance of an {@link AbstractImporter}. As this class
42 | * is an abstract class, this can be called only by the constructor
43 | * of a sub-class
44 | */
45 | public AbstractImporter() {
46 | this.fileExtensions = new ArrayList();
47 | for (String extension : getFileExtensions()) {
48 | this.fileExtensions.add(extension);
49 | }
50 | }
51 |
52 | /**
53 | * @param fileExtension the extension of a filename (e.g. txt, pdf, doc)
54 | * @return true if the importer can manage the extension
55 | */
56 | public boolean isValidFileExtension(String fileExtension) {
57 | for (String extension : this.fileExtensions) {
58 | if (extension.equals(fileExtension))
59 | return true;
60 | }
61 | return false;
62 | }
63 |
64 | /**
65 | * @param fileExtension the file extension to validate
66 | * @return an importer to manage files of the given extension
67 | */
68 | public static Importer createImporter(String fileExtension) {
69 | Importer importer = null;
70 |
71 | importer = new TextImporter();
72 | if (importer.isValidFileExtension(fileExtension))
73 | return importer;
74 |
75 | importer = new HtmlImporter();
76 | if (importer.isValidFileExtension(fileExtension))
77 | return importer;
78 |
79 | return null;
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/ParagraphCoherenceIndex.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations;
17 |
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.List;
21 |
22 | import tml.vectorspace.operations.results.PassageDistancesResult;
23 |
24 |
25 | public class ParagraphCoherenceIndex extends PassageDistances {
26 |
27 | public ParagraphCoherenceIndex() {
28 | this.name = "Paragraph coherence index";
29 | }
30 |
31 | @Override
32 | public void start() throws Exception {
33 | super.start();
34 |
35 | List newResults = new ArrayList();
36 | String lastParagraphId = null;
37 | double average = 0;
38 | int total = 0;
39 | int currentParagraphIndex = 0;
40 | try {
41 | for(int i=0; i) newResults;
71 |
72 | super.end();
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/SVD.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace;
17 |
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.FileOutputStream;
21 | import java.io.IOException;
22 | import java.io.ObjectInputStream;
23 | import java.io.ObjectOutputStream;
24 | import java.io.Serializable;
25 |
/**
 * Serializable container for the three matrices of a decomposition (terms,
 * singular values and documents), with helpers to save it to and read it
 * from a file using Java serialization.
 */
public class SVD implements Serializable {

    /** Serialization ID */
    private static final long serialVersionUID = -1733583945325917544L;

    // The field names below are part of the serialized form; renaming them
    // would make previously saved files unreadable.

    /** Terms matrix in the semantic space */
    private double[][] Ukdata = null;
    /** Singular values in the semantic space */
    private double[][] Skdata = null;
    /** Documents matrix in the semantic space */
    private double[][] Vkdata = null;

    /**
     * @return the ukdata
     */
    public double[][] getUkdata() {
        return Ukdata;
    }

    /**
     * @param ukdata the ukdata to set
     */
    public void setUkdata(double[][] ukdata) {
        Ukdata = ukdata;
    }

    /**
     * @return the skdata
     */
    public double[][] getSkdata() {
        return Skdata;
    }

    /**
     * @param skdata the skdata to set
     */
    public void setSkdata(double[][] skdata) {
        Skdata = skdata;
    }

    /**
     * @return the vkdata
     */
    public double[][] getVkdata() {
        return Vkdata;
    }

    /**
     * @param vkdata the vkdata to set
     */
    public void setVkdata(double[][] vkdata) {
        Vkdata = vkdata;
    }

    /**
     * Writes this object to a file with Java serialization. The file is
     * closed even when writing fails.
     *
     * @param file the destination file
     * @throws IOException if the file cannot be opened or written
     */
    public void saveSVD(File file) throws IOException {
        FileOutputStream stream = new FileOutputStream(file);
        try {
            ObjectOutputStream objSt = new ObjectOutputStream(stream);
            objSt.writeObject(this);
            // Push buffered data to the file before it is closed below.
            objSt.flush();
        } finally {
            stream.close();
        }
    }

    /**
     * Reads an object previously written by {@link #saveSVD(File)}. The
     * file is closed even when reading fails.
     *
     * This uses Java native deserialization; only read files that come
     * from a trusted source.
     *
     * @param file the file to read
     * @return the stored object
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if a serialized class cannot be found
     */
    public static SVD readSVD(File file) throws IOException, ClassNotFoundException {
        FileInputStream stream = new FileInputStream(file);
        try {
            ObjectInputStream objSt = new ObjectInputStream(stream);
            return (SVD) objSt.readObject();
        } finally {
            stream.close();
        }
    }
}
89 |
--------------------------------------------------------------------------------
/tml/src/test/java/tml/test/ReadabilityTest.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2010 Stephen O'Rourke (stephen.orourke@sydney.edu.au)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.test;
18 |
19 | import static org.junit.Assert.*;
20 |
21 | import org.junit.BeforeClass;
22 | import org.junit.Test;
23 |
24 | import tml.corpus.TextDocument;
25 | import tml.storage.importers.TextImporter;
26 | import tml.vectorspace.TermWeighting;
27 | import tml.vectorspace.operations.Readability;
28 | import tml.vectorspace.operations.results.ReadabilityResult;
29 |
30 | /**
31 | * This class tests the {@link Readability} operation.
32 | *
33 | * @author Stephen O'Rourke
34 | *
35 | */
36 | public class ReadabilityTest extends AbstractTmlIndexingTest {
37 |
38 | private static TextDocument document;
39 |
40 | @BeforeClass
41 | public static void setUpBeforeClass() throws Exception {
42 | AbstractTmlIndexingTest.setUpBeforeClass();
43 | String content = "The cat sat on the mat. On the mat the cat sat.\nThe feline reclined on the axminster.";
44 | repository.addDocument("1", content, "Title", "N/A", new TextImporter());
45 |
46 | document = repository.getTextDocument("1");
47 | document.getParameters().setTermWeightLocal(TermWeighting.LocalWeight.TF);
48 | document.getParameters().setTermWeightGlobal(TermWeighting.GlobalWeight.None);
49 | document.load(repository);
50 | }
51 |
52 | @Test
53 | public void shouldCalculateReadability() throws Exception {
54 | Readability operation = new Readability();
55 | operation.setCorpus(document.getParagraphCorpus());
56 | operation.start();
57 |
58 | assertEquals(operation.getResultsNumber(), 2);
59 |
60 | ReadabilityResult result1 = operation.getResults().get(0);
61 | assertEquals(result1.getDiffGradeLevel(), 9.83, 0.005);
62 | assertEquals(result1.getDiffReadingEase(), 70.5, 0.005);
63 | assertEquals(result1.getFleshKincaidGradeLevel(), -1.45, 0.005);
64 | assertEquals(result1.getFleshReadingEase(), 116.19, 0.05);
65 |
66 | ReadabilityResult result2 = operation.getResults().get(1);
67 | assertEquals(result2.getDiffGradeLevel(), 0.0, 0.0);
68 | assertEquals(result2.getDiffReadingEase(), 0.0, 0.0);
69 | assertEquals(result2.getFleshKincaidGradeLevel(), 8.38, 0.005);
70 | assertEquals(result2.getFleshReadingEase(), 45.69, 0.05);
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/factorisation/SpaceDecomposition.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.factorisation;
17 |
18 | import java.io.File;
19 | import java.io.FileInputStream;
20 | import java.io.FileOutputStream;
21 | import java.io.IOException;
22 | import java.io.ObjectInputStream;
23 | import java.io.ObjectOutputStream;
24 | import java.io.Serializable;
25 |
/**
 * Serializable holder for the three matrices of a decomposed semantic space
 * (terms, singular values and documents), each kept as a plain
 * {@code double[][]}. Instances can be written to and read back from a file
 * using Java object serialization.
 */
public class SpaceDecomposition implements Serializable {

    /** Serialization ID */
    private static final long serialVersionUID = -1733583945325917544L;

    // Field names are part of the serialized form; do not rename them.
    /** Terms matrix in the semantic space */
    private double[][] Ukdata = null;
    /** Singular values in the semantic space */
    private double[][] Skdata = null;
    /** Documents matrix in the semantic space */
    private double[][] Vkdata = null;

    /**
     * @return the terms matrix (the stored array itself, not a copy)
     */
    public double[][] getUkdata() {
        return Ukdata;
    }

    /**
     * @param ukdata the terms matrix to set
     */
    public void setUkdata(double[][] ukdata) {
        Ukdata = ukdata;
    }

    /**
     * @return the singular values matrix (the stored array itself, not a copy)
     */
    public double[][] getSkdata() {
        return Skdata;
    }

    /**
     * @param skdata the singular values matrix to set
     */
    public void setSkdata(double[][] skdata) {
        Skdata = skdata;
    }

    /**
     * @return the documents matrix (the stored array itself, not a copy)
     */
    public double[][] getVkdata() {
        return Vkdata;
    }

    /**
     * @param vkdata the documents matrix to set
     */
    public void setVkdata(double[][] vkdata) {
        Vkdata = vkdata;
    }

    /**
     * Writes this decomposition to the given file using Java serialization.
     * The file handle is released even when writing fails.
     *
     * @param file the destination file
     * @throws IOException if the file cannot be opened or written
     */
    public void saveSVD(File file) throws IOException {
        FileOutputStream stream = new FileOutputStream(file);
        try {
            ObjectOutputStream objSt = new ObjectOutputStream(stream);
            objSt.writeObject(this);
            // Closing on the success path flushes buffered data and surfaces
            // any error raised while doing so.
            objSt.close();
        } finally {
            // Releases the file handle on every path; a second close is a no-op.
            stream.close();
        }
    }

    /**
     * Reads a decomposition previously written with {@link #saveSVD(File)}.
     * The file handle is released even when reading fails.
     *
     * @param file the file to read from
     * @return the deserialized decomposition
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if the class of the serialized object
     *             cannot be found
     */
    public static SpaceDecomposition readSVD(File file) throws IOException, ClassNotFoundException {
        FileInputStream stream = new FileInputStream(file);
        try {
            ObjectInputStream objSt = new ObjectInputStream(stream);
            SpaceDecomposition svd = (SpaceDecomposition) objSt.readObject();
            objSt.close();
            return svd;
        } finally {
            // Releases the file handle on every path; a second close is a no-op.
            stream.close();
        }
    }
}
89 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/storage/DocumentCleanup.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package tml.storage;
5 |
6 | import java.util.List;
7 |
8 | import org.apache.log4j.Logger;
9 | import org.apache.lucene.document.Document;
10 |
11 | import tml.corpus.CorpusParameters;
12 | import tml.corpus.TextDocument;
13 | import tml.corpus.CorpusParameters.DimensionalityReduction;
14 | import tml.corpus.CorpusParameters.TermSelection;
15 | import tml.vectorspace.TermWeighting.GlobalWeight;
16 | import tml.vectorspace.TermWeighting.LocalWeight;
17 |
18 | /**
19 | * @author Jorge Villalon
20 | *
21 | */
22 | public class DocumentCleanup implements Runnable {
23 |
24 | private static Logger logger = Logger.getLogger(DocumentCleanup.class);
25 | private Repository repository;
26 | private CorpusParameters params;
27 |
28 | public DocumentCleanup(Repository repo) {
29 | this.repository = repo;
30 | this.params = new CorpusParameters();
31 | this.params.setDimensionalityReduction(DimensionalityReduction.NO);
32 | this.params.setDimensionalityReductionThreshold(0);
33 | this.params.setLanczosSVD(false);
34 | this.params.setNormalizeDocuments(false);
35 | this.params.setTermSelectionCriterion(TermSelection.DF);
36 | this.params.setTermSelectionThreshold(0);
37 | this.params.setTermWeightGlobal(GlobalWeight.None);
38 | this.params.setTermWeightLocal(LocalWeight.TF);
39 | }
40 |
41 | /* (non-Javadoc)
42 | * @see java.lang.Runnable#run()
43 | */
44 | @Override
45 | public void run() {
46 | logger.debug("Document cleanup started");
47 |
48 | int total = 0;
49 | List docs;
50 | try {
51 | docs = this.repository.getAllTextDocuments();
52 | } catch (Exception e) {
53 | logger.error(e.getMessage());
54 | return;
55 | }
56 |
57 | if(docs == null) {
58 | logger.debug("No documents to cleanup");
59 | return;
60 | }
61 |
62 | for(TextDocument doc : docs) {
63 | try {
64 | String[][] subs = this.repository.getDbConnection().getSubDocuments(doc.getExternalId());
65 | if(subs.length <= 1) {
66 | logger.debug("Inserting document in the database:" + doc.getExternalId());
67 | Document document = repository.getIndexReader().document(doc.getLuceneId());
68 | this.repository.getDbConnection().insertDocument(repository, document);
69 | doc.setParameters(this.params);
70 | doc.load(repository);
71 | for(int id : doc.getSentenceCorpus().getPassagesLuceneIds()) {
72 | Document sentence = repository.getIndexReader().document(id);
73 | this.repository.getDbConnection().insertDocument(repository, sentence);
74 | }
75 | for(int id : doc.getParagraphCorpus().getPassagesLuceneIds()) {
76 | Document sentence = repository.getIndexReader().document(id);
77 | this.repository.getDbConnection().insertDocument(repository, sentence);
78 | }
79 | total++;
80 | }
81 | } catch (Exception e) {
82 | logger.error(e.getMessage());
83 | continue;
84 | }
85 | }
86 |
87 | if(total > 0)
88 | logger.info("Cleaned " + total + " documents");
89 | else
90 | logger.debug("Nothing to clean!");
91 | }
92 |
93 | }
94 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/PassageExtractionSummarization.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 |
17 | package tml.vectorspace.operations;
18 |
19 | import java.io.IOException;
20 | import java.util.ArrayList;
21 | import java.util.TreeMap;
22 |
23 | import tml.corpus.TextDocument;
24 | import tml.vectorspace.operations.results.PassageExtractionSummarizationResult;
25 |
26 |
27 | import Jama.Matrix;
28 |
29 | /**
30 | * @author Jorge Villalon
31 | *
32 | */
33 | public class PassageExtractionSummarization extends AbstractOperation {
34 |
35 | private double loadThreshold = 0.5;
36 |
37 | /**
38 | *
39 | */
40 | public PassageExtractionSummarization() {
41 | this.name = "Passage extraction";
42 | }
43 |
44 | /**
45 | * @return the threshold by which a text passage will be kept as result
46 | */
47 | public double getLoadThreshold() {
48 | return loadThreshold;
49 | }
50 |
51 | /**
52 | * @param loadThreshold
53 | */
54 | public void setLoadThreshold(double loadThreshold) {
55 | this.loadThreshold = loadThreshold;
56 | }
57 |
58 | @Override
59 | public void start() throws Exception {
60 |
61 | super.start();
62 |
63 | this.results = new ArrayList();
64 |
65 | Matrix eigenVectors = this.corpus.getSemanticSpace()
66 | .getVk();
67 |
68 | for (int i = 0; i < eigenVectors.getColumnDimension(); i++) {
69 | TreeMap v = new TreeMap();
70 | for (int j = 0; j < eigenVectors.getRowDimension(); j++) {
71 | v.put(Math.abs(eigenVectors.get(j, i)), j);
72 | }
73 | double d = v.lastKey();
74 | int q = v.get(d);
75 | PassageExtractionSummarizationResult result = new PassageExtractionSummarizationResult();
76 | result.setEigenVectorIndex(i);
77 | result.setLoad(d);
78 | try {
79 | TextDocument doc = this.repository.getTextDocument(this.corpus.getPassages()[q]);
80 | result.setTextPassageContent(doc.getContent());
81 | result.setTextPassageId(q);
82 | this.results.add(result);
83 | } catch (IOException e) {
84 | e.printStackTrace();
85 | logger.error(e);
86 | }
87 | if (this.results.size() >= this.maxResults)
88 | break;
89 | }
90 |
91 | super.end();
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/results/TermsExtractionSummarizationResult.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.results;
17 |
18 | /**
19 | * This class represents the result of a {@link TermExtractionSummarization} operation. It
20 | * represents a {@link Term} in the {@link Corpus}, with its corresponding
21 | * eigenvector and loading.
22 | *
23 | * @author Jorge Villalon
24 | *
25 | */
26 | public class TermsExtractionSummarizationResult extends AbstractResult {
27 | String term;
28 | int termId;
29 | double load;
30 | double variance;
31 | int eigenVectorIndex;
32 |
33 | /**
34 | * @return the position of the eigenvector (relative importance)
35 | */
36 | public int getEigenVectorIndex() {
37 | return eigenVectorIndex;
38 | }
39 |
40 | /**
41 | * @param eigenVectorIndex the position of the eigenvector
42 | */
43 | public void setEigenVectorIndex(int eigenVectorIndex) {
44 | this.eigenVectorIndex = eigenVectorIndex;
45 | }
46 |
47 | /**
48 | * @return the load of the term in the eigenvector
49 | */
50 | public double getLoad() {
51 | return load;
52 | }
53 |
54 | /**
55 | * @param load the load of the term in the eigenvector
56 | */
57 | public void setLoad(double load) {
58 | this.load = load;
59 | }
60 |
61 | /**
62 | * @return the textual representation of the term
63 | */
64 | public String getTerm() {
65 | return term;
66 | }
67 |
68 | /**
69 | * @param sentence the textual representation of the term
70 | */
71 | public void setTerm(String sentence) {
72 | this.term = sentence;
73 | }
74 |
75 | /**
76 | * @return the id of the term
77 | */
78 | public int getTermId() {
79 | return termId;
80 | }
81 |
82 | /**
83 | * @param termId the id of the term
84 | */
85 | public void setTermId(int termId) {
86 | this.termId = termId;
87 | }
88 |
89 | /**
90 | * @return the variance corresponding to the eigenvector
91 | */
92 | public double getVariance() {
93 | return variance;
94 | }
95 |
96 | /**
97 | * @param variance the variance corresponding to the eigenvector
98 | */
99 | public void setVariance(double variance) {
100 | this.variance = variance;
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/tml/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 | tml
6 | tml-core
7 | 3.0.0
8 | TML - Text Mining Library
9 |
10 |
11 |
12 | maven-compiler-plugin
13 | 2.0.2
14 |
15 | 1.6
16 | 1.6
17 |
18 |
19 |
20 |
21 |
22 | ${basedir}/src/main/java
23 |
24 | **/*
25 |
26 |
27 |
28 |
29 |
30 |
31 | nz.ac.waikato.cs
32 | weka
33 | 3.5.6
34 |
35 |
36 | stanford
37 | stanford-parser
38 | 1.6.1
39 |
40 |
41 | edu.mit.jwi
42 | jwi
43 | 2.1.5
44 |
45 |
46 | jama
47 | jama
48 | 1.0.2
49 |
50 |
51 | log4j
52 | log4j
53 | 1.2.14
54 |
55 |
56 | org.xerial
57 | sqlite-jdbc
58 | 3.6.20
59 |
60 |
61 | commons-cli
62 | commons-cli
63 | 1.2
64 |
65 |
66 | org.apache.lucene
67 | lucene-core
68 | 2.4.1
69 |
70 |
71 | org.apache.lucene
72 | lucene-analyzers
73 | 2.4.1
74 |
75 |
76 | org.apache.lucene
77 | lucene-snowball
78 | 2.4.1
79 |
80 |
81 | junit
82 | junit
83 | 4.7
84 |
85 |
86 | commons-logging
87 | commons-logging
88 | 1.1.1
89 | jar
90 | compile
91 |
92 |
93 | org.htmlparser
94 | htmlparser
95 | 1.6
96 | jar
97 | compile
98 |
99 |
100 |
--------------------------------------------------------------------------------
/tml/src/main/java/tml/vectorspace/operations/summarization/LatentSemanticAnalysisSummarization.java:
--------------------------------------------------------------------------------
1 | /*******************************************************************************
2 | * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | *******************************************************************************/
16 | package tml.vectorspace.operations.summarization;
17 |
18 | import tml.corpus.Corpus;
19 | import tml.vectorspace.NotEnoughTermsInCorpusException;
20 | import Jama.Matrix;
21 |
22 | /**
23 | *
24 | * LSA based summarization using Steinberger's formula from:
25 | * INPROCEEDINGS{
26 | * author = {Josef Steinberger and Karel Jezek},
27 | * title = {Using Latent Semantic Analysis in Text Summarization and Summary Evaluation},
28 | * booktitle = {Proceedings of the 7th International Conference ISIM},
29 | * year = {2004}
30 | * }
31 | *
32 | * @author Jorge Villalon
33 | *
34 | */
35 | public class LatentSemanticAnalysisSummarization extends
36 | AbstractSummarizationOperation implements SummarizationOperation {
37 |
38 | private Matrix Vk = null;
39 | private Matrix Sk = null;
40 | private Matrix Uk = null;
41 |
/**
 * Creates the operation and labels it "LSA".
 */
public LatentSemanticAnalysisSummarization() {
    name = "LSA";
}
45 |
46 | @Override
47 | public void setCorpus(Corpus corpus) {
48 | super.setCorpus(corpus);
49 |
50 | if(corpus == null)
51 | return;
52 |
53 | if(!this.corpus.getSemanticSpace().isCalculated()) {
54 | try {
55 | this.corpus.getSemanticSpace().calculate();
56 | } catch (NotEnoughTermsInCorpusException e) {
57 | logger.error(e);
58 | super.setCorpus(null);
59 | return;
60 | }
61 | }
62 |
63 | // Reminder! Vk is transposed in SVD so Vk is docs by dimensions
64 | this.Vk = this.corpus.getSemanticSpace().getVk().copy();
65 | this.Uk = this.corpus.getSemanticSpace().getUk().copy();
66 |
67 | // The variance corresponds to the squared eigenvalues, so we square S
68 | this.Sk = this.corpus.getSemanticSpace().getSk().copy();
69 | this.Sk = this.Sk.times(this.Sk);
70 | }
71 |
72 | @Override
73 | protected double calculatePassageLoading(int doc) {
74 | double total = 0;
75 | for(int dim =0; dim> ncol), *
8 | * *
9 | * so that {u, sqrt(lambda), v} is a singular triplet of A. *
10 | * (A' = transpose of A) *
11 | * *
12 | * global variables and common areas used by las2 and its *
13 | * procedures. *
14 | **************************************************************/
15 |
16 | #define LMTNW 600000 /* max. size of working area allowed */
17 | #define NMAX 3000 /* bound on ncol, order of A */
18 | #define NZMAX 100000 /* bound on number of nonzeros in a */
19 |
20 | long ierr, /* error flag */
21 | j, /* number of lanczos steps taken */
22 | neig, /* number of ritz values stabilized */
23 | nsig, /* number of accepted ritz values *
24 | * based on kappa (relative accuracy) */
25 | ncol, /* number of columns of A */
26 | nrow, /* number of rows of A */
27 | mxvcount = 0;
28 |
29 | /**************************************************************
30 | * pointers to areas holding input matrix which is stored in *
31 | * harwell-boeing format. *
32 | **************************************************************/
33 | long *pointr = NULL, /* pointer to column start array */
34 | *rowind = NULL; /* pointer to row indices array */
35 | double *value = NULL; /* pointer to nonzero values array */
36 |
37 | double rnm, /* norm of the next residual vector */
38 | anorm,
39 | tol,
40 | eps, /* positive machine epsilon */
41 | eps1, /* roundoff estimate for dot product *
42 | * of two unit vector */
43 | reps,
44 | eps34;
45 |
46 | double *xv1 = NULL, /* temp arrays needed for computing */
47 | *xv2 = NULL, /* singular vectors */
48 | *ztemp = NULL,
49 |
50 | *a = NULL; /* pointer to area used by user- *
51 | * supplied procedure store and holds *
52 | * lanczos vectors */
53 |
54 | FILE *fp_out1 = NULL;/* output file pointers */
55 | long fp_out2;
56 |
/* Error messages used by function check_parameters.
 * NOTE(review): presumably indexed by the error code that function
 * returns, with the NULL slots at 0 and 9 meaning "no message" - confirm
 * against check_parameters. */
char *error[10] = {
    NULL,
    " SORRY, YOUR MATRIX IS TOO BIG ",
    " ***** ENDL MUST BE LESS THAN ENDR *****",
    " ***** MAXPRS CANNOT EXCEED LANMAX *****",
    " ***** N = NROW + NCOL MUST BE GREATER THAN ZERO *****",
    " ***** LANMAX (NUMBER OF LANCZOS STEPS) IS INVALID *****",
    " ***** MAXPRS (NUMBER OF EIGENPAIRS DESIRED) IS INVALID *****",
    " ***** 6*N+4*LANMAX+1 + LANMAX*LANMAX CANNOT EXCEED NW *****",
    " ***** 6*N+4*LANMAX+1 CANNOT EXCEED NW *****",
    NULL};
69 |
--------------------------------------------------------------------------------