├── autoext ├── src │ ├── main │ │ ├── resources │ │ │ ├── autoext.properties │ │ │ └── log4j.properties │ │ └── java │ │ │ └── edu │ │ │ └── usc │ │ │ └── irds │ │ │ ├── lang │ │ │ ├── package-info.java │ │ │ ├── Function.java │ │ │ └── BiFunction.java │ │ │ └── autoext │ │ │ ├── utils │ │ │ ├── Tuple2.java │ │ │ ├── Checks.java │ │ │ ├── ReflectionUtils.java │ │ │ ├── ParseUtils.java │ │ │ ├── MatrixUtils.java │ │ │ ├── Timer.java │ │ │ ├── BracketTreeGen.java │ │ │ ├── XPathEvaluator.java │ │ │ └── D3JsFormat.java │ │ │ ├── base │ │ │ ├── EditDistanceComputer.java │ │ │ ├── EditCost.java │ │ │ └── SimilarityComputer.java │ │ │ ├── tree │ │ │ ├── DefaultEditCost.java │ │ │ ├── StyleSimComputer.java │ │ │ ├── GrossSimComputer.java │ │ │ ├── StructureSimComputer.java │ │ │ ├── ZSTEDistance.java │ │ │ ├── ZSTEDComputer.java │ │ │ └── TreeNode.java │ │ │ ├── Config.java │ │ │ ├── apted │ │ │ ├── StringToIntMapper.java │ │ │ └── APTEDComputer.java │ │ │ └── cluster │ │ │ ├── FileClusterer.java │ │ │ └── SharedNeighborClusterer.java │ └── test │ │ ├── resources │ │ └── html │ │ │ └── simple │ │ │ ├── 3.html │ │ │ ├── 1.html │ │ │ └── 2.html │ │ └── java │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── autoext │ │ ├── tree │ │ ├── ZSTEDComputerTest.java │ │ ├── StyleSimComputerTest.java │ │ └── GrossSimComputerTest.java │ │ └── utils │ │ └── XPathEvaluatorTest.java └── pom.xml ├── visuals ├── README.md └── webapp │ ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 │ ├── index.html │ ├── css │ └── style.css │ ├── circle-packing.html │ └── circles-tooltip.html ├── screenshots ├── clusters-tooltip.png └── clusters-tooltip2.png ├── .gitignore ├── OPENSOURCE-LICENCES.md ├── NOTICE.txt ├── apted ├── pom.xml ├── README.md └── src │ └── main │ └── java │ └── edu │ └── usc │ └── irds │ └── ted │ └── apted │ ├── util │ ├── LabelDictionary.java │ └── LblTree.java │ 
├── RTEDCommandLine.java │ └── InfoTree_PLUS.java ├── autoext-spark ├── src │ └── main │ │ └── scala │ │ └── edu │ │ └── usc │ │ └── irds │ │ └── autoext │ │ ├── spark │ │ ├── Utils.scala │ │ ├── ContentFilter.scala │ │ ├── CliTool.scala │ │ ├── KeyDumper.scala │ │ ├── DeDuplicator.scala │ │ ├── ContentMerge.scala │ │ ├── D3Export.scala │ │ ├── ContentPartitioner.scala │ │ ├── ContentGrep.scala │ │ ├── SparkJob.scala │ │ ├── Main.scala │ │ ├── IOSparkJob.scala │ │ ├── SimilarityCombiner.scala │ │ ├── ContentSimilarityComputer.scala │ │ └── SharedNeighborCuster.scala │ │ └── hdfs │ │ └── RawToSeq.scala └── pom.xml ├── README.md └── pom.xml /autoext/src/main/resources/autoext.properties: -------------------------------------------------------------------------------- 1 | ted.impl=edu.usc.irds.autoext.apted.APTEDComputer 2 | sim.weight=0.5 -------------------------------------------------------------------------------- /visuals/README.md: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | 4 | This module contains visualizations for clustering 5 | 6 | -------------------------------------------------------------------------------- /screenshots/clusters-tooltip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/screenshots/clusters-tooltip.png -------------------------------------------------------------------------------- /screenshots/clusters-tooltip2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/screenshots/clusters-tooltip2.png -------------------------------------------------------------------------------- /visuals/webapp/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/visuals/webapp/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /visuals/webapp/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/visuals/webapp/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /visuals/webapp/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/visuals/webapp/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /visuals/webapp/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USCDataScience/autoextractor/HEAD/visuals/webapp/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/lang/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This package is an effort to backport few utils 3 | * from newer JDK to older. 
4 | * 5 | */ 6 | package edu.usc.irds.lang; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | out* 4 | data* 5 | 6 | *.class 7 | target/ 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | # Package Files # 12 | *.jar 13 | *.war 14 | *.ear 15 | 16 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 17 | hs_err_pid* 18 | tmp/ 19 | -------------------------------------------------------------------------------- /autoext/src/test/resources/html/simple/3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | This is my page 3 4 | 5 | 6 |
7 |

Sample Heading

8 |
9 |

Paragraph 1

10 |

Paragraph 2

11 |

Paragraph 3

12 |
13 |
14 |
15 | 16 | 17 | -------------------------------------------------------------------------------- /visuals/webapp/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Visualization 6 | 7 | 8 | 12 | 13 | -------------------------------------------------------------------------------- /OPENSOURCE-LICENCES.md: -------------------------------------------------------------------------------- 1 | ## CyberNeko HTML Parser 2 | http://nekohtml.sourceforge.net/ 3 | Apache 2.0 license 4 | 5 | ## Gson 6 | https://github.com/google/gson 7 | Apache 2.0 license 8 | 9 | ## Args4j 10 | http://args4j.kohsuke.org/ 11 | The MIT License (MIT) 12 | 13 | ## SLF4j 14 | http://www.slf4j.org/ 15 | identical to MIT License 16 | 17 | ## JUnit 18 | http://junit.org/ 19 | Eclipse Public License - v 1.0 20 | -------------------------------------------------------------------------------- /autoext/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 9 | log4j.logger.org.apache.spark = WARN 10 | log4j.logger.org.spark-project.jetty.server.handler = WARN 11 | log4j.logger.edu.usc.irds = DEBUG -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Auto-Extractor 2 | ============== 3 | Copyright 2016 Information Retrieval and Data Science (IRDS) Group, 4 | 5 | This product includes software developed at 6 | Information Retrieval and Data Science Group, University of Southern California (USC), Los 
Angeles, CA (http://irds.usc.edu) 7 | and 8 | NASA Jet Propulsion Laboratory, Pasadena, CA (http://www.jpl.nasa.gov/) 9 | 10 | This product Uses: 11 | * APTED : 12 | The MIT License (MIT) , Copyright (c) 2016 Mateusz Pawlik and Nikolaus Augsten 13 | 14 | * Many other open source tools with Apache Licence 2.0 15 | -------------------------------------------------------------------------------- /autoext/src/test/resources/html/simple/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | This is my page 1 4 | 5 | 6 |
7 | 8 | Table 1 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
NameEmail
Thamme Gowdatgowdan at Gmail.com
18 |
19 | 20 | 21 | -------------------------------------------------------------------------------- /autoext/src/test/resources/html/simple/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | This is my page 2 4 | 5 | 6 |
7 | 8 | Table 2 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
NameEmail
Thamme Gowdathammegowda.n at usc.edu
CS Deptcs at usc.edu
22 |
23 | 24 | 25 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/lang/Function.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.lang; 18 | 19 | /** 20 | * Backport of JDK8's Function 21 | * 22 | */ 23 | public interface Function { 24 | R apply(T obj); 25 | } 26 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/lang/BiFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.lang; 18 | 19 | /** 20 | * 21 | * Back port of JDK8's BiFunction 22 | * 23 | */ 24 | public interface BiFunction { 25 | R apply(T obj1, U ibj2); 26 | } 27 | -------------------------------------------------------------------------------- /apted/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | edu.usc.irds.ted 6 | apted 7 | 0.1.1 8 | jar 9 | 10 | apted 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 4.12 22 | test 23 | 24 | 25 | 26 | 27 | 28 | maven-compiler-plugin 29 | 30 | 1.7 31 | 1.7 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/Utils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.autoext.spark 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.nutch.protocol.Content 21 | 22 | /** 23 | * Created by tg on 4/5/16. 24 | */ 25 | object Utils { 26 | 27 | def cloneContent(in:Content) : Content = { 28 | new Content(in.getUrl, in.getBaseUrl, in.getContent, 29 | in.getContentType, in.getMetadata, new Configuration()) 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/ContentFilter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.spark 18 | 19 | import java.lang.Boolean 20 | 21 | import edu.usc.irds.lang.Function 22 | 23 | /** 24 | * Creates a filter based substring presence 25 | */ 26 | @SerialVersionUID(100L) 27 | class ContentFilter(subString:String) 28 | extends Function[String, Boolean] 29 | with scala.Serializable { 30 | 31 | override def apply(t: String): Boolean = t.contains(subString) 32 | } 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Auto Extractor 2 | An intelligent extractor library which learns the structures of the input web pages and then figures out a strategy for scraping the structured content. 3 | 4 | ## Links 5 | + [Build Instructions](https://github.com/USCDataScience/autoextractor/wiki/Build-Instructions) 6 | + [Clustering Web Pages using Apache Spark](https://github.com/USCDataScience/autoextractor/wiki/Clustering-Tutorial) 7 | 8 | 9 | # Developers: 10 | * [Thamme Gowda, USC](mailto:tgowdan@gmail.com) 11 | * [Chris Mattmann, USC & NASA JPL]() 12 | 13 | # Citation: 14 | 15 | If you use this work, please cite: 16 | https://ieeexplore.ieee.org/abstract/document/7785739 17 | 18 | ``` 19 | @inproceedings{gowda2016clustering, 20 | title={Clustering Web Pages Based on Structure and Style Similarity (Application Paper)}, 21 | author={Gowda, Thamme and Mattmann, Chris A}, 22 | booktitle={Information Reuse and Integration (IRI), 2016 IEEE 17th International Conference on}, 23 | pages={175--180}, 24 | year={2016}, 25 | organization={IEEE} 26 | } 27 | ``` 28 | 29 | 30 | # References : 31 | + K. Zhang and D. Shasha. 1989. "Simple fast algorithms for the editing distance between trees and related problems". SIAM J. Comput. 18, 6 (December 1989), 1245-1262. 
32 | + Jarvis, R.A.; Patrick, Edward A., "Clustering Using a Similarity Measure Based on Shared Near Neighbors," in Computers, IEEE Transactions on , vol.C-22, no.11, pp.1025-1034, Nov. 1973 33 | 34 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/utils/Tuple2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.utils; 18 | 19 | /** 20 | * A tuple to store pair of values 21 | */ 22 | public class Tuple2 { 23 | public final F pos0; 24 | public final S pos1; 25 | 26 | public Tuple2(F pos0, S pos1) { 27 | this.pos0 = pos0; 28 | this.pos1 = pos1; 29 | } 30 | 31 | public F getPos0() { 32 | return pos0; 33 | } 34 | 35 | public S getPos1() { 36 | return pos1; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "(" + pos0 + ", " + pos1 + ")"; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/CliTool.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.spark 18 | 19 | import org.kohsuke.args4j.{CmdLineException, CmdLineParser} 20 | 21 | import scala.collection.JavaConversions._ 22 | 23 | /** 24 | *Trait for all tools implementing CLI 25 | */ 26 | trait CliTool { 27 | 28 | def parseArgs(args: Array[String]): Unit ={ 29 | val parser = new CmdLineParser(this) 30 | try { 31 | parser.parseArgument(args.toList) 32 | } catch { 33 | case e:CmdLineException => 34 | System.err.println(e.getMessage) 35 | parser.printUsage(System.err) 36 | System.exit(1) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/KeyDumper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.spark 18 | 19 | import org.apache.hadoop.io.Writable 20 | 21 | /** 22 | * Dumps all the keys of sequence files 23 | */ 24 | class KeyDumper extends IOSparkJob { 25 | 26 | def run(): Unit ={ 27 | sc.union(getInputPaths().map(sc.sequenceFile(_, 28 | classOf[Writable], classOf[Writable]))) 29 | .map(rec => rec._1.toString) //keys only 30 | .saveAsTextFile(outPath) //write it to a file 31 | LOG.info(s"Stored the output at $outPath") 32 | } 33 | } 34 | 35 | object KeyDumper{ 36 | 37 | def main(args: Array[String]) { 38 | new KeyDumper().run(args) 39 | } 40 | } -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/base/EditDistanceComputer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.base; 18 | 19 | /** 20 | * Defines a contract for edit distance computer 21 | * 22 | * @author Thamme Gowda 23 | */ 24 | public interface EditDistanceComputer { 25 | 26 | /** 27 | * Computes edit distance between two similar objects 28 | * @param object1 the first object 29 | * @param object2 the second object 30 | * @return the edit distance measure 31 | */ 32 | double computeDistance(T object1, T object2); 33 | 34 | 35 | /** 36 | * Gets cost metric used for computing the edit distance 37 | * @return edit cost metric 38 | */ 39 | EditCost getCostMetric(); 40 | 41 | } 42 | -------------------------------------------------------------------------------- /visuals/webapp/css/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 300 36px "Helvetica Neue"; 3 | height: 640px; 4 | margin: 80px 160px 80px 160px; 5 | overflow: hidden; 6 | position: relative; 7 | width: 960px; 8 | } 9 | 10 | a:link, a:visited { 11 | color: #777; 12 | text-decoration: none; 13 | } 14 | 15 | a:hover { 16 | color: #666; 17 | } 18 | 19 | blockquote { 20 | margin: 0; 21 | } 22 | 23 | blockquote:before { 24 | content: "“"; 25 | position: absolute; 26 | left: -.4em; 27 | } 28 | 29 | blockquote:after { 30 | content: "”"; 31 | position: absolute; 32 | } 33 | 34 | body > ul { 35 | margin: 0; 36 | padding: 0; 37 | } 38 | 39 | h1 { 40 | font-size: 64px; 41 | } 42 | 43 | h1, h2, h3 { 44 | font-weight: inherit; 45 | margin: 0; 46 | } 47 | 48 | h2, h3 { 49 | text-align: right; 50 | font-size: inherit; 51 | position: absolute; 52 | bottom: 0; 53 | right: 0; 54 | } 55 | 56 | h2 { 57 | font-size: 24px; 58 | position: absolute; 59 | } 60 | 61 | h3 { 62 | bottom: -20px; 63 | font-size: 18px; 64 | } 65 | 66 | .invert { 67 | background: #1f1f1f; 68 | color: #dcdccc; 69 | } 70 | 71 | .invert h2, .invert h3 { 72 | color: #7f9f7f; 73 | } 74 | 75 | .string, .regexp { 76 | color: #f39; 77 | } 78 | 79 | .keyword { 80 | 
color: #00c; 81 | } 82 | 83 | .comment { 84 | color: #777; 85 | font-style: oblique; 86 | } 87 | 88 | .number { 89 | color: #369; 90 | } 91 | 92 | .class, .special { 93 | color: #1181B8; 94 | } 95 | 96 | body > svg { 97 | position: absolute; 98 | top: -80px; 99 | left: -160px; 100 | } 101 | -------------------------------------------------------------------------------- /autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/DeDuplicator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.spark 18 | 19 | import org.apache.hadoop.io.Text 20 | import org.apache.hadoop.mapred.SequenceFileOutputFormat 21 | import org.apache.nutch.protocol.Content 22 | 23 | /** 24 | * A de-duplicator tool 25 | */ 26 | class DeDuplicator extends IOSparkJob{ 27 | 28 | def run(): Unit = { 29 | 30 | val rdd = sc.union(getInputPaths() 31 | .map(sc.sequenceFile(_, classOf[Text], classOf[Content]))) // club all parts 32 | 33 | rdd.map({case (k,v) => (new Text(k), Utils.cloneContent(v))}) 34 | .groupByKey() 35 | .map({case (k, v) => (k, v.iterator.next())}) 36 | .saveAsHadoopFile(outPath, classOf[Text], 37 | classOf[Content], classOf[SequenceFileOutputFormat[Text,Content]]) // save it 38 | 39 | LOG.info(s"Done. Saved output at $outPath") 40 | } 41 | } 42 | 43 | object DeDuplicator extends { 44 | 45 | def main(args: Array[String]) { 46 | new DeDuplicator().run(args) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/utils/Checks.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.utils; 18 | 19 | /** 20 | * Created by tg on 1/4/16. 21 | */ 22 | public class Checks { 23 | 24 | /** 25 | * A custom {@link RuntimeException} to indicate that a check has failed 26 | */ 27 | public static class CheckFailedException extends RuntimeException{ 28 | /** 29 | * creates an exception 30 | * @param message message to describe why this exception was raised. 31 | */ 32 | public CheckFailedException(String message) { 33 | super(message); 34 | } 35 | } 36 | 37 | /** 38 | * Checks boolean condition, on failure raises {@link CheckFailedException} 39 | * @param condition predicate 40 | * @param message error message to assist debug task when the condition fails 41 | */ 42 | public static void check(boolean condition, String message){ 43 | if (!condition) { 44 | throw new CheckFailedException(message); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /apted/README.md: -------------------------------------------------------------------------------- 1 | # README : APTED 2 | This project has an implementation of Tree Edit Distance (TED). 3 | 4 | APTED is originally developed by Mateusz Pawlik and Nikolaus Augsten. 5 | The original implementation can be found at http://tree-edit-distance.dbresearch.uni-salzburg.at/#download 6 | 7 | The source code from original implementation has been reorganized to make it as a reusable library package for maven 8 | based builds. 9 | 10 | ### Requirements 11 | + Newer version of Maven (Tested on 3.0.5) 12 | + Newer version of JDK (Tested on 1.7.0_95) 13 | 14 | ## Build instructions 15 | + `mvn clean test package` to test and and package. Jar will be at `target/apted-*.jar` 16 | + `mvn install` to use it as a maven library for other projects. 
Then add the following as dependency to your project 17 | 18 | ```xml 19 | 20 | edu.usc.irds.ted 21 | apted 22 | 0.1.1 23 | 24 | ``` 25 | 26 | ## LICENCE 27 | The original project is distributed under MIT licence, 28 | so this project is available under MIT licence. Find the licence header in the files. 29 | 30 | 31 | --- 32 | 33 | ### (original) README 34 | This is an implementation of the APTED algorithm from [2]. It builds up on the 35 | works in [1] and [3]. 36 | 37 | The source code is published under the MIT licence found in the header of each 38 | source file. 39 | 40 | To build, do the following steps from within the root directory: 41 | mkdir build 42 | cd build 43 | cmake .. 44 | make 45 | 46 | [1] M. Pawlik and N. Augsten. Efficient Computation of the Tree Edit 47 | Distance. ACM Transactions on Database Systems (TODS) 40(1). 2015. 48 | [2] M. Pawlik and N. Augsten. Tree edit distance: Robust and memory- 49 | efficient. Information Systems 56. 2016. 50 | [3] M. Pawlik and N. Augsten. RTED: A Robust Algorithm for the Tree Edit 51 | Distance. PVLDB 5(4). 2011. 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | edu.usc.irds.autoext 6 | autoext-parent 7 | 0.2-SNAPSHOT 8 | 9 | autoext 10 | apted 11 | autoext-spark 12 | 13 | pom 14 | 15 | autoext-parent 16 | http://irds.usc.edu 17 | 18 | 19 | UTF-8 20 | 1.7 21 | 1.7 22 | 1.7.12 23 | 4.12 24 | 25 | 26 | 27 | 28 | org.slf4j 29 | slf4j-log4j12 30 | ${slf4j.version} 31 | 32 | 33 | 34 | junit 35 | junit 36 | ${junit.version} 37 | test 38 | 39 | 40 | 41 | 42 | 43 | maven-compiler-plugin 44 | org.apache.maven.plugins 45 | 3.3 46 | 47 | ${source.version} 48 | ${target.version} 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/utils/ReflectionUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.utils; 18 | 19 | import edu.usc.irds.autoext.base.EditDistanceComputer; 20 | import edu.usc.irds.autoext.tree.TreeNode; 21 | 22 | /** 23 | * Created by tg on 2/29/16. 24 | */ 25 | public class ReflectionUtils { 26 | 27 | /** 28 | * this method instantiates a class 29 | * @param clsName name of class 30 | * @return instance 31 | * 32 | */ 33 | public static T instantiate(String clsName) { 34 | try { 35 | Class aClass = Class.forName(clsName, true, ReflectionUtils.class.getClassLoader()); 36 | Object instance = aClass.newInstance(); 37 | return (T) instance; 38 | } catch (Exception e) { 39 | throw new RuntimeException(e); 40 | } 41 | } 42 | 43 | 44 | /** 45 | * this method instantiates an instance of edit distance computer 46 | * @param clsName name of class 47 | * @return 48 | */ 49 | public static EditDistanceComputer intantiateEDComputer(String clsName){ 50 | return instantiate(clsName); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /autoext/src/test/java/edu/usc/irds/autoext/tree/ZSTEDComputerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package edu.usc.irds.autoext.tree;

import org.junit.Test;

import java.io.File;

import static org.junit.Assert.assertEquals;

/**
 * Tests {@link ZSTEDComputer} tree edit distances over sample HTML fixtures.
 */
public class ZSTEDComputerTest {

    @Test
    public void testMain() throws Exception {
        ClassLoader loader = getClass().getClassLoader();
        File page1 = new File(loader.getResource("html/simple/1.html").getPath());
        File page2 = new File(loader.getResource("html/simple/2.html").getPath());
        File page3 = new File(loader.getResource("html/simple/3.html").getPath());

        // a document compared with itself has zero edit distance
        assertEquals(0.0, ZSTEDComputer.computeDistance(page1, page1), 0.00);

        // nearly identical pages differ by a small number of edits
        assertEquals(3.0, ZSTEDComputer.computeDistance(page1, page2), 0.00);

        // structurally different pages yield a larger distance
        assertEquals(10.0, ZSTEDComputer.computeDistance(page1, page3), 0.00);
    }
}
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.base.EditCost;

import java.io.Serializable;

/**
 * Default unit costs for edit operations: insert, remove and replace each
 * cost one unit, keeping a node unchanged costs nothing. The cost model is
 * symmetric.
 */
public class DefaultEditCost implements EditCost<TreeNode>, Serializable {

    private static final long serialVersionUID = -4846293473238639407L;

    // costs are fixed unit constants, so they are static finals rather than
    // mutable per-instance fields as before
    private static final int INSERT_COST = 1;
    private static final int REMOVE_COST = 1;
    private static final int REPLACE_COST = 1;
    private static final int NO_EDIT_COST = 0;
    // replace is the (joint) most expensive single operation
    private static final int MAX_EDIT_COST = REPLACE_COST;

    @Override
    public double getInsertCost(TreeNode node) {
        return INSERT_COST;
    }

    @Override
    public double getRemoveCost(TreeNode node) {
        return REMOVE_COST;
    }

    @Override
    public double getReplaceCost(TreeNode node1, TreeNode node2) {
        return REPLACE_COST;
    }

    @Override
    public double getNoEditCost() {
        return NO_EDIT_COST;
    }

    @Override
    public double getMaxUnitCost() {
        return MAX_EDIT_COST;
    }

    @Override
    public boolean isSymmetric() {
        // insert/remove/replace costs are identical in both directions
        return true;
    }
}
package edu.usc.irds.autoext.utils;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

/**
 * Helpers for parsing HTML into W3C DOM documents using NekoHTML.
 */
public class ParseUtils {

    // a single shared parser instance; it is stateful, so all access is
    // serialized on it and it is reset after each parse (original design)
    private static final DOMParser domParser = new DOMParser();

    /**
     * Parses an HTML file into a DOM document.
     *
     * @param path path to the HTML file
     * @return the parsed document
     * @throws IOException on read failure
     * @throws SAXException on parse failure
     */
    public static Document parseFile(String path) throws IOException, SAXException {
        // FIX: the FileInputStream was previously never closed (resource leak);
        // try-with-resources guarantees closure even when parsing throws
        try (InputStream stream = new FileInputStream(path)) {
            return parse(new InputSource(stream));
        }
    }

    /**
     * Parses HTML content fetched from a URL into a DOM document.
     *
     * @param url the URL whose content is parsed
     * @return the parsed document
     * @throws IOException on read failure
     * @throws SAXException on parse failure
     */
    public static Document parseURL(URL url) throws IOException, SAXException {
        try (InputStream stream = url.openStream()) {
            return parse(new InputSource(stream));
        }
    }

    // single synchronized entry point for the shared parser instance
    private static Document parse(InputSource source) throws IOException, SAXException {
        synchronized (domParser) {
            domParser.parse(source);
            Document document = domParser.getDocument();
            domParser.reset(); // clear parser state for the next caller
            return document;
        }
    }
}
package edu.usc.irds.autoext;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * The configuration framework. Loads defaults from {@value #CONFIG_FILE} on
 * the classpath and exposes typed accessors for the known settings.
 */
public class Config {

    public static final String CONFIG_FILE = "autoext.properties";
    public static final Properties DEF_PROPS = new Properties();
    public static final Config INSTANCE;

    static {
        try (InputStream stream = Config.class.getClassLoader().getResourceAsStream(CONFIG_FILE)) {
            if (stream == null) {
                // fail fast with a clear message instead of the bare NPE that
                // Properties.load(null) would otherwise throw
                throw new RuntimeException("Could not find " + CONFIG_FILE + " on classpath");
            }
            DEF_PROPS.load(stream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        INSTANCE = new Config(DEF_PROPS);
    }

    public static Config getInstance() {
        return INSTANCE;
    }

    // fully qualified class name of the tree edit distance implementation
    private final String tedImpl;
    // weight used when combining similarity measures
    private final double simWeight;

    /** Creates a config backed by the default classpath properties. */
    public Config() {
        this(DEF_PROPS);
    }

    /**
     * Creates a config from the given properties.
     *
     * @param props properties containing "ted.impl" and "sim.weight"
     * @throws IllegalArgumentException when a required key is absent
     * @throws NumberFormatException when "sim.weight" is not a valid double
     */
    public Config(Properties props) {
        this.tedImpl = requireProperty(props, "ted.impl");
        this.simWeight = Double.parseDouble(requireProperty(props, "sim.weight"));
    }

    // fetches and trims a property, failing with a descriptive error when missing
    private static String requireProperty(Properties props, String key) {
        String value = props.getProperty(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required property: " + key);
        }
        return value.trim();
    }

    public String getTedImpl() {
        return tedImpl;
    }

    public double getSimWeight() {
        return simWeight;
    }

}
package edu.usc.irds.autoext.base;

import edu.usc.irds.autoext.tree.DefaultEditCost;

/**
 * Defines contract for edit costs used by an edit distance computer.
 *
 * @param <T> node type on which edit operations are costed
 * @see DefaultEditCost
 */
public interface EditCost<T> {

    /**
     * Cost for insertion operation
     * @param node node to be inserted
     * @return the cost of insertion
     */
    double getInsertCost(T node);

    /**
     * Cost for remove operation
     * @param node node to be removed
     * @return cost for removal
     */
    double getRemoveCost(T node);

    /**
     * Cost for replacement
     * @param node1 node to be removed
     * @param node2 node to be inserted
     * @return cost for the replacement
     */
    double getReplaceCost(T node1, T node2);

    /**
     * Cost for no edit operation
     * @return cost for no operation
     */
    double getNoEditCost();

    /**
     * Maximum cost for any single edit operation.
     * @return maximum bound on unit edit cost
     */
    double getMaxUnitCost();

    /**
     * Tells whether the cost model is symmetric, i.e. whether the cost of
     * transforming a into b equals the cost of transforming b into a.
     * @return true when the costs are symmetric, false otherwise
     */
    boolean isSymmetric();
}

// -------- autoext/src/main/java/edu/usc/irds/autoext/base/SimilarityComputer.java --------

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.usc.irds.autoext.base;

import edu.usc.irds.autoext.tree.StructureSimComputer;
import edu.usc.irds.autoext.tree.StyleSimComputer;
import edu.usc.irds.lang.BiFunction;


/**
 * Generic similarity computer contract. Look into the implementations for
 * specific details.
 *
 * @param <T> type of the objects being compared
 * @see StructureSimComputer
 * @see StyleSimComputer
 * @author Thamme Gowda
 */
public abstract class SimilarityComputer<T> implements BiFunction<T, T, Double> {

    /**
     * Computes similarity between two objects. The similarity score is on the
     * [0.0, 1.0] scale inclusive: 1.0 indicates that {@code obj1} and
     * {@code obj2} are extremely similar, 0.0 that they are extremely dissimilar.
     *
     * @param obj1 the first object
     * @param obj2 the second object
     * @return the similarity score in [0.0, 1.0]
     */
    public abstract double compute(T obj1, T obj2);

    /**
     * Glues this contract with functional programming.
     *
     * @param obj1 the first object
     * @param obj2 the second object
     * @return the similarity between first and second
     * @see #compute(Object, Object)
     */
    @Override
    public Double apply(T obj1, T obj2) {
        return this.compute(obj1, obj2);
    }
}
package edu.usc.irds.autoext.spark

import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.nutch.protocol.Content
import org.apache.spark.rdd.RDD
import org.kohsuke.args4j.Option
import org.slf4j.LoggerFactory

/**
  * Merges sequence parts into one sequence file with configurable number of parts
  */
class ContentMerge extends IOSparkJob {

  @Option(name = "-numparts", usage = "Number of parts in the output. Ex: 1, 2, 3.... Optional => default")
  var partitions: Integer = null

  def run(): Unit = {
    val paths = getInputPaths()
    LOG.info(s"Found ${paths.length} input paths")
    // read every part and club them into a single RDD
    val rdds: Array[RDD[(Text, Content)]] =
      paths.map(p => sc.sequenceFile(p, classOf[Text], classOf[Content]))
    var rdd = sc.union(rdds)
    if (partitions != null) {
      // coalesce reduces the number of output parts without a full shuffle
      rdd = rdd.coalesce(partitions)
    }
    rdd.saveAsHadoopFile(outPath, classOf[Text], classOf[Content],
      classOf[SequenceFileOutputFormat[_, _]]) // save it
    LOG.info(s"Done. Saved output at $outPath")
  }
}

object ContentMerge {
  // FIX: the logger was previously created for DeDuplicator's class — a
  // copy-paste slip that mislabeled this job's log lines
  val LOG = LoggerFactory.getLogger(ContentMerge.getClass)

  def main(args: Array[String]) {
    new ContentMerge().run(args)
  }
}
package edu.usc.irds.autoext.spark

import java.util

import edu.usc.irds.autoext.utils.D3JsFormat
import org.kohsuke.args4j.Option

import scala.collection.JavaConverters._

/**
  * This CLI Tool exports clusters into most common format used by d3js charts.
  */
class D3Export extends IOSparkJob {
  @Option(name = "-ids", usage = "Path to directory/file having index to id mapping. Optional.")
  var idsFile: String = null

  override def run(): Unit = {
    val lines = sc.union(getInputPaths().map(sc.textFile(_)))

    // each input line is: clusterId, memberCount, member1, member2, ...
    val clusters = lines.map { line =>
      val fields = line.split(",").map(_.trim.toInt)
      val memberCount = fields(1)
      (fields(0), fields.slice(2, 2 + memberCount).toSeq.asJava)
    }.collectAsMap().asJava.asInstanceOf[util.Map[Integer, util.List[Integer]]]

    // optional "index,id" mapping used to label cluster members
    val idsMap: util.Map[Integer, String] =
      if (idsFile == null) {
        null
      } else {
        sc.textFile(idsFile).map { line =>
          val parts = line.split(",")
          (parts(0).trim.toInt, parts(1).trim)
        }.collectAsMap().asJava.asInstanceOf[util.Map[Integer, String]]
      }

    LOG.info("Num Clusters : {} ", clusters.size())
    D3JsFormat.storeClusters(outPath, "Clusters 1", clusters, idsMap, 10.0f)
    LOG.info("All done")
  }
}

object D3Export {

  def main(args: Array[String]) {
    new D3Export().run(args)
  }
}
package edu.usc.irds.autoext.spark

import java.net.URL

import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
import org.apache.nutch.protocol.Content
import org.apache.spark.rdd.RDD

/**
 * This tool partitions data based on host name and content type
 */
class ContentPartitioner extends IOSparkJob {

  def run(): Unit = {
    // union all input parts in a single call, consistent with the other jobs;
    // this also avoids the previous paths(0) crash when the input list is empty
    val rdd: RDD[(Text, Content)] = sc.union(
      getInputPaths().map(p => sc.sequenceFile(p, classOf[Text], classOf[Content])))
    rdd.map({ case (k, v) =>
      // key becomes "<host>/<sanitized content type>" so the output format can
      // route records into one directory per host and type
      val newK = new URL(k.toString).getHost + "/" +
        v.getContentType.replaceAll("[^a-zA-Z]", "").toLowerCase
      (new Text(newK), v)
    }) // key is host name + content type
      .saveAsHadoopFile(outPath, classOf[Text], classOf[Content],
        classOf[SplitOutputFormat])
  }
}

/**
 * Splits output based on key name and content type
 */
class SplitOutputFormat extends MultipleSequenceFileOutputFormat[Text, Content] {
  // restore the record's original URL as the key written inside each part file
  override def generateActualKey(key: Text, value: Content): Text = new Text(value.getUrl)

  // route each record into a sub-directory named after the partition key
  override def generateFileNameForKeyValue(key: Text, value: Content, name: String): String =
    key.toString + "/" + name
}

object ContentPartitioner {
  def main(args: Array[String]) {
    new ContentPartitioner().run(args)
  }
}
package edu.usc.irds.autoext.utils;

import edu.usc.irds.lang.BiFunction;

import java.util.List;

/**
 * Utilities related to matrix operations
 */
public class MatrixUtils {

    /**
     * Computes a symmetric matrix by applying the function once per unordered
     * pair and mirroring the result across the principal diagonal. The function
     * is assumed to be symmetric: f(a, b) == f(b, a).
     *
     * @param function the function applied to a pair of objects, returning a double
     * @param objs list of objects
     * @param <T> the object type
     * @return n x n matrix where cell [i][j] = function(objs[i], objs[j])
     */
    public static <T> double[][] computeSymmetricMatrix(BiFunction<T, T, Double> function, List<T> objs) {
        int n = objs.size();
        double[][] table = new double[n][n];
        for (int i = 0; i < n; i++) {
            T objI = objs.get(i);
            table[i][i] = function.apply(objI, objI); // the principal diagonal element
            for (int j = i + 1; j < n; j++) { // use cached n instead of re-calling size()
                table[i][j] = function.apply(objI, objs.get(j)); // the upper diagonal
                table[j][i] = table[i][j]; // mirror to the lower diagonal
            }
        }
        return table;
    }


    /**
     * Prints the matrix to STDOUT, tab separated with two decimal places.
     *
     * @param matrix the matrix
     */
    public static void printMatrix(double[][] matrix) {
        for (double[] row : matrix) {
            for (double cell : row) {
                System.out.printf("%5.2f\t", cell);
            }
            System.out.println();
        }
    }
}
package edu.usc.irds.autoext.spark

import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.nutch.protocol.Content
import org.kohsuke.args4j.Option

/**
 * Greps the content for specific url sub strings and content type sub strings
 */
class ContentGrep extends IOSparkJob {

  @Option(name = "-urlfilter", usage = "Url filter substring", required = true)
  var urlFilter: String = null

  @Option(name = "-contentfilter", usage = "Content type filter substring")
  var contentFilter: String = null

  def run(): Unit = {
    val paths = getInputPaths()
    LOG.info(s"Found ${paths.length} paths") // was println; LOG matches the other jobs

    // copy filters into local vals so the filter closure does not capture `this`
    val urlSubstr = this.urlFilter
    val typeSubstr = this.contentFilter
    val rdd = sc.union(paths.map(p => sc.sequenceFile(p, classOf[Text], classOf[Content])))
      .filter { case (_, content) =>
        (urlSubstr == null || content.getUrl.contains(urlSubstr)) &&
          (typeSubstr == null || content.getContentType.contains(typeSubstr))
      }
    LOG.info("Saving output at {}", outPath)
    rdd.saveAsHadoopFile(outPath, classOf[Text], classOf[Content], classOf[SequenceFileOutputFormat[_, _]])
    // FIX: removed the sc.stop() that used to live here — SparkJob.run(args)
    // already calls stopSpark() after run(), so stopping twice was redundant
    // and inconsistent with the sibling jobs
    LOG.info("Done.")
  }
}

object ContentGrep {

  def main(args: Array[String]) {
    new ContentGrep().run(args)
  }
}
package edu.usc.irds.autoext.spark

import org.apache.hadoop.io.Text
import org.apache.nutch.protocol.Content
import org.apache.spark.{SparkConf, SparkContext}
import org.kohsuke.args4j.Option
import org.slf4j.LoggerFactory

/**
  * Base class for all spark jobs: parses CLI arguments and manages the
  * lifecycle of a shared SparkContext around the concrete job's run() body.
  */
trait SparkJob extends CliTool {

  val LOG = LoggerFactory.getLogger(getClass)

  @Option(name = "-master", aliases = Array("--master"),
    usage = "Spark master. This is not required when job is started with spark-submit")
  var sparkMaster: String = null

  @Option(name = "-app", aliases = Array("--app-name"),
    usage = "Name for spark context.")
  var appName: String = getClass.getSimpleName

  var sc: SparkContext = null

  /**
    * Creates the spark context on first call; subsequent calls are no-ops.
    */
  def initSpark(): Unit = {
    if (sc != null) {
      return // already initialized
    }
    LOG.info("Initializing Spark Context ")
    val sparkConf = new SparkConf().setAppName(appName)
      .registerKryoClasses(Array(classOf[Text], classOf[Content]))
    if (sparkMaster != null) {
      LOG.info("Spark Master {}", sparkMaster)
      sparkConf.setMaster(sparkMaster)
    }
    sc = new SparkContext(sparkConf)
  }

  /** Shuts the context down when one was created. */
  def stopSpark(): Unit = {
    if (sc != null) {
      LOG.info("Stopping spark.")
      sc.stop()
    }
  }

  /**
    * Abstract method which has actual job description
    */
  def run()

  /** Parses args, brings spark up, runs the job, then tears spark down. */
  def run(args: Array[String]): Unit = {
    parseArgs(args)
    initSpark()
    run()
    stopSpark()
  }
}
package edu.usc.irds.autoext.utils;

import org.junit.Test;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathExpression;
import java.util.Set;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests {@link XPathEvaluator} against a shared sample HTML fixture.
 */
public class XPathEvaluatorTest {

    XPathEvaluator instance = new XPathEvaluator();
    Element docRoot;
    {
        // parse the fixture once per test instance
        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            docRoot = builder.parse(getClass().getClassLoader()
                    .getResourceAsStream("html/simple/1.html")).getDocumentElement();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Test
    public void testEval() throws Exception {
        XPathExpression titleExpr = instance.compile("//title/text()");
        NodeList nodes = instance.eval(docRoot, titleExpr);
        assertEquals(1, nodes.getLength());
        assertEquals("This is my page 1", nodes.item(0).getTextContent());
    }

    @Test
    public void testFindUniqueClassNames() throws Exception {
        Set<String> classNames = instance.findUniqueClassNames(docRoot);
        assertEquals(6, classNames.size());
        // every expected CSS class from the fixture must be present
        for (String expected : new String[]{"header", "row", "cell", "col1", "col2", "table"}) {
            assertTrue(classNames.contains(expected));
        }
    }
}
/**
 * This tool creates a sequence file (compatible with Nutch segments)
 * from raw HTML files found under a local directory.
 */
class RawToSeq extends CliTool {

  @Option(name = "-in", required = true, usage = "path to directory having html pages")
  var in: String = null

  @Option(name = "-out", required = true, usage = "path to output Sequence File")
  var output: String = null

  /**
   * Recursively walks the input directory and appends each regular file
   * as a (Text path, Content) record to the output sequence file.
   */
  def run(): Unit = {
    val config = new Configuration()
    // NOTE(review): `fs` is unused afterwards; kept because FileSystem.get
    // initializes the filesystem cache used by the writer — confirm before removing
    val fs = FileSystem.get(config)
    val inDir = new File(in)

    val files = FileUtils.listFiles(inDir, null, true).iterator()
    val outPath = new Path(output)
    val writer = SequenceFile.createWriter(config, SequenceFile.Writer.keyClass(classOf[Text]),
      SequenceFile.Writer.valueClass(classOf[Content]), SequenceFile.Writer.file(outPath))

    val timer = new Timer
    val delay = 2000 // progress-log interval, milliseconds
    val count = new AtomicInteger()
    // FIX: close the writer in a finally block so a failed append
    // does not leak the handle / leave a truncated, unflushed file
    try {
      while (files.hasNext) {
        val nextFile = files.next()
        if (nextFile.isDirectory || nextFile.getName.startsWith(".")) {
          // that's fine, skip directories and hidden files
        } else if (nextFile.isFile) {
          val id = nextFile.getPath
          val allBytes: Array[Byte] = file.Files.readAllBytes(Paths.get(nextFile.getAbsolutePath))
          val content = new Content(id, id, allBytes, "text/html", new Metadata(), config)
          writer.append(new Text(id), content)
          count.incrementAndGet()
        } else {
          LOG.warn(s"Skip : $nextFile")
        }
        if (timer.read() >= delay) {
          LOG.info(s"Count = $count, Last=$nextFile")
          timer.reset()
        }
      }
    } finally {
      writer.close()
    }
    LOG.info(s"Done.. $count")
  }
}

object RawToSeq {

  val LOG = LoggerFactory.getLogger(RawToSeq.getClass)

  def main(args: Array[String]) {
    val i = new RawToSeq()
    i.parseArgs(args)
    i.run()
  }
}
/**
 * A simple reusable timer utility for benchmarking a code snippet.
 *
 * @author Thamme Gowda
 */
public class Timer {

    private long start; // timestamp when the timer was (re)started
    private long end;   // timestamp when the timer was stopped; 0 while running

    /**
     * Creates a timer and also marks the start
     */
    public Timer() {
        this.start = System.currentTimeMillis();
    }

    /**
     * Starts the timer.
     * @see #reset() to reuse the timer
     */
    public void start() {
        this.start = System.currentTimeMillis();
        this.end = 0; // FIX: clear stale stop mark so read() measures live time again
    }

    /**
     * Resets the timer and returns the value before the reset
     * @return the previous value of the timer
     */
    public long reset() {
        long old = read();
        this.start = System.currentTimeMillis();
        // FIX: without clearing `end`, a previous stop() could satisfy
        // end >= start after the reset and freeze read() at a stale value
        this.end = 0;
        return old;
    }

    /**
     * Stops the timer
     * @return the timer value at the stop
     * @see #read() to retrieve it at a later time
     */
    public long stop() {
        this.end = System.currentTimeMillis();
        return this.end - this.start;
    }

    /**
     * Reads the timer value.
     * @return elapsed millis between start and stop when stopped,
     *         otherwise millis elapsed since the start
     */
    public long read() {
        return (this.end >= this.start ? this.end : System.currentTimeMillis()) - this.start;
    }

    /**
     * Gets the timestamp when this timer was started
     * @return start timestamp
     */
    public long getStart() {
        return start;
    }

    /**
     * Gets the timestamp when this timer was stopped.
     * @return stop timestamp. 0 if the timer is not currently stopped
     */
    public long getEnd() {
        return end;
    }
}
package edu.usc.irds.autoext.spark

import edu.usc.irds.autoext.hdfs.RawToSeq

/**
 * Command line entry point. Dispatches the first argument as a sub command
 * to the matching tool's main method, passing along the remaining arguments.
 */
object Main {

  // command name -> (implementing class, one-line description)
  val cmds = Map[String, (Class[_], String)](
    "help" -> (null, "Prints this help message."),
    "partition" -> (classOf[ContentPartitioner], "Partitions Nutch Content based on host names."),
    "keydump" -> (classOf[KeyDumper], "Dumps all the keys of sequence files(s)."),
    "grep" -> (classOf[ContentGrep], "Greps for the records which contains url and content type filters."),
    "merge" -> (classOf[ContentMerge], "Merges (smaller) part files into one large sequence file."),
    "similarity" -> (classOf[ContentSimilarityComputer], "Computes similarity between documents."),
    "sncluster" -> (classOf[SharedNeighborCuster], "Cluster using Shared near neighbor algorithm."),
    "simcombine" -> (classOf[SimilarityCombiner], "Combines two similarity measures on a linear scale."),
    "dedup" -> (classOf[DeDuplicator], "Removes duplicate documents (exact url matches)."),
    "d3export" -> (classOf[D3Export], "Exports clusters into most popular d3js format for clusters."),
    "createseq" -> (classOf[RawToSeq], "Creates a sequence file (compatible with Nutch Segment) from raw HTML files.")
  )

  /** Prints the usage message with all known commands and terminates the JVM. */
  def printAndExit(exitCode: Int = 0, msg: String = "Usage "): Unit = {
    println(msg)
    println("Commands::")
    cmds.foreach({ case (cmd, (cls, desc)) => println(String.format(" %-9s - %s", cmd, desc)) })
    System.exit(exitCode)
  }

  def main(args: Array[String]) {
    if (args.length == 0) {
      printAndExit(1, "Error: Invalid args")
    } else if (args(0).equalsIgnoreCase("help")) {
      // FIX: explicitly asking for help is a success, exit 0 (was exit 1)
      printAndExit(0)
    } else if (!cmds.contains(args(0))) {
      // FIX: name the offending command instead of printing bare usage
      printAndExit(1, s"Error: unknown command '${args(0)}'")
    } else {
      // reflectively invoke <ToolClass>.main(remainingArgs)
      val method = cmds(args(0))._1.getDeclaredMethod("main", args.getClass)
      method.invoke(null, args.slice(1, args.length))
    }
  }
}
import java.io.Serializable;
import java.util.Hashtable;
import java.util.Map;

/**
 * Dictionary to store labels to integers mappings.
 *
 * Each distinct label is assigned a sequential integer id starting at 0;
 * lookups are supported in both directions.
 *
 * @author Nikolaus Augsten
 */
public class LabelDictionary implements Serializable {

    private static final long serialVersionUID = -5657129208276560195L;

    /** Id returned for an unknown label when new labels are disallowed. */
    public static final int KEY_DUMMY_LABEL = -1;

    // next id to assign
    private int count;
    // label -> id (field names kept for serialization compatibility)
    private Map<String, Integer> StrInt;
    // id -> label
    private Map<Integer, String> IntStr;
    private boolean newLabelsAllowed;

    public LabelDictionary() {
        newLabelsAllowed = true;
        count = 0;
        // Hashtable (not HashMap) retained to preserve the original synchronized access
        StrInt = new Hashtable<>();
        IntStr = new Hashtable<>();
    }

    /**
     * Stores a label and returns its id. An already-known label keeps its
     * existing id; a new label gets the next sequential id, unless new
     * labels are disallowed.
     *
     * @param label the label to store
     * @return id of the label, or -1 when the label is unknown and new labels are disallowed
     */
    public int store(String label) {
        // single lookup instead of containsKey + get
        Integer existing = StrInt.get(label);
        if (existing != null) {
            return existing;
        }
        if (!newLabelsAllowed) {
            return -1;
        }
        Integer intKey = count++; // autoboxing replaces deprecated new Integer(...)
        StrInt.put(label, intKey);
        IntStr.put(intKey, label);
        return intKey;
    }

    /**
     * Reads the label for a given id.
     * @param labelID the label id
     * @return the label, or null when the id is unknown
     */
    public String read(int labelID) {
        return IntStr.get(labelID);
    }

    /** @return true when store() may assign ids to unseen labels */
    public boolean isNewLabelsAllowed() {
        return newLabelsAllowed;
    }

    /** @param newLabelsAllowed whether store() may assign ids to unseen labels */
    public void setNewLabelsAllowed(boolean newLabelsAllowed) {
        this.newLabelsAllowed = newLabelsAllowed;
    }
}
package edu.usc.irds.autoext.spark

import org.kohsuke.args4j.{CmdLineParser, Option}

/**
 * Trait for SparkJobs which have inputs and outputs.
 * Input may be a single path (-in) or a file listing many paths (-list).
 */
trait IOSparkJob extends SparkJob {

  @Option(name = "-in", forbids = Array("-list"),
    usage = "path to a file/folder having input data")
  var inputPath: String = null

  @Option(name = "-list", forbids = Array("-in"),
    usage = "path to a file which contains many input paths (one path per line).")
  var listFilePath: String = null

  @Option(name = "-out", required = true, usage = "Path to file/folder where the output shall be stored")
  var outPath: String = null

  @Option(name = "-locallist", forbids = Array("-in"), depends = Array("-list"),
    usage = "When this flag is set the -list is forced to treat as local file." +
      " By default the list is read from distributed filesystem when applicable")
  var localList: Boolean = false

  override def parseArgs(args: Array[String]): Unit = {
    super.parseArgs(args)
    // args4j's `forbids` rules out both flags together; here we require at least one
    if (inputPath == null && listFilePath == null) {
      System.err.println("Either -in or -list is required.")
      new CmdLineParser(this).printUsage(System.err)
      System.exit(1)
    }
  }

  /**
   * Gets input paths to this io job
   * @return paths to job
   */
  def getInputPaths(): Array[String] = {
    if (inputPath != null) {
      return Array(inputPath)
    }
    if (listFilePath == null) {
      throw new RuntimeException("No input specified")
    }
    // read the list either from the local FS or through spark (HDFS etc.)
    val rawLines =
      if (localList) {
        val src = scala.io.Source.fromFile(listFilePath)
        try src.getLines().toArray finally src.close()
      } else {
        sc.textFile(listFilePath).collect()
      }
    // drop blank lines and '#' comments
    rawLines.map(_.trim).filter(line => line.nonEmpty && !line.startsWith("#"))
  }

}
package edu.usc.irds.autoext.spark

import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.kohsuke.args4j.Option

/**
 * Combines two similarity matrices on a linear scale with a given weight:
 * out = weight * in1 + (1 - weight) * in2
 */
class SimilarityCombiner extends SparkJob {

  @Option(name = "-in1", required = true, usage = "Path to similarity Matrix 1 (Expected : saved MatrixEntry RDD).")
  var in1Path: String = null

  @Option(name = "-in2", required = true, usage = "Path to Similarity Matrix 2 (Expected : saved MatrixEntry RDD)")
  var in2Path: String = null

  @Option(name = "-out", required = true, usage = "Path to output file/folder where the result similarity matrix shall be stored.")
  var outPath: String = null

  @Option(name = "-weight", required = true,
    usage = "Weight/Scale for combining the similarities. The expected is [0.0, 1.0]. " +
      "The combining step is \n out = in1 * weight + (1.0 - weight) * in2")
  var weight: Double = -1.0

  /**
   * Joins the two MatrixEntry RDDs on (row, col) and blends their values.
   */
  override def run(): Unit = {
    // copy to a local so the spark closure does not capture `this`
    val weight = this.weight
    if (weight < 0 || weight > 1) {
      throw new IllegalArgumentException(s"Weight $weight is out of bound. expected in range [0.0, 1.0]")
    }
    LOG.info(s"Combining $in1Path with $in2Path with scale $weight")
    // key each entry by its (row, col) coordinate for the join
    val left = sc.objectFile[MatrixEntry](in1Path).map(entry => ((entry.i, entry.j), entry.value))
    val right = sc.objectFile[MatrixEntry](in2Path).map(entry => ((entry.i, entry.j), entry.value))

    val combined = left.join(right).map {
      case ((row, col), (v1, v2)) => MatrixEntry(row, col, weight * v1 + (1 - weight) * v2)
    }
    combined.saveAsObjectFile(outPath)
    LOG.info(s"Saved output at $outPath")
  }
}

object SimilarityCombiner {
  def main(args: Array[String]) {
    new SimilarityCombiner().run(args)
  }
}
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.utils.ParseUtils;
import org.junit.Test;
import org.w3c.dom.Document;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import static org.junit.Assert.*;

/**
 * Tests for {@link StyleSimComputer}.
 */
public class StyleSimComputerTest {

    StyleSimComputer instance = new StyleSimComputer();

    @Test
    public void testCountIntersection() throws Exception {
        // typed sets restored (generic parameters were missing from the raw declarations)
        Set<Integer> a = new HashSet<>(Arrays.asList(1, 2, 3));
        Set<Integer> b = new HashSet<>(Arrays.asList(3, 4, 5));
        assertEquals(1, instance.countIntersection(a, b));
        b.clear();
        assertEquals(0, instance.countIntersection(a, b));
        b.addAll(a);
        assertEquals(3, instance.countIntersection(a, b));
    }

    @Test
    public void testCompute() throws Exception {
        Document doc1 = ParseUtils.parseFile("src/test/resources/html/simple/1.html");
        Document doc2 = ParseUtils.parseFile("src/test/resources/html/simple/2.html");
        Document doc3 = ParseUtils.parseFile("src/test/resources/html/simple/3.html");

        TreeNode tree1 = new TreeNode(doc1.getDocumentElement(), null);
        TreeNode tree2 = new TreeNode(doc2.getDocumentElement(), null);
        TreeNode tree3 = new TreeNode(doc3.getDocumentElement(), null);

        // self-similarity must be exactly 1.0
        assertEquals(1.0, instance.compute(tree1, tree1), 0.001);
        assertEquals(1.0, instance.compute(tree2, tree2), 0.001);
        assertEquals(1.0, instance.compute(tree3, tree3), 0.001);
        // similarity must be symmetric
        assertEquals(instance.compute(tree1, tree2), instance.compute(tree2, tree1), 0.001);
        assertEquals(instance.compute(tree1, tree3), instance.compute(tree3, tree1), 0.001);
        assertEquals(instance.compute(tree2, tree3), instance.compute(tree3, tree2), 0.001);

        // pages 1 and 2 share most style classes; page 3 differs
        assertEquals(0.9, instance.compute(tree1, tree2), 0.25);
        assertEquals(0.0, instance.compute(tree1, tree3), 0.25);
        assertEquals(0.0, instance.compute(tree2, tree3), 0.25);

    }
}
package edu.usc.irds.autoext.utils;

import edu.usc.irds.autoext.tree.TreeNode;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;

/**
 * This is a CLI utility for converting an HTML file into
 * bracket notation labelled tree structure.
 *
 * Input may come from a local file (-in) or a URL (-url); output goes
 * to a file (-out) or, when -out is omitted, to STDOUT.
 */
public class BracketTreeGen {

    @Option(name = "-in", usage = "Path to HTML file", forbids = {"-url"})
    private File htmlFile;

    @Option(name = "-url", usage = "URL of HTML doc", forbids = {"-in"})
    private URL htmlURL;

    @Option(name = "-out", usage = "Path to output file to store bracket notation tree")
    private File output;

    public static void main(String[] args) throws IOException, SAXException {
        //args = "-out sample.tree -url https://www.youtube.com/".split(" ");
        BracketTreeGen treeGen = new BracketTreeGen();
        CmdLineParser parser = new CmdLineParser(treeGen);
        try {
            parser.parseArgument(args);
            // args4j `forbids` rejects both together; here we require at least one
            if (treeGen.htmlFile == null && treeGen.htmlURL == null) {
                throw new CmdLineException("Either '-in' or '-url' is required");
            }
        } catch (CmdLineException e) {
            System.out.println(e.getLocalizedMessage());
            parser.printUsage(System.out);
            System.exit(-1);
        }
        Document doc;
        if (treeGen.htmlFile != null) {
            doc = ParseUtils.parseFile(treeGen.htmlFile.getPath());
        } else {
            doc = ParseUtils.parseURL(treeGen.htmlURL);
        }
        TreeNode node = new TreeNode(doc.getDocumentElement(), null);
        String bracketNotation = node.toBracketNotation();
        if (treeGen.output != null) {
            treeGen.output.getAbsoluteFile().getParentFile().mkdirs();
            // FIX: use the Charset constant instead of the "UTF-8" string
            // (no checked UnsupportedEncodingException, typo-proof)
            Files.write(treeGen.output.toPath(),
                    bracketNotation.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            System.out.println("Output stored in " + treeGen.output);
        } else {
            // dump to STDOUT
            System.out.println(bracketNotation);
        }
    }
}
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.base.SimilarityComputer;
import edu.usc.irds.autoext.utils.ParseUtils;
import org.junit.Test;
import org.w3c.dom.Document;

import java.util.Arrays;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link GrossSimComputer}.
 */
public class GrossSimComputerTest {

    @Test
    public void testCompute() throws Exception {

        // generic parameters restored: without <String> the @Override of
        // compute(String, String) would not match the erased supertype method
        SimilarityComputer<String> caseSensitiveComputer = new SimilarityComputer<String>() {
            @Override
            public double compute(String obj1, String obj2) {
                return obj1.equals(obj2) ? 1.0 : 0.0;
            }
        };

        SimilarityComputer<String> caseInsensitiveComputer = new SimilarityComputer<String>() {
            @Override
            public double compute(String obj1, String obj2) {
                return obj1.toLowerCase().equals(obj2.toLowerCase()) ? 1.0 : 0.0;
            }
        };

        // equal weights: identical strings score 1.0, case-only difference 0.5
        GrossSimComputer<String> computer = new GrossSimComputer<>(
                Arrays.asList(caseSensitiveComputer, caseInsensitiveComputer), Arrays.asList(0.5, 0.5));
        assertEquals(1.0, computer.compute("abcd", "abcd"), 0.00001);
        assertEquals(0.5, computer.compute("abcd", "ABCD"), 0.00001);
        assertEquals(0.0, computer.compute("aaa", "bbbb"), 0.00001);
    }

    @Test
    public void testCreateWebSimilarityComputer() throws Exception {
        GrossSimComputer<TreeNode> simComputer = GrossSimComputer.createWebSimilarityComputer();

        Document doc1 = ParseUtils.parseFile("src/test/resources/html/simple/1.html");
        Document doc2 = ParseUtils.parseFile("src/test/resources/html/simple/2.html");
        Document doc3 = ParseUtils.parseFile("src/test/resources/html/simple/3.html");

        TreeNode tree1 = new TreeNode(doc1.getDocumentElement(), null);
        TreeNode tree2 = new TreeNode(doc2.getDocumentElement(), null);
        TreeNode tree3 = new TreeNode(doc3.getDocumentElement(), null);
        // self-similarity is exact; cross-document values are approximate
        assertEquals(1.0, simComputer.compute(tree1, tree1), 0.0001);
        assertEquals(1.0, simComputer.compute(tree2, tree2), 0.0001);
        assertEquals(1.0, simComputer.compute(tree3, tree3), 0.0001);
        assertEquals(0.9, simComputer.compute(tree1, tree2), 0.1);
        assertEquals(0.3, simComputer.compute(tree1, tree3), 0.1);
        assertEquals(0.3, simComputer.compute(tree2, tree3), 0.1);
    }
}
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.base.SimilarityComputer;
import edu.usc.irds.autoext.utils.XPathEvaluator;
import org.w3c.dom.Element;

import java.io.Serializable;
import java.util.Set;

/**
 * Computes CSS style similarity between two DOM trees as the Jaccard
 * similarity of the sets of CSS class names used in each tree.
 */
public class StyleSimComputer extends SimilarityComputer<TreeNode> implements Serializable {

    private static final long serialVersionUID = 6680072428272456472L;
    private static XPathEvaluator xPathUtil = new XPathEvaluator();

    /**
     * Computes the stylistic similarity
     * @param elem1 first element
     * @param elem2 second element
     * @return the style similarity in [0.0, 1.0]
     */
    public double compute(Element elem1, Element elem2) {
        // generic parameters restored on the sets (they were erased in the raw form)
        Set<String> setA = xPathUtil.findUniqueClassNames(elem1);
        Set<String> setB = xPathUtil.findUniqueClassNames(elem2);
        int modA = setA.size();
        int modB = setB.size();
        if (modA == 0 && modB == 0) {
            // Cant be determined by jaccards similarity;
            // however, by definition, they are very similar in empty style
            return 1.0;
        }
        int intersectSize = countIntersection(setA, setB);
        // the jaccards similarity: |A n B| / |A u B|
        return (double) intersectSize / (modA + modB - intersectSize);
    }

    /**
     * Computes the size of intersection of two sets
     * @param small first set. preferably smaller than the second argument
     * @param large second set
     * @param <T> the element type
     * @return size of intersection of sets
     */
    public <T> int countIntersection(Set<T> small, Set<T> large) {
        // assuming first argument to be smaller than the later;
        // however double checking to be sure
        if (small.size() > large.size()) {
            // swap the references so we iterate the smaller set
            Set<T> tmp = small;
            small = large;
            large = tmp;
        }
        int result = 0;
        for (T item : small) {
            if (large.contains(item)) {
                // item found in both the sets
                result++;
            }
        }
        return result;
    }


    @Override
    public double compute(TreeNode obj1, TreeNode obj2) {
        //TODO: resolve the casts.. This could cause type cast errors
        return compute((Element) obj1.innerNode, (Element) obj2.innerNode);
    }
}
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.autoext.utils; 18 | 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | import org.w3c.dom.Element; 22 | import org.w3c.dom.NodeList; 23 | 24 | import javax.xml.xpath.XPathConstants; 25 | import javax.xml.xpath.XPathExpression; 26 | import javax.xml.xpath.XPathExpressionException; 27 | import javax.xml.xpath.XPathFactory; 28 | import java.io.Serializable; 29 | import java.util.Collections; 30 | import java.util.HashSet; 31 | import java.util.Set; 32 | 33 | /** 34 | * An utility for evaluating XPath expressions on Documents 35 | * @author Thamme Gowda 36 | * @since Jan 16, 2016 37 | */ 38 | public class XPathEvaluator implements Serializable { 39 | 40 | public static final Logger LOG = LoggerFactory.getLogger(XPathEvaluator.class.getName()); 41 | private static final String CLASS_VAL_XPATH = "//*[@class]/@class"; 42 | private static final long serialVersionUID = -4886553689128529323L; 43 | 44 | private XPathFactory xPathFactory; 45 | private XPathExpression cssClassValExprsn; 46 | 47 | public XPathEvaluator() { 48 | xPathFactory = XPathFactory.newInstance(); 49 | try { 50 | cssClassValExprsn = compile(CLASS_VAL_XPATH); 51 | } catch (XPathExpressionException e) { 52 | LOG.error(e.getMessage(), e); 53 | throw new RuntimeException(e); 54 | } 55 | } 56 | 57 | public XPathExpression compile(String expression) throws XPathExpressionException { 58 | return xPathFactory.newXPath().compile(expression); 59 | } 60 | 61 | /** 62 | * Evaluates the given xpath expression on input DOM Element 63 | * @param element Root element 64 | * @param expression Xpath expression 65 | * @return List of Nodes obtained by evaluating the nodes 66 | * @throws XPathExpressionException when the xpath expression is invalid 67 | */ 68 | public NodeList eval(Element element, XPathExpression expression) 69 | throws 
XPathExpressionException { 70 | return (NodeList) expression.evaluate(element, XPathConstants.NODESET); 71 | } 72 | 73 | 74 | /** 75 | * Finds all unique class names from a DOM tree rooted at given element 76 | * @param element the root element of the DOM tree 77 | * @return Set of class names 78 | */ 79 | public Set findUniqueClassNames(Element element){ 80 | try { 81 | NodeList list = eval(element, cssClassValExprsn); 82 | Set cssClasses = new HashSet<>(); 83 | for (int i = 0; i < list.getLength(); i++) { 84 | Collections.addAll(cssClasses, 85 | list.item(i).getTextContent().trim().split("\\s+")); 86 | } 87 | return cssClasses; 88 | } catch (XPathExpressionException e) { 89 | LOG.error(e.getMessage(), e); 90 | throw new RuntimeException(e); 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/apted/StringToIntMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.apted; 18 | 19 | import edu.usc.irds.lang.Function; 20 | 21 | import java.io.BufferedWriter; 22 | import java.io.File; 23 | import java.io.FileNotFoundException; 24 | import java.io.IOException; 25 | import java.io.PrintWriter; 26 | import java.io.Serializable; 27 | import java.util.HashMap; 28 | import java.util.Map; 29 | import java.util.concurrent.atomic.AtomicInteger; 30 | 31 | /** 32 | * Mapper for converting strings to integer. 33 | * Uses counters for mapping. 34 | * Optionally writes contents to file 35 | */ 36 | public class StringToIntMapper implements Function, AutoCloseable, Serializable { 37 | 38 | private Map map = new HashMap<>(); 39 | private Map reverseMap = new HashMap<>(); 40 | private boolean persist = false; 41 | private AtomicInteger counter = new AtomicInteger(0); 42 | private BufferedWriter writer; 43 | 44 | /** 45 | * creates a mapper instance which uses counters. 46 | * For persistent based mapper see {@link #StringToIntMapper(File)} 47 | */ 48 | public StringToIntMapper(){ 49 | } 50 | 51 | /** 52 | * This instance writes the mapping to given file. 
53 | * Should be closed at the end to flush the contents to file 54 | * @param file file instance 55 | * @throws FileNotFoundException 56 | */ 57 | public StringToIntMapper(File file) throws FileNotFoundException { 58 | this(); 59 | this.persist = true; 60 | this.writer = new BufferedWriter(new PrintWriter(file)); 61 | } 62 | 63 | @Override 64 | public Integer apply(String obj) { 65 | return this.map(obj); 66 | } 67 | 68 | /** 69 | * Maps a string to integer 70 | * @param obj the object which requires mapping 71 | * @return integer obtained after mapping 72 | */ 73 | public Integer map(String obj){ 74 | Integer mapped = map.get(obj); 75 | if (mapped == null) { 76 | mapped = counter.incrementAndGet(); 77 | map.put(obj, mapped); 78 | reverseMap.put(mapped, obj); 79 | if (persist){ 80 | try { 81 | writer.write(obj); 82 | writer.write("\n"); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | return mapped; 89 | } 90 | 91 | /** 92 | * returns key that was mapped to this value 93 | * @param val the value for reverse lookup 94 | * @return String if present, null if not present 95 | */ 96 | public String reverseMap(Integer val){ 97 | return this.reverseMap.get(val); 98 | } 99 | 100 | @Override 101 | public void close() throws Exception { 102 | if (writer != null) { 103 | writer.close(); 104 | } 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/tree/GrossSimComputer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.autoext.tree; 18 | 19 | import edu.usc.irds.autoext.Config; 20 | import edu.usc.irds.autoext.base.SimilarityComputer; 21 | import edu.usc.irds.autoext.utils.Checks; 22 | 23 | import java.io.Serializable; 24 | import java.util.ArrayList; 25 | import java.util.Arrays; 26 | import java.util.List; 27 | 28 | /** 29 | * Aggregates the similarities from several similarity computers 30 | * @author Thamme Gowda N 31 | * @since Jan 16, 2016 32 | */ 33 | public class GrossSimComputer extends SimilarityComputer implements Serializable { 34 | 35 | private static final long serialVersionUID = -6461871245307945046L; 36 | private final List> computers; 37 | private final List weights; 38 | private int n; 39 | 40 | /** 41 | * Creates a similarity aggregator 42 | * @param computers list of similarity computers 43 | * @param weights list of weights to the computers. 44 | * The weight at the index i in this array specifies the weight for similaritycomputer at i in the argument 1. 
45 | * The sum of all weights should add to 1.0 46 | */ 47 | public GrossSimComputer(List> computers, 48 | List weights) { 49 | this.computers = computers; 50 | this.weights = weights; 51 | Checks.check(computers.size() == weights.size(), 52 | "The size of computers and weights should match"); 53 | double sum = 0.0; 54 | for (Double weight : weights) { 55 | sum += weight; 56 | } 57 | Checks.check(Math.abs(1.0 - sum) <= 0.001, 58 | "The sum of all the weights must add up to 1.0"); 59 | this.n = weights.size(); 60 | } 61 | 62 | @Override 63 | public double compute(T obj1, T obj2) { 64 | double result = 0.0; 65 | for (int i = 0; i < n; i++) { 66 | result += computers.get(i).compute(obj1, obj2) * weights.get(i); 67 | } 68 | return result; 69 | } 70 | 71 | /** 72 | * A factory method for creating similarity computer that aggregates structural and stylistic measures 73 | * @return the similarity computer that internally aggregates structure and style measures; 74 | */ 75 | public static GrossSimComputer createWebSimilarityComputer(){ 76 | double structureSimWeight = Config.getInstance().getSimWeight(); 77 | Checks.check(structureSimWeight <= 1.0 && structureSimWeight >= 0.0, "The weight should be in between [0.0, 1.0]"); 78 | StructureSimComputer structSimComputer = new StructureSimComputer(); 79 | StyleSimComputer styleSimComputer = new StyleSimComputer(); 80 | List> similarityComputers = new ArrayList<>(); 81 | similarityComputers.add(structSimComputer); 82 | similarityComputers.add(styleSimComputer); 83 | List weights = Arrays.asList(structureSimWeight, 1.0 - structureSimWeight); 84 | return new GrossSimComputer<>(similarityComputers, weights); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/apted/APTEDComputer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or 
more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.autoext.apted; 18 | 19 | import edu.usc.irds.autoext.base.EditCost; 20 | import edu.usc.irds.autoext.base.EditDistanceComputer; 21 | import edu.usc.irds.autoext.tree.TreeNode; 22 | import edu.usc.irds.lang.Function; 23 | import edu.usc.irds.ted.apted.APTED; 24 | import edu.usc.irds.ted.apted.util.LblTree; 25 | 26 | import java.io.Serializable; 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * This TED is based on AP-TED algorithm of Mateusz Pawlik and Nikolaus Augsten. 
32 | * Refer to http://tree-edit-distance.dbresearch.uni-salzburg.at for more details 33 | * 34 | * @see APTED 35 | */ 36 | public class APTEDComputer 37 | implements EditDistanceComputer, Serializable { 38 | 39 | public static final float INSERT_COST = 1; 40 | public static final float DELETE_COST = 1; 41 | public static final float REPLACE_COST = 1; 42 | public static final float MAX_UNIT = Math.max(Math.max(INSERT_COST, DELETE_COST), REPLACE_COST); 43 | 44 | public static class APTEDMetric implements EditCost, Serializable{ 45 | 46 | @Override 47 | public double getInsertCost(Object node) { 48 | return INSERT_COST; 49 | } 50 | 51 | @Override 52 | public double getRemoveCost(Object node) { 53 | return DELETE_COST; 54 | } 55 | 56 | @Override 57 | public double getReplaceCost(Object node1, Object node2) { 58 | return REPLACE_COST; 59 | } 60 | 61 | @Override 62 | public double getNoEditCost() { 63 | return 0; 64 | } 65 | 66 | @Override 67 | public double getMaxUnitCost() { 68 | return MAX_UNIT; 69 | } 70 | 71 | @Override 72 | public boolean isSymmetric() { 73 | return true; 74 | } 75 | } 76 | 77 | private APTEDMetric cost = new APTEDMetric(); 78 | private StringToIntMapper idMapper = new StringToIntMapper(); 79 | 80 | @Override 81 | public double computeDistance(TreeNode object1, TreeNode object2) { 82 | APTED ted = new APTED(DELETE_COST, INSERT_COST, REPLACE_COST); 83 | LblTree tree1 = transform(object1, idMapper); 84 | LblTree tree2 = transform(object2, idMapper); 85 | return ted.nonNormalizedTreeDist(tree1, tree2); 86 | } 87 | 88 | @Override 89 | public EditCost getCostMetric() { 90 | return cost; 91 | } 92 | 93 | 94 | /** 95 | * Transforms TreeNode to LblNode 96 | * @param node TreeNode 97 | * @param idMapper mapper function that converts string id to integer id 98 | * @return an instance of LblTree 99 | */ 100 | public static LblTree transform(TreeNode node, Function idMapper){ 101 | int treeID = idMapper != null ? 
idMapper.apply(node.getExternalId()) : -1; 102 | LblTree result = new LblTree(node.getNodeName(), treeID); 103 | List children = node.getChildren(); 104 | if (children != null) { 105 | for (TreeNode child : children) { 106 | result.add(transform(child, idMapper)); 107 | } 108 | } 109 | return result; 110 | } 111 | 112 | 113 | } 114 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/utils/D3JsFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package edu.usc.irds.autoext.utils; 18 | 19 | import com.google.gson.Gson; 20 | 21 | import java.io.FileWriter; 22 | import java.io.IOException; 23 | import java.io.Writer; 24 | import java.util.ArrayList; 25 | import java.util.HashMap; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | /** 30 | * Utilities for transforming data to d3js format 31 | */ 32 | public class D3JsFormat { 33 | 34 | public static final String INDEX_KEY = "index"; 35 | public static final String CHILDREN_KEY = "children"; 36 | public static final String SIZE = "size"; 37 | public static final String NAME_KEY = "name"; 38 | public static final String CREATED_AT = "createdAt"; 39 | 40 | /** 41 | * 42 | * @param name name for top level cluster 43 | * @param clusters cluster details 44 | * @param nameMap mapping indices back to labels 45 | * @param scaleFactor scale factor for magnifying the cluster size 46 | */ 47 | public static String formatClusters(String name, 48 | Map> clusters, 49 | Map nameMap, 50 | final double scaleFactor){ 51 | 52 | final Map nameMapFinal = nameMap == null ? 53 | new HashMap() : nameMap; 54 | 55 | Map result = new HashMap<>(); 56 | result.put(NAME_KEY, name); 57 | result.put(INDEX_KEY, -1); 58 | result.put(SIZE, clusters.size() * scaleFactor); 59 | result.put(CREATED_AT, System.currentTimeMillis()); 60 | 61 | List level1 = new ArrayList<>(); 62 | result.put(CHILDREN_KEY, level1); 63 | for (Map.Entry> entry : clusters.entrySet()) { 64 | Map child = new HashMap<>(); 65 | level1.add(child); 66 | Integer key = entry.getKey(); 67 | child.put(INDEX_KEY, key); 68 | child.put(NAME_KEY, nameMapFinal.containsKey(key) ? 
nameMapFinal.get(key): "" + key); 69 | child.put(SIZE, entry.getValue().size() * scaleFactor); 70 | List level2 = new ArrayList<>(); 71 | child.put(CHILDREN_KEY, level2); 72 | 73 | for (final Integer item: entry.getValue()){ 74 | Map node = new HashMap<>(); 75 | node.put(INDEX_KEY, item); 76 | node.put(NAME_KEY, nameMapFinal.containsKey(item)? nameMapFinal.get(item) : ""+ item); 77 | node.put(SIZE, scaleFactor); 78 | level2.add(node); 79 | } 80 | } 81 | return new Gson().toJson(result); 82 | } 83 | 84 | /** 85 | * Stores the clusters data to a json file 86 | * @param file path to file 87 | * @param name name for the top level cluster 88 | * @param clusters cluster data 89 | * @param nameMap map of index to labels 90 | * @param scaleFactor factor to scale 91 | * @throws IOException when an error occurs while writing to file 92 | */ 93 | public static void storeClusters(String file, String name, 94 | Map> clusters, 95 | final Map nameMap, float scaleFactor) 96 | throws IOException { 97 | String result = formatClusters(name, clusters, nameMap, scaleFactor); 98 | try(Writer writer = new FileWriter(file)){ 99 | writer.write(result); 100 | } 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/tree/StructureSimComputer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package edu.usc.irds.autoext.tree; 18 | 19 | import edu.usc.irds.autoext.Config; 20 | import edu.usc.irds.autoext.base.EditCost; 21 | import edu.usc.irds.autoext.base.EditDistanceComputer; 22 | import edu.usc.irds.autoext.base.SimilarityComputer; 23 | import edu.usc.irds.autoext.utils.Checks; 24 | import edu.usc.irds.autoext.utils.ReflectionUtils; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | import java.io.Serializable; 29 | import java.util.List; 30 | 31 | /** 32 | *Computes the structural similarity between two DOM Trees 33 | * 34 | */ 35 | public class StructureSimComputer extends SimilarityComputer implements Serializable{ 36 | 37 | public static final Logger LOG = LoggerFactory.getLogger(StructureSimComputer.class); 38 | 39 | private static final long serialVersionUID = 5434501333215722663L; 40 | private final EditCost costMetric; 41 | private EditDistanceComputer distanceComputer; 42 | 43 | public StructureSimComputer(){ 44 | String tedImpl = Config.getInstance().getTedImpl(); 45 | LOG.info("TED = {}", tedImpl); 46 | this.distanceComputer = ReflectionUtils.instantiate(tedImpl); 47 | this.costMetric = distanceComputer.getCostMetric(); 48 | } 49 | 50 | public StructureSimComputer(EditDistanceComputer distanceComputer) { 51 | this(distanceComputer.getCostMetric()); 52 | this.distanceComputer = distanceComputer; 53 | } 54 | 55 | public StructureSimComputer(EditCost costMetric) { 56 | this.costMetric = costMetric; 57 | } 58 | 59 | /** 60 | * Computes similarity between the trees using 
edit distance measure 61 | * @param tree1 first tree 62 | * @param tree2 second tree 63 | * @return similarity measure 64 | */ 65 | @Override 66 | public double compute(TreeNode tree1, TreeNode tree2){ 67 | return computeSimilarity(distanceComputer.computeDistance(tree1, tree2), 68 | tree1.getSize(), tree2.getSize()); 69 | } 70 | 71 | /** 72 | * Computes similarity between the trees using edit distance measure 73 | * @param distance first distance 74 | * @param size1 number of elements in first tree 75 | * @param size2 number of elements in second tree 76 | * @return similarity measure 77 | */ 78 | public double computeSimilarity(double distance, int size1, int size2){ 79 | //Wish I could speak java here instead of maths :-) 80 | return 1.0 - distance/(costMetric.getMaxUnitCost() * (size1 + size2)); 81 | } 82 | /** 83 | * Computes similarity matrix 84 | * @param trees list of trees 85 | * @return similarity matrix 86 | */ 87 | public double[][] compute(List trees) { 88 | int n = trees.size(); 89 | if (n < 2) { 90 | throw new IllegalArgumentException("At least two nodes should be given"); 91 | } 92 | double matrix[][] = new double[n][n]; 93 | for (int i = 0; i < n; i++) { 94 | for (int j = 0; j < n; j++) { 95 | matrix[i][j] = compute(trees.get(i), trees.get(j)); 96 | } 97 | } 98 | return matrix; 99 | } 100 | 101 | /** 102 | * Computes similarity matrix from distance matrix 103 | * @param treeSizes the number/size of elements in each tree 104 | * @param distanceMatrix the distance matrix 105 | * @return similarity matrix 106 | */ 107 | public double[][] compute(int[] treeSizes, double[][] distanceMatrix) { 108 | Checks.check(treeSizes.length == distanceMatrix.length, "The tree size must be same as the distance matrix's"); 109 | Checks.check(distanceMatrix.length == distanceMatrix[0].length, "The matrix must have same rows and same columns"); 110 | 111 | int n = treeSizes.length; 112 | double matrix[][] = new double[n][n]; 113 | for (int i = 0; i < n; i++) { 114 | for 
(int j = 0; j < n; j++) { 115 | matrix[i][j] = computeSimilarity(distanceMatrix[i][j], treeSizes[i], treeSizes[j]); 116 | } 117 | } 118 | return matrix; 119 | } 120 | 121 | } 122 | -------------------------------------------------------------------------------- /visuals/webapp/circle-packing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 30 | 31 | 32 | 33 |
34 |

Choose Clusters JSON File

Format = flares.json

35 | 36 |
37 |
38 |

39 | 53 | 54 | 55 | 56 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /autoext/src/main/java/edu/usc/irds/autoext/tree/ZSTEDistance.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
 */
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.base.EditCost;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import static java.lang.Math.min;

/**
 * Zhang - Shasha's Tree edit distance matrix.
 * Computes the minimum-cost sequence of insert/remove/replace operations that
 * turns one ordered tree into another, using the classic dynamic program over
 * post-order node indices, leftmost-leaf descendants and key roots.
 * @see ZSTEDComputer
 */
public class ZSTEDistance implements Serializable {

    private static final long serialVersionUID = -3804460419024208744L;
    // post-order traversals of the two trees; all indices below refer to these lists
    private final List<TreeNode> iNodes;
    private final List<TreeNode> jNodes;
    // post-order indices of the key roots of each tree
    private final List<Integer> iKeyRoots;
    private final List<Integer> jKeyRoots;
    // iLs[x] / jLs[x] = post-order index of the leftmost leaf descendant of node x
    private int[] iLs;
    private int[] jLs;
    // treeDist[x][y] = edit distance between the subtrees rooted at x and y
    private double[][] treeDist;
    private final EditCost cost;

    /**
     * Creates a distance computation with the default (unit) edit costs.
     * @param iTree root node of first tree
     * @param jTree root node of second tree
     */
    public ZSTEDistance(TreeNode iTree, TreeNode jTree){
        this(iTree, jTree, new DefaultEditCost());
    }

    /**
     * Creates a Edit distance matrix for given trees
     * @param iTree root node of first tree
     * @param jTree root node of second tree
     * @param cost costs for edit operations
     */
    public ZSTEDistance(TreeNode iTree, TreeNode jTree, EditCost cost){
        this.cost = cost;
        this.iNodes = iTree.postOrderTraverse();
        this.jNodes = jTree.postOrderTraverse();
        this.treeDist = new double[iNodes.size()][jNodes.size()];
        this.iKeyRoots = new ArrayList<>();
        this.jKeyRoots = new ArrayList<>();
        for (TreeNode node : iTree.getKeyRoots()) {
            iKeyRoots.add(node.getIndex());
        }
        for (TreeNode node : jTree.getKeyRoots()) {
            jKeyRoots.add(node.getIndex());
        }
    }


    /**
     * Computes and returns edit distance.
     * Fills the subtree-distance table by running the forest-distance DP once
     * per pair of key roots, then reads off the distance between the two full
     * trees (the last post-order nodes are the roots).
     * @return min edit distance between trees
     */
    public double compute(){
        this.iLs = new int[iNodes.size()];
        this.jLs = new int[jNodes.size()];
        for (int i = 0; i < iNodes.size(); i++) {
            this.iLs[i] = iNodes.get(i).getLeftMostDescendant().getIndex();
        }
        for (int i = 0; i < jNodes.size(); i++) {
            this.jLs[i] = jNodes.get(i).getLeftMostDescendant().getIndex();
        }
        for (Integer i : iKeyRoots) {
            for (Integer j : jKeyRoots) {
                treeDistance(i, j);
            }
        }
        return this.treeDist[iNodes.size() -1][jNodes.size() -1];
    }

    /**
     * Computes Tree distance between ith node and jth node.
     * Runs the forest-distance dynamic program for the subtrees rooted at the
     * two key roots, writing results into {@link #treeDist} whenever a
     * subproblem corresponds to a whole subtree pair.
     * @param i index of first node
     * @param j index of second node
     */
    private void treeDistance(int i, int j){
        // forest sizes (+1 for the empty-forest row/column)
        int m = i - iLs[i] + 2;
        int n = j - jLs[j] + 2;
        double fd[][] = new double[m][n];

        // offsets translate local DP indices back to post-order indices
        int iOffset = iLs[i] - 1;
        int jOffset = jLs[j] - 1;

        for (int x = 1; x < m; x++) {
            // δ(l(i1)..i, θ) = δ(l(1i)..1-1, θ) + γ(v → λ)
            fd[x][0] = fd[x-1][0] + cost.getRemoveCost(iNodes.get(x+iOffset));
        }
        for (int y = 1; y < n; y++) {
            //# δ(θ, l(j1)..j) = δ(θ, l(j1)..j-1) + γ(λ → w)
            fd[0][y] = fd[0][y-1] + cost.getInsertCost(jNodes.get(y+jOffset));
        }

        for (int x = 1; x < m; x++) {
            TreeNode i1 = iNodes.get(x + iOffset);
            for (int y = 1; y < n; y++) {
                TreeNode j1 = jNodes.get(y + jOffset);
                double removeCost = fd[x - 1][y] + cost.getRemoveCost(i1);
                double insertCost = fd[x][y - 1] + cost.getInsertCost(j1);

                // only need to check if x is an ancestor of i
                // and y is an ancestor of j
                if (iLs[i] == iLs[x+iOffset] && jLs[j] == jLs[y+jOffset]){
                    // both prefixes are whole subtrees: the replace case uses
                    // no-edit cost when the node labels already match
                    double replacementCost = fd[x - 1][y - 1] + (i1.getNodeName().equals(j1.getNodeName()) ?
                            cost.getNoEditCost() : cost.getReplaceCost(i1, j1));
                    fd[x][y] = min(min(removeCost, insertCost), replacementCost);
                    treeDist[x+iOffset][y+jOffset] = fd[x][y];
                } else {
                    // general forest case: reuse the previously computed
                    // subtree distance for the rightmost trees
                    int p = iLs[x+iOffset]-1-iOffset;
                    int q = jLs[y+jOffset]-1-jOffset;
                    fd[x][y] = min(min(removeCost, insertCost), fd[p][q] + treeDist[x+iOffset][y+jOffset]);
                }
            }
        }
    }
}
 */
package edu.usc.irds.autoext.spark

import java.io.ByteArrayInputStream
import java.lang

import edu.usc.irds.autoext.base.SimilarityComputer
import edu.usc.irds.autoext.spark.ContentSimilarityComputer._
import edu.usc.irds.autoext.spark.Utils._
import edu.usc.irds.autoext.tree._
import edu.usc.irds.autoext.utils.Timer
import edu.usc.irds.lang.Function
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.nutch.protocol.Content
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.cyberneko.html.parsers.DOMParser
import org.kohsuke.args4j.Option
import org.xml.sax.InputSource

/**
  * This tool Computes Similarity between documents.
  * Reads Nutch [[Content]] records from sequence files, parses each HTML page
  * into a DOM tree, and emits a pairwise similarity matrix (as
  * [[MatrixEntry]] records) plus an index-to-URL mapping.
  */
class ContentSimilarityComputer extends IOSparkJob {

  // which similarity measure to use; see STRUCTURE / STYLE in the companion object
  @Option(name = "-func", required = true,
    usage = "Similarity function. Valid function names = {structure, style}")
  var simFunc: String = null

  var simComputer: SimilarityComputer[TreeNode] = null
  // filters records whose content type looks like (ht)ml
  val htmlFilter: Function[String, lang.Boolean] = new ContentFilter("ml")

  def run(): Unit = {

    // pick the similarity implementation from the -func argument
    simComputer = simFunc match {
      case STRUCTURE => new StructureSimComputer()
      case STYLE => new StyleSimComputer()
      case _ => throw new IllegalArgumentException(s"Similarity function $simFunc is not supported")
    }
    val rdd = sc.union(getInputPaths().map(sc.sequenceFile(_, classOf[Text], classOf[Content])))
    val (idRdd, entryRDD) = computeSimilarity(rdd)

    LOG.info(s"Storing Ids to URL map at $outPath (CSV File)")
    idRdd.map({case(idx, url) => s"$idx,$url"}).saveAsTextFile(outPath + "-ids")

    LOG.info(s"Storing Entries at $outPath (object file)")
    entryRDD.saveAsObjectFile(outPath)
  }

  /**
    * Computes similarity of documents in given sequence file.
    * @param input Content RDD keyed by document id
    * @return a pair of (row-index to URL mapping, similarity matrix entries)
    */
  private def computeSimilarity(input: RDD[(Text, Content)])
  : (RDD[(Long, String)], RDD[MatrixEntry]) ={
    // local variable serialization, otherwise we need to serialize 'this' whole object
    val LOG = this.LOG
    val computer = simComputer

    val rdd = input.filter(t => t._2.getContentType.contains("ml") || t._2.getContentType.contains("text"))//get only text or html
      .map(t => (new Text(t._1), cloneContent(t._2)))

    // parse each page into a DOM tree; records that fail to parse become null
    // and are filtered out below
    var treeRDD: RDD[(Text, TreeNode)] = rdd.map({case (key, content) =>
      var stream: ByteArrayInputStream = null
      var res: (Text, TreeNode) = null
      try {
        stream = new ByteArrayInputStream(content.getContent)
        val parser = new DOMParser()
        parser.parse(new InputSource(stream))
        val doc = parser.getDocument
        val elements = doc.getElementsByTagName("HTML")
        if (elements.getLength > 0) {
          val tree = TreeNode.create(elements.item(0), content.getUrl)
          res = (key, tree)
        }
      } catch {
        case e: Exception =>
          LOG.error(e.getMessage)
          res = null //error case
      } finally {
        IOUtils.closeQuietly(stream)
      }
      res
    }).filter(_ != null)

    treeRDD = treeRDD.persist() //cache here so that spark dont end up re-parsing again and again

    // assign a stable numeric row index to each tree
    val iRdd: RDD[(Long, TreeNode)] = treeRDD
      .zipWithIndex()
      .map({case ((k, tree), idx) => (idx, tree)})

    val idRdd = iRdd.map({case (id, tree) => (id, tree.getExternalId)})
    var pairs = iRdd.cartesian(iRdd)

    // throw away lower diagonal
    pairs = pairs.filter({case ((i, t1), (j, t2)) => i >= j}).cache()
    LOG.info("Num Partitions: {}", pairs.partitions.length)

    val entryRDD: RDD[MatrixEntry] = pairs.flatMap({ case ((i, treeI), (j, treeJ)) =>
      val res =
        if (i == j) {
          //principal diagonal => same tree
          Array(new MatrixEntry(i, j, 1.0))
        } else {
          val score = computer.compute(treeI, treeJ)
          Array(new MatrixEntry(i, j, score), new MatrixEntry(j, i, score)) //symmetry
        }
      //println(f"$i%d x $j%d : ${System.currentTimeMillis() - st}%dms")
      res.toTraversable
    })
    //return ids as well as entries
    (idRdd, entryRDD)
  }
}

/** Companion holding the valid -func names and the CLI entry point. */
object ContentSimilarityComputer {

  val STRUCTURE = "structure"
  val STYLE = "style"

  def main(args: Array[String]) {
    val timer = new Timer
    new ContentSimilarityComputer().run(args)
    println("Time Taken : " + timer.read())
  }
}
43 |

Choose Clusters JSON File

Format = flares.json

44 | 45 |
46 |
47 |

48 | 62 | 63 |
64 | 65 | 66 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /autoext-spark/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | autoext-parent 5 | edu.usc.irds.autoext 6 | 0.2-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | autoext-spark 11 | jar 12 | 13 | autoext-spark 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 2.11 19 | 1.5.0 20 | 1.11 21 | edu.usc.irds.autoext.spark.Main 22 | 23 | 24 | 25 | 26 | ${project.parent.groupId} 27 | autoext 28 | ${project.parent.version} 29 | 30 | 31 | org.apache.spark 32 | spark-core_${scala.version} 33 | ${spark.version} 34 | 35 | 36 | org.apache.spark 37 | spark-mllib_${scala.version} 38 | ${spark.version} 39 | 40 | 41 | org.apache.nutch 42 | nutch 43 | ${nutch.version} 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-client 48 | 49 | 50 | org.apache.httpcomponents 51 | httpclient 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | net.alchim31.maven 63 | scala-maven-plugin 64 | 3.2.1 65 | 66 | 67 | 68 | 69 | 70 | net.alchim31.maven 71 | scala-maven-plugin 72 | 73 | 74 | scala-compile-first 75 | process-resources 76 | 77 | add-source 78 | compile 79 | 80 | 81 | 82 | scala-test-compile 83 | process-test-resources 84 | 85 | testCompile 86 | 87 | 88 | 89 | 90 | 91 | 92 | org.apache.maven.plugins 93 | maven-shade-plugin 94 | 2.3 95 | 96 | 97 | package 98 | 99 | shade 100 | 101 | 102 | false 103 | false 104 | 105 | 106 | 107 | * 108 | 109 | 110 | 111 | 112 | *:* 113 | 114 | META-INF/*.SF 115 | META-INF/*.DSA 116 | META-INF/*.RSA 117 | 118 | 119 | 120 | 121 | 122 | reference.conf 123 | 124 | 125 | 126 | ${exec.mainClass} 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | spark-submit 139 | 140 | ${project.name}-${project.version}-submit-${spark.version}_${scala.version} 141 | 142 | 143 | 144 | org.apache.spark 145 | spark-core_${scala.version} 146 | ${spark.version} 147 | provided 148 | 149 | 150 | org.apache.spark 151 | 
// The MIT License (MIT)
// Copyright (c) 2016 Mateusz Pawlik and Nikolaus Augsten
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package edu.usc.irds.ted.apted.util;

import javax.swing.tree.DefaultMutableTreeNode;
import javax.swing.tree.MutableTreeNode;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Vector;

/**
 * Labelled tree implementation used by the APTED/RTED tree edit distance code.
 * Each node carries a string label, the id of the tree it belongs to, and an
 * arbitrary temporary data slot used by the algorithms.
 *
 * <p>Trees can be (de)serialized to the bracket notation, e.g.
 * {@code {A{B}{C}}} is a root A with children B and C; see
 * {@link #fromString(String)} and {@link #toString()}.</p>
 *
 * <p>NOTE(review): this is vendored third-party code; raw types
 * ({@code Comparable}, {@code Enumeration}, {@code Vector}) are kept as-is to
 * avoid diverging from upstream.</p>
 *
 * @author Nikolaus Augsten and Mateusz Pawlik
 */
public class LblTree extends DefaultMutableTreeNode
        implements Comparable, Serializable {

    private static final long serialVersionUID = -8076996040097372065L;

    /**
     * Creates a node with the given label, belonging to the tree with the
     * given id.
     *
     * @param label  node label (may be null)
     * @param treeID id of the tree this node belongs to; {@code NO_TREE_ID} if unknown
     */
    public LblTree(String label, int treeID)
    {
        // FIX: removed dead stores (treeID was set to -1 and label to null,
        // then both immediately overwritten). Behavior is unchanged.
        this.treeID = treeID;
        this.label = label;
        this.tmpData = null;
        this.nodeID = -1;
    }

    public void setLabel(String label)
    {
        this.label = label;
    }

    public String getLabel()
    {
        return label;
    }

    /** Returns the tree id, always reading it from the root node. */
    public int getTreeID()
    {
        if(isRoot())
            return treeID;
        else
            return ((LblTree)getRoot()).getTreeID();
    }

    /** Sets the tree id, always storing it on the root node. */
    public void setTreeID(int treeID)
    {
        if(isRoot())
            this.treeID = treeID;
        else
            ((LblTree)getRoot()).setTreeID(treeID);
    }

    /** Attaches arbitrary per-node scratch data (used by the TED algorithms). */
    public void setTmpData(Object tmpData)
    {
        this.tmpData = tmpData;
    }

    public Object getTmpData()
    {
        return tmpData;
    }

    /** Prints an indented rendering of the tree to stdout. */
    public void prettyPrint()
    {
        prettyPrint(false);
    }

    /**
     * Prints an indented rendering of the tree to stdout.
     *
     * @param printTmpData also print each node's tmpData
     */
    public void prettyPrint(boolean printTmpData)
    {
        for(int i = 0; i < getLevel(); i++)
            System.out.print(" ");

        if(!isRoot())
        {
            System.out.print("+---+");
        } else
        {
            if(getTreeID() != -1)
                System.out.println((new StringBuilder("treeID: ")).append(getTreeID()).toString());
            System.out.print("*---+");
        }
        System.out.print((new StringBuilder(" '")).append(getLabel()).append("' ").toString());
        if(printTmpData)
            System.out.println(getTmpData());
        else
            System.out.println();
        for(Enumeration e = children(); e.hasMoreElements(); ((LblTree)e.nextElement()).prettyPrint(printTmpData));
    }

    /** Returns the number of nodes in the subtree rooted here (including this node). */
    public int getNodeCount()
    {
        int sum = 1;
        for(Enumeration e = children(); e.hasMoreElements();)
            sum += ((LblTree)e.nextElement()).getNodeCount();

        return sum;
    }

    /**
     * Deletes the node at the given postorder position, splicing its children
     * into its parent at the deleted node's index.
     * Don't try to delete the root node (postorder = size of t).
     *
     * <p>NOTE(review): the traversal is depthFirstEnumeration, which for
     * DefaultMutableTreeNode is a postorder walk — confirm this matches the
     * postorder numbering the callers expect.</p>
     */
    public void deleteNode(int nodePostorder) {
        int i = 0;
        for (Enumeration e = depthFirstEnumeration(); e.hasMoreElements();) {
            i++;
            LblTree s = (LblTree) e.nextElement();
            if (i == nodePostorder) {
                int sIndex = s.getParent().getIndex(s);
                while (s.getChildCount() > 0) {
                    LblTree ch = (LblTree) s.getFirstChild();
                    ((MutableTreeNode) s.getParent()).insert(ch, sIndex);
                    sIndex++;
                }
                s.removeFromParent();
                break;
            }
        }
    }

    /** Renames (relabels) the node at the given postorder position. */
    public void renameNode(int nodePostorder, String label) {
        int i = 0;
        for (Enumeration e = depthFirstEnumeration(); e.hasMoreElements();) {
            i++;
            LblTree s = (LblTree) e.nextElement();
            if (i == nodePostorder) {
                s.setLabel(label);
                break;
            }
        }
    }

    /**
     * Parses a tree from bracket notation, e.g. {@code 3:{A{B}{C}}}
     * (the optional numeric prefix is the tree id).
     */
    public static LblTree fromString(String s)
    {
        int treeID = FormatUtilities.getTreeID(s);
        s = s.substring(s.indexOf("{"), s.lastIndexOf("}") + 1);
        LblTree node = new LblTree(FormatUtilities.getRoot(s), treeID);
        Vector c = FormatUtilities.getChildren(s);
        for(int i = 0; i < c.size(); i++)
            node.add(fromString((String)c.elementAt(i)));

        return node;
    }

    /** Serializes the subtree rooted here to bracket notation (inverse of fromString). */
    public String toString()
    {
        String res = (new StringBuilder("{")).append(getLabel()).toString();
        if(getTreeID() >= 0 && isRoot())
            res = (new StringBuilder(String.valueOf(getTreeID()))).append(":").append(res).toString();
        for(Enumeration e = children(); e.hasMoreElements();)
            res = (new StringBuilder(String.valueOf(res))).append(((LblTree)e.nextElement()).toString()).toString();

        res = (new StringBuilder(String.valueOf(res))).append("}").toString();
        return res;
    }

    /** Orders nodes by label only; NOT consistent with equals (inherited identity). */
    public int compareTo(Object o)
    {
        return getLabel().compareTo(((LblTree)o).getLabel());
    }

    /** Clears tmpData on every node of the subtree rooted here. */
    public void clearTmpData()
    {
        for(Enumeration e = breadthFirstEnumeration(); e.hasMoreElements(); ((LblTree)e.nextElement()).setTmpData(null));
    }

    // NOTE(review): TAB_STRING shows as a single space in this copy of the
    // file; runs of spaces may have been collapsed — verify against upstream.
    public static final String TAB_STRING = " ";
    public static final String ROOT_STRING = "*---+";
    public static final String BRANCH_STRING = "+---+";
    public static final String OPEN_BRACKET = "{";
    public static final String CLOSE_BRACKET = "}";
    public static final String ID_SEPARATOR = ":";
    public static final int HIDE_NOTHING = 0;
    public static final int HIDE_ROOT_LABEL = 1;
    public static final int RENAME_LABELS_TO_LEVEL = 2;
    public static final int HIDE_ALL_LABELS = 3;
    public static final int RANDOM_ROOT_LABEL = 4;
    // NOTE(review): these two are per-instance; they look like they should be
    // static constants, but changing them would alter the serialized form.
    public final int NO_NODE = -1;
    public final int NO_TREE_ID = -1;
    int treeID;       // id of the tree (authoritative copy lives on the root)
    String label;     // node label
    Object tmpData;   // scratch slot for the TED algorithms
    int nodeID;       // unused here; kept for upstream compatibility
}
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.usc.irds.autoext.tree;

import edu.usc.irds.autoext.base.EditCost;
import edu.usc.irds.autoext.base.EditDistanceComputer;
import edu.usc.irds.autoext.utils.MatrixUtils;
import edu.usc.irds.autoext.utils.Timer;
import org.cyberneko.html.parsers.DOMParser;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * This class implements Zhang-Shasha's Tree Edit Distance (ZS-TED) algorithm for computing the
 * edit distance between DOM trees.
 * Computes edit distance between two nodes in DOM tree.
 *
 * References :
 *     K. Zhang and D. Shasha. 1989. Simple fast algorithms for the editing distance between
 *     trees and related problems. SIAM J. Comput. 18, 6 (December 1989), 1245-1262.
 *
 * NOTE(review): generic type parameters (likely EditDistanceComputer of TreeNode,
 * List of TreeNode below) appear to have been stripped from this copy of the
 * file — verify against the original source before editing the code itself.
 *
 * @author Thamme Gowda
 */
public class ZSTEDComputer implements EditDistanceComputer, Serializable {

    public static final Logger LOG = LoggerFactory.getLogger(ZSTEDComputer.class);
    private static final long serialVersionUID = 2054631459801383484L;
    // cost model for insert/delete/rename; DefaultEditCost is assumed symmetric
    // by computeDistanceMatrix() only when isSymmetric() reports so
    private EditCost costMetric = new DefaultEditCost();

    /**
     * CLI argument specification: either a pair of files (-in1/-in2) or a
     * directory of html files (-dir), mutually exclusive.
     */
    private static class CliArg {
        @Option(name = "-in1", forbids = {"-dir"}, depends = "-in2")
        private File html1;

        @Option(name = "-in2", forbids = {"-dir"}, depends = "-in1")
        private File html2;

        @Option(name = "-dir", forbids = {"-in1", "-in2"})
        private File inputDir;
    }

    /**
     * Computes the ZS-TED distance between two trees; logs timing at debug level.
     */
    @Override
    public double computeDistance(TreeNode tree1, TreeNode tree2) {
        long st = System.currentTimeMillis();
        double distance = new ZSTEDistance(tree1, tree2, costMetric).compute();
        if (LOG.isDebugEnabled()) {
            long time = System.currentTimeMillis() - st;
            LOG.debug("Time={}, dist={}, obj1={}, obj2={}", time, distance,
                    tree1.getExternalId(), tree2.getExternalId());
        }
        return distance;
    }

    @Override
    public EditCost getCostMetric() {
        return costMetric;
    }

    /**
     * Computes edit distance between two html files.
     * @param file1 first html file
     * @param file2 second html file
     * @return edit distance measure
     * @throws IOException when an error occurs
     * @throws SAXException when parser fails
     */
    public static double computeDistance(File file1, File file2)
            throws IOException, SAXException {
        // NOTE(review): FileReader uses the platform charset; the html parser
        // normally sniffs encodings from bytes — consider an InputStream here.
        DOMParser domParser = new DOMParser();
        domParser.parse(new InputSource(new FileReader(file1)));
        Document doc1 = domParser.getDocument();
        domParser.reset(); // reuse the parser for the second document
        domParser.parse(new InputSource(new FileReader(file2)));
        Document doc2 = domParser.getDocument();

        ZSTEDComputer computer = new ZSTEDComputer();
        return computer.computeDistance(new TreeNode(doc1, null), new TreeNode(doc2, null));
    }

    /**
     * Computes edit distances between trees.
     * @param trees list of trees who's edit distance is to be computed
     * @return an nxn square matrix with edit distance measure
     */
    public double[][] computeDistanceMatrix(List trees){

        int n = trees.size();
        double distanceMatrix[][] = new double[n][n];
        boolean symmetricMeasure = getCostMetric().isSymmetric();

        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                if (i == j){
                    //diagonal, same file, distance is zero
                    distanceMatrix[i][j] = 0.0;
                } else if (symmetricMeasure && i > j) {
                    // lower diagonal and the measure is a symmetry: reuse the
                    // already-computed mirror entry
                    distanceMatrix[i][j] = distanceMatrix[j][i];
                } else {
                    // upper diagonal or unsymmetrical, compute it
                    distanceMatrix[i][j] = computeDistance(trees.get(i), trees.get(j));
                }
            }
        }
        return distanceMatrix;
    }

    /**
     * Computes and prints the pairwise edit distance and similarity matrices
     * for all html/xml files directly inside a directory.
     * @param inputDir directory of html pages
     * @throws IOException on read failure
     * @throws SAXException on parse failure
     */
    private static void computeDistances(File inputDir) throws IOException, SAXException {

        File[] files = inputDir.listFiles();
        List docs = new ArrayList<>();
        List htmlPaths = new ArrayList<>();
        DOMParser parser = new DOMParser();
        for (File file : files) {
            if (!file.isFile()) {
                //skip directories and other non-regular entries
                continue;
            }
            try(FileReader reader = new FileReader(file)) {
                parser.parse(new InputSource(reader));
                htmlPaths.add(file.getAbsolutePath());
                docs.add(new TreeNode(parser.getDocument(), null));
                parser.reset();
            }
        }
        int n = docs.size();
        if (n < 2) {
            throw new RuntimeException("At least 2 html/xml files should be present in the input directory");
        }

        ZSTEDComputer edComputer = new ZSTEDComputer();
        StructureSimComputer simComputer = new StructureSimComputer(edComputer);
        double[][] distMatrix = edComputer.computeDistanceMatrix(docs);
        int treeSizes[] = new int[n];
        for (int i = 0; i < docs.size(); i++) {
            treeSizes[i] = docs.get(i).getSize();
        }
        double[][] simMatrix = simComputer.compute(treeSizes, distMatrix);

        System.out.println("#Index\tFile Path");
        for (int i = 0; i < htmlPaths.size(); i++) {
            System.out.println(i + "\t" + htmlPaths.get(i));
        }
        System.out.println("\n#Distance Matrix");
        MatrixUtils.printMatrix(distMatrix);

        System.out.println("\n#Similarity Matrix");
        MatrixUtils.printMatrix(simMatrix);
    }

    /** CLI entry point: pairwise (-in1/-in2) or directory (-dir) mode. */
    public static void main(String[] args) throws IOException, SAXException {
        //args = "-in1 src/test/resources/html/simple/1.html -in2 src/test/resources/html/simple/2.html".split(" ");
        //args = "-dir src/test/resources/html/simple/".split(" ");
        CliArg arg = new CliArg();
        CmdLineParser parser = new CmdLineParser(arg);
        try {
            parser.parseArgument(args);
            if (arg.inputDir == null && arg.html1 == null) {
                throw new CmdLineException(parser, "Either -dir or -in1 should be given");
            }
        } catch (CmdLineException e) {
            System.out.println(e.getMessage());
            parser.printUsage(System.out);
            System.exit(1);
        }
        if (arg.inputDir != null) {
            computeDistances(arg.inputDir);
        } else {
            double distance = computeDistance(arg.html1, arg.html2);
            System.out.println("Distance=" + distance);
        }

    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.usc.irds.autoext.spark

import java.util

import edu.usc.irds.autoext.cluster.SharedNeighborClusterer._
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.kohsuke.args4j.Option

import scala.collection.JavaConversions._
import scala.collection.mutable

/**
  * Shared Near Neighbor Clustering implemented using GraphX on spark.
  * Reads a similarity matrix (object files of [[MatrixEntry]]), repeatedly
  * collapses clusters that share enough neighbors, and writes one CSV line
  * per final cluster.
  *
  * @author Thamme Gowda N.
  */
class SharedNeighborCuster extends IOSparkJob {

  @Option(name = "-sim", aliases = Array("--similarityThreshold"),
    usage = "if two items have similarity above this value," +
      " then they will be treated as neighbors. Range[0.0, 1.0]")
  var similarityThreshold: Double = 0.7

  @Option(name = "-share", aliases = Array("--sharingThreshold"),
    usage = "if the percent of similar neighbors in clusters exceeds this value," +
      " then those clusters will be collapsed/merged into same cluster. Range:[0.0, 1.0]")
  var sharedNeighborThreshold: Double = 0.8

  @Option(name = "-d3export", usage = "Exports data to d3 JSON format")
  var d3Export = false

  /**
    * Clusters the items based on similarity using shared near neighbors.
    *
    * @param simMatrix    An RDD of Matrix Entries
    * @param simThreshold threshold for treating entries as neighbors
    * @param snThreshold  threshold for clubbing the cluster (Shared Neighbors)
    * @return graph after running clustering algorithm
    */
  def cluster(simMatrix: RDD[MatrixEntry], simThreshold: Double,
              snThreshold: Double): Graph[VertexData, Double] = {

    var entryRDD = simMatrix
    // STEP: initial set of neighbors — drop pairs below the similarity cutoff
    entryRDD = entryRDD.filter(e => e.value >= simThreshold).cache()

    // STEP: initial edges; keep only one edge out of (i,j)/(j,i)
    var edges = entryRDD.filter(e => e.i < e.j)
      .map(e => { Edge(e.i, e.j, e.value) })

    // STEP: initial vertices; each item starts in its own cluster, and its
    // neighbor set is stored as a BitSet indexed by vertex id.
    var vertices = entryRDD
      .map(t => (t.i, (t.j, t.value)))
      .groupByKey()
      .map({ case (vId, ns) =>
        val assignments = new mutable.HashSet[Long]()
        assignments.add(vId) // item belongs to its own cluster
        val neighbors = new util.BitSet()
        neighbors.set(vId.toInt, true)
        ns.foreach({ case (nId, v) => neighbors.set(nId.toInt, true) }) //FIXME: converting long to int here, possible overflow

        (vId, new VertexData(neighbors, assignments))
      })

    var graph: Graph[VertexData, Double] = null
    var hasMoreIteration = true
    var iterations: Int = 0

    while (hasMoreIteration) {
      iterations += 1
      graph = Graph(vertices.distinct(), edges.distinct()).cache()
      println(s" Before Iteration $iterations :: Num vertices=${graph.vertices.count()}," +
        s" Num Edges=${graph.edges.count()}")

      // STEP: collapse similar clusters.
      val replacementRdd = graph.triplets
        // keep only triplets whose endpoints are mutual neighbours.
        // NOTE(review): both checks read srcAttr — the src-id bit is always set
        // at construction, so this looks like it was meant to be
        // dstAttr.neighbors.get(srcId); confirm before changing behavior.
        .filter(et => et.srcAttr.neighbors.get(et.srcId.toInt) && et.srcAttr.neighbors.get(et.dstId.toInt))
        // transform to (larger id, (neighbor overlap, smaller id))
        .map(et => (Math.max(et.srcId, et.dstId),
          (findOverlap(et.srcAttr.neighbors, et.dstAttr.neighbors), Math.min(et.srcId, et.dstId))))
        // keep only pairs whose shared-neighbor fraction exceeds the threshold
        .filter(_._2._1 >= snThreshold)
        // when there are multiple target assignments, reduce to choose one
        .reduceByKey({
          case ((sim1, cluster1), (sim2, cluster2)) =>
            if (Math.abs(sim1 - sim2) < 1e-6) { // similarity ties: pick smaller numeric index
              (sim1, Math.min(cluster1, cluster2))
            } else if (sim1 > sim2) { // otherwise the most similar cluster wins
              (sim1, cluster1)
            } else {
              (sim2, cluster2)
            }
        })
        .mapValues(_._2) // similarity measure no longer required, keep target only

      // TreeMap keeps the keys sorted in ascending order
      val replacements = new util.TreeMap[Long, Long](replacementRdd.collectAsMap())
      // resolve transitive replacements {2=1, 3=2} => {2=1, 3=1}
      // (put() on an existing key does not structurally modify the TreeMap,
      // so mutating values while iterating keySet() is safe)
      for (k <- replacements.keySet()) { //TODO: if possible, do distributed computation
        var key = replacements.get(k)
        while (replacements.containsKey(key)) {
          key = replacements.get(key)
        }
        replacements.put(k, key)
      }

      println(s"Number of clusters collapsed :: ${replacements.size()}")
      // iterate again only if something was merged this round
      hasMoreIteration = !replacements.isEmpty
      if (hasMoreIteration) {
        // STEP: vertices for the next iteration — merged vertices hand their
        // data to their target; surviving vertices re-point their neighbor bits.
        vertices = graph.vertices
          .map({ case (vId, data) =>
            if (replacements.containsKey(vId)) {
              (replacements.get(vId), data) // pass data of this vertex to the assigned vertex
            } else {
              for ((k, v) <- replacements if v != vId && data.neighbors.get(k.toInt)) {
                // for all replacements which were its neighbors:
                // unset k'th bit and set v'th bit
                data.neighbors.set(k.toInt, false)
                data.neighbors.set(v.toInt, true)
              }
              (vId, data)
            }
          })
          .reduceByKey({ case (data1, data2) =>
            data1.items ++= data2.items; data1 // items are joined
          })

        // STEP: edges for the next iteration, re-pointed at merged targets.
        edges = graph.edges.map(e => {
          // FIX: was `replacements.contains(e.dstId)` — worked only via the
          // implicit JavaConversions wrapper; containsKey is the direct,
          // equivalent call and matches the srcId check.
          if (replacements.containsKey(e.srcId) || replacements.containsKey(e.dstId)) {
            // affected edge
            Edge(replacements.getOrElse(e.srcId, e.srcId),
              replacements.getOrElse(e.dstId, e.dstId), e.attr)
          } else { // unaffected
            e
          }
        }).filter(e => e.srcId != e.dstId) // drop looping edges
      }
    }
    graph
  }

  /** Entry point: read matrix entries, cluster, save CSV, optionally export D3 JSON. */
  def run(): Unit = {

    // STEP: input set of similarity matrix entries
    val matrixEntries: RDD[MatrixEntry] = sc.union(
      getInputPaths().map(sc.objectFile[MatrixEntry](_)))

    // STEP: cluster
    val graph = cluster(matrixEntries, similarityThreshold, sharedNeighborThreshold)

    // STEP: format output — one line per cluster: "id,size,member1,member2,..."
    val clusters = graph.vertices.map({ case (id, data) =>
      s"$id,${data.items.size}," + data.items.mkString(",")
    }).cache()

    // STEP: save output
    clusters.saveAsTextFile(outPath)
    println(s"Total Clusters = ${clusters.count()}");

    // Optional STEP: D3 export
    if (d3Export) {
      val d3exp = new D3Export
      d3exp.sc = sc
      d3exp.inputPath = outPath
      d3exp.outPath = s"$outPath.json"
      LOG.info(s"Exporting D3 file at ${d3exp.outPath}")
      d3exp.run()
    }
    LOG.info("All Done")
  }
}

/**
  * Per-vertex clustering state: the neighbor bit set (indexed by vertex id)
  * and the ids of items merged into this cluster.
  */
class VertexData(val neighbors: util.BitSet, val items: mutable.HashSet[Long])
  extends Serializable {}

object SharedNeighborCuster {

  def main(args: Array[String]) {
    new SharedNeighborCuster().run(args)
  }
}
// The MIT License (MIT)
// Copyright (c) 2016 Mateusz Pawlik and Nikolaus Augsten
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package edu.usc.irds.ted.apted;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Date;

import edu.usc.irds.ted.apted.util.LblTree;

/**
 * This is the command line access for running RTED algorithm.
 * Parses arguments, reads the two input trees (from literals or files),
 * runs APTED, and prints the distance (optionally with timing).
 *
 * @author Mateusz Pawlik
 */
public class RTEDCommandLine {

    // NOTE(review): the alignment whitespace inside this literal appears to
    // have been collapsed in this copy of the file — verify against the
    // upstream APTED source before shipping; the text content itself is code
    // and is reproduced here exactly as found.
    private String helpMessage =
            "\n" +
            "Compute the edit distance between two trees.\n" +
            "\n" +
            "SYNTAX\n" +
            "\n" +
            " java -jar APTED.jar {-t TREE1 TREE2 | -f FILE1 FILE2} [-c CD CI CR] [-v]\n" +
            "\n" +
            " java -jar APTED.jar -h\n" +
            "\n" +
            "DESCRIPTION\n" +
            "\n" +
            " Compute the edit distance between two trees with APTED algorithm [1,2].\n" +
            "\n" +
            "OPTIONS\n" +
            "\n" +
            " -h, --help \n" +
            " print this help message.\n" +
            "\n" +
            " -t TREE1 TREE2,\n" +
            " --trees TREE1 TREE2\n" +
            " compute the tree edit distance between TREE1 and TREE2. The\n" +
            " trees are encoded in the bracket notation, for example, in tree\n" +
            " {A{B{X}{Y}{F}}{C}} the root node has label A and two children\n" +
            " with labels B and C. B has three children with labels X, Y, F.\n" +
            "\n" +
            " -f FILE1 FILE2, \n" +
            " --files FILE1 FILE2\n" +
            " compute the tree edit distance between the two trees stored in\n" +
            " the files FILE1 and FILE2. The trees are encoded in bracket\n" +
            " notation.\n" +
            "\n" +
            " -c CD CI CR, \n" +
            " --costs CD CI CR\n" +
            " set custom cost for edit operations. Default is -c 1 1 1.\n" +
            " CD - cost of node deletion\n" +
            " CI - cost of node insertion\n" +
            " CR - cost of node renaming\n" +
            "\n" +
            " -v, --verbose\n" +
            " print verbose output, including tree edit distance, runtime,\n" +
            " number of relevant subproblems and strategy statistics.\n" +
            "\n" +
            "EXAMPLES\n" +
            "\n" +
            " java -jar APTED.jar -t {a{b}{c}} {a{b{d}}} -c 1 1 0.5\n" +
            " java -jar APTED.jar -f 1.tree 2.tree\n" +
            " java -jar APTED.jar -t {a{b}{c}} {a{b{d}}} -v\n" +
            "\n" +
            "REFERENCES\n" +
            "\n" +
            " [1] M. Pawlik and N. Augsten. Efficient Computation of the Tree Edit\n" +
            " Distance. ACM Transactions on Database Systems (TODS) 40(1). 2015.\n" +
            " [2] M. Pawlik and N. Augsten. Tree edit distance: Robust and memory-\n" +
            " efficient. Information Systems 56. 2016.\n" +
            " [3] M. Pawlik and N. Augsten. RTED: A Robust Algorithm for the Tree Edit\n" +
            " Distance. PVLDB 5(4). 2011.\n" +
            "\n" +
            "AUTHORS\n" +
            "\n" +
            " Mateusz Pawlik, Nikolaus Augsten";

    private String wrongArgumentsMessage = "Wrong arguments. Try \"java -jar RTED.jar --help\" for help.";

    // parsed input trees and their node counts
    private LblTree lt1, lt2;
    // NOTE(review): size1/size2 are written but never read in this file
    private int size1, size2;
    // NOTE(review): only `run` and `verbose` are used by this class; the other
    // flags look like leftovers from the original RTED command line
    private boolean run, custom, array, strategy, ifSwitch, sota, verbose, demaine, mapping;
    private int sotaStrategy;
    private String customStrategy, customStrategyArrayFile;
    private APTED rted;
    private double ted;

    /**
     * Main method
     *
     * @param args command line arguments, see helpMessage
     */
    public static void main(String[] args) {
        RTEDCommandLine rtedCL = new RTEDCommandLine();
        rtedCL.runCommandLine(args);
    }

    /**
     * Run the command line with given arguments.
     * Exits the process (status 0) on bad arguments or --help.
     *
     * @param args command line arguments
     */
    public void runCommandLine(String[] args) {
        rted = new APTED(1, 1, 1); // default unit costs; may be overridden by -c

        try {
            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("--help") || args[i].equals("-h")) {
                    System.out.println(helpMessage);
                    System.exit(0);
                } else if (args[i].equals("-t") || args[i].equals("--trees")) {
                    parseTreesFromCommandLine(args[i+1], args[i+2]);
                    i = i+2; // consumed two extra arguments
                    run = true;
                } else if (args[i].equals("-f") || args[i].equals("--files")) {
                    parseTreesFromFiles(args[i+1], args[i+2]);
                    i = i+2;
                    run = true;
                } else if (args[i].equals("-c") || args[i].equals("--costs")) {
                    setCosts(args[i+1], args[i+2], args[i+3]);
                    i = i+3;
                } else if (args[i].equals("-v") || args[i].equals("--verbose")) {
                    verbose = true;
                } else {
                    System.out.println(wrongArgumentsMessage);
                    System.exit(0);
                }
            }

        } catch (ArrayIndexOutOfBoundsException e) {
            // an option was given without its required value(s)
            System.out.println("Too few arguments.");
            System.exit(0);
        }

        if (!run) {
            System.out.println(wrongArgumentsMessage);
            System.exit(0);
        }

        long time1 = (new Date()).getTime();
        ted = rted.nonNormalizedTreeDist(lt1, lt2);
        long time2 = (new Date()).getTime();
        if (verbose) {
            System.out.println("distance: " + ted);
            System.out.println("runtime: " + ((time2 - time1) / 1000.0));
        } else {
            System.out.println(ted);
        }
    }

    /**
     * Parse two input trees from the command line (bracket notation).
     * Exits the process on malformed input.
     *
     * @param ts1 first tree, bracket notation
     * @param ts2 second tree, bracket notation
     */
    private void parseTreesFromCommandLine(String ts1, String ts2) {
        try {
            lt1 = LblTree.fromString(ts1);
            size1 = lt1.getNodeCount();
        } catch (Exception e) {
            System.out.println("TREE1 argument has wrong format");
            System.exit(0);
        }
        try {
            lt2 = LblTree.fromString(ts2);
            size2 = lt2.getNodeCount();
        } catch (Exception e) {
            System.out.println("TREE2 argument has wrong format");
            System.exit(0);
        }
    }

    /**
     * Parse two input trees from given files (first line of each, bracket
     * notation). Exits the process on malformed input.
     *
     * @param fs1 path to first tree file
     * @param fs2 path to second tree file
     */
    private void parseTreesFromFiles(String fs1, String fs2) {
        // NOTE(review): the BufferedReaders are never closed; harmless for a
        // one-shot CLI but worth fixing upstream with try-with-resources.
        try {
            lt1 = LblTree.fromString((new BufferedReader(new FileReader(fs1))).readLine());
            size1 = lt1.getNodeCount();
        } catch (Exception e) {
            System.out.println("TREE1 argument has wrong format");
            System.exit(0);
        }
        try {
            lt2 = LblTree.fromString((new BufferedReader(new FileReader(fs2))).readLine());
            size2 = lt2.getNodeCount();
        } catch (Exception e) {
            System.out.println("TREE2 argument has wrong format");
            System.exit(0);
        }
    }

    /**
     * Set custom costs. Exits the process when a value is not a float.
     *
     * @param cds deletion cost
     * @param cis insertion cost
     * @param cms rename cost
     */
    private void setCosts(String cds, String cis, String cms) {
        try {
            rted.setCustomCosts(Float.parseFloat(cds), Float.parseFloat(cis), Float.parseFloat(cms));
        } catch (Exception e) {
            System.out.println("One of the costs has wrong format.");
            System.exit(0);
        }
    }


}
package edu.usc.irds.autoext.cluster;

import edu.usc.irds.autoext.tree.GrossSimComputer;
import edu.usc.irds.autoext.tree.StructureSimComputer;
import edu.usc.irds.autoext.tree.TreeNode;
import edu.usc.irds.autoext.tree.ZSTEDComputer;
import edu.usc.irds.autoext.utils.MatrixUtils;
import edu.usc.irds.autoext.utils.ParseUtils;
import edu.usc.irds.autoext.utils.Timer;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Command line interface to the tree similarity and clustering algorithms.
 * <p>
 * Reads a list of HTML file paths, parses each into a {@link TreeNode},
 * computes a pairwise gross similarity matrix, clusters the items with
 * {@link SharedNeighborClusterer}, and writes the intermediate artifacts
 * plus a timing report into a work directory.
 */
public class FileClusterer {

    public static final Logger LOG = LoggerFactory.getLogger(FileClusterer.class);
    public static final String IDS_FILE = "ids.txt";
    public static final String ED_DIST_FILE = "edit-distance.csv";
    public static final String TREE_SIM_FILE = "tree-sim.csv";
    public static final String GROSS_SIM_FILE = "gross-sim.csv";
    public static final String CLUSTER_FILE = "clusters.txt";
    public static final String REPORT_FILE = "report.txt";
    public static final char SEP = ',';

    @Option(name = "-list",
            required = true,
            usage = "path to a file containing paths to html files that requires clustering")
    private File listFile;

    @Option(name = "-workdir",
            required = true,
            usage = "Path to directory to create intermediate files and reports")
    private File workDir;

    /**
     * Runs the whole pipeline: parse -&gt; similarity matrix -&gt; cluster -&gt; report.
     *
     * @throws IOException when the input list cannot be read or an artifact
     *                     cannot be written to the work directory
     */
    public void cluster() throws IOException {

        LOG.info("Create work directory ? {} ", workDir.mkdirs());
        File reportFile = new File(workDir, REPORT_FILE);
        try (PrintWriter report = new PrintWriter(
                new BufferedWriter(new FileWriter(reportFile)))) {
            Timer mainTimer = new Timer();
            Timer timer = new Timer();
            report.printf("Starting at : %d\n", timer.getStart());
            report.printf("Input specified : %s\n", listFile.getAbsolutePath());

            AtomicInteger skipCount = new AtomicInteger(0);
            List<TreeNode> trees = readTrees(skipCount);
            List<String> labels = new ArrayList<>(trees.size());
            for (TreeNode tree : trees) {
                labels.add(tree.getExternalId());
            }
            report.printf("Parsed %d files and skipped %d files \n", trees.size(), skipCount.get());
            report.printf("Work Directory :%s\n", workDir.getAbsolutePath());
            report.printf("Time taken to parse : %dms\n", timer.reset());

            //Step1: write ids/paths to separate file
            File idsFile = new File(workDir, IDS_FILE);
            Files.write(idsFile.toPath(), labels, Charset.forName("UTF-8"));
            LOG.info("Wrote paths to {} ", idsFile.toPath());
            report.printf("Wrote %d ids to %s file in %dms\n", labels.size(), idsFile, timer.reset());

            //Step 2: Compute similarity and store to file
            GrossSimComputer<TreeNode> simComputer = GrossSimComputer.createWebSimilarityComputer();
            timer.reset();
            double[][] similarityMatrix = MatrixUtils.computeSymmetricMatrix(simComputer, trees);
            report.printf("Computed Gross similarity matrix in %dms\n", timer.reset());
            File similarityFile = new File(workDir, GROSS_SIM_FILE);
            writeToCSV(similarityMatrix, similarityFile);
            report.printf("Stored similarity matrix in %dms\n", timer.reset());

            // NOTE(review): ED_DIST_FILE and TREE_SIM_FILE constants exist but the
            // intermediate steps producing them are not implemented here -- the
            // step numbering jumps from 2 to 5. Confirm whether that is intended.
            //STEP 5: cluster
            SharedNeighborClusterer clusterer = new SharedNeighborClusterer();
            //TODO: make these configurable
            double similarityThreshold = 0.75;
            int k = 100;
            report.printf("Clustering:: SimilarityThreshold=%f," +
                    " no. of neighbors:%d\n", similarityThreshold, k);
            List<List<String>> clusters = clusterer.cluster(similarityMatrix,
                    labels.toArray(new String[0]), similarityThreshold, k);
            report.printf("Computed clusters in %dms\n", timer.reset());
            File clustersFile = new File(workDir, CLUSTER_FILE);
            writeClusters(clusters, clustersFile);
            report.printf("Wrote clusters in %dms\n", timer.reset());
            report.printf("Done! Total time = %dms\n", mainTimer.read());
        }
        LOG.info("Done.. Report stored in {} ", reportFile.getAbsolutePath());
    }

    /**
     * Parses the listed files and builds trees.
     * Lines that are blank or start with {@code #} are ignored; files that fail
     * to parse are skipped (and counted) rather than aborting the run.
     *
     * @param skipCounter counter incremented for every file that is skipped
     * @return list of trees read
     * @throws IOException when the list file itself cannot be read
     */
    private List<TreeNode> readTrees(AtomicInteger skipCounter) throws IOException {
        List<TreeNode> trees = new ArrayList<>();

        List<String> lines = Files.readAllLines(listFile.toPath(), Charset.forName("UTF-8"));
        for (String line : lines) {
            line = line.trim();
            if (line.isEmpty() || line.startsWith("#")) {
                continue;
            }
            try {
                Document doc = ParseUtils.parseFile(line);
                TreeNode tree = new TreeNode(doc.getDocumentElement(), null);
                tree.setExternalId(line);
                trees.add(tree);
            } catch (IOException | SAXException e) {
                skipCounter.incrementAndGet();
                LOG.error("Skip : {}, reason:{}", line, e.getMessage());
            }
        }
        return trees;
    }

    /**
     * Writes clusters to a clusters file.
     *
     * @param clusters   the clusters list
     * @param outputFile output file
     * @throws IOException when an io error occurs
     */
    public void writeClusters(List<List<String>> clusters, File outputFile) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
            writer.write("##Total Clusters:" + clusters.size() + "\n");
            for (int i = 0; i < clusters.size(); i++) {
                writer.write("\n#Cluster:" + i + "\n");
                List<String> ids = clusters.get(i);
                for (String id : ids) {
                    writer.write(id);
                    writer.write("\n");
                }
            }
        }
    }

    /**
     * Writes given matrix to CSV file.
     *
     * @param matrix  the matrix or table
     * @param csvFile the target csv file
     * @throws IOException when an IO error occurs
     */
    private void writeToCSV(double[][] matrix, File csvFile) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(csvFile))) {
            for (double[] row : matrix) {
                // BUGFIX: the old code read row[0] unconditionally and threw
                // ArrayIndexOutOfBoundsException for an empty row
                if (row.length == 0) {
                    writer.write('\n');
                    continue;
                }
                writer.write(String.valueOf(row[0]));
                for (int i = 1; i < row.length; i++) {
                    writer.append(SEP).append(String.valueOf(row[i]));
                }
                writer.write('\n');
            }
        }
    }

    /**
     * CLI entry point; parses {@code -list} and {@code -workdir} options and
     * runs {@link #cluster()}.
     */
    public static void main(String[] args) throws IOException {
        //args = "-list in.list -workdir simple-work".split(" ");
        FileClusterer instance = new FileClusterer();
        CmdLineParser parser = new CmdLineParser(instance);
        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.out.println(e.getLocalizedMessage());
            parser.printUsage(System.out);
            System.exit(1);
        }
        instance.cluster();
    }
}
package edu.usc.irds.autoext.cluster;

import edu.usc.irds.autoext.tree.StructureSimComputer;
import edu.usc.irds.autoext.tree.TreeNode;
import edu.usc.irds.autoext.tree.ZSTEDComputer;
import edu.usc.irds.autoext.utils.Checks;
import edu.usc.irds.autoext.utils.ParseUtils;
import edu.usc.irds.autoext.utils.Tuple2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Comparator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * Shared-nearest-neighbor clusterer.
 * <p>
 * References:
 * <pre>
 * Jarvis, R.A.; Patrick, Edward A., "Clustering Using a Similarity Measure Based on Shared Near Neighbors,"
 * in Computers, IEEE Transactions on , vol.C-22, no.11, pp.1025-1034, Nov. 1973
 * </pre>
 *
 * @author Thamme Gowda N
 */
public class SharedNeighborClusterer {

    public static final Logger LOG = LoggerFactory.getLogger(SharedNeighborClusterer.class);

    /** Orders (similarity, index) tuples by descending similarity. */
    public static Comparator<Tuple2<Double, Integer>> descendingComparator =
            new Comparator<Tuple2<Double, Integer>>() {
                @Override
                public int compare(Tuple2<Double, Integer> o1, Tuple2<Double, Integer> o2) {
                    return Double.compare(o2.pos0, o1.pos0);
                }
            };

    /**
     * checks if the clusters needs to be merged into one
     * @param i cluster 1
     * @param j cluster 2
     * @param simThreshold minimum threshold to consider if the clusters are similar
     * @return true if clusters are similar; false otherwise
     */
    public static boolean areClustersSimilar(BitSet i, BitSet j, double simThreshold) {
        return findOverlap(i, j) >= simThreshold;
    }

    /**
     * Finds the overlap between two bit sectors (Jaccard: |A∩B| / |A∪B|).
     * @param first first bitset
     * @param second second bitset
     * @return overlap percent normalized to [0.0 1.0]
     */
    public static double findOverlap(BitSet first, BitSet second) {
        int a = first.cardinality();
        int b = second.cardinality();
        BitSet intersection = (BitSet) first.clone();
        intersection.and(second);
        int aIntersectB = intersection.cardinality();
        return (double) aIntersectB / (a + b - aIntersectB);
    }

    /**
     * Clusters documents.
     * @param simMatrix similarity matrix, values in between [0.0 to 1.0] inclusive
     * @param labels labels for items in similarity matrix
     * @param simThreshold similarity threshold to treat that the items are similar, usually &gt;= 0.8
     * @param k number of nearest neighbours to start with
     * @return list of clusters, each cluster being a list of labels
     */
    public List<List<String>> cluster(double[][] simMatrix, String[] labels,
                                      double simThreshold,
                                      int k) {
        long statTime = System.currentTimeMillis();
        Checks.check(simMatrix.length == labels.length,
                "Couldn't match labels to similarity matrix ");

        //computing the table: one bitset of nearest neighbors per item
        // (ArrayList instead of LinkedList: the loops below use indexed get(),
        //  which is O(n) per call on a linked list)
        List<BitSet> table = new ArrayList<>(simMatrix.length);
        for (double[] simRow : simMatrix) {
            table.add(findNearestNeighbors(simRow, simThreshold, k));
        }

        int maxIterations = 100;
        LOG.debug("Starting to cluster {} elements, max iterations={}",
                labels.length, maxIterations);
        int iteration = 0;
        int numCollapses;
        do {
            numCollapses = 0;
            long st = System.currentTimeMillis();
            for (int i = 0; i < table.size(); i++) {
                for (int j = i + 1; j < table.size(); j++) {
                    if (areClustersSimilar(table.get(i), table.get(j), simThreshold)) {
                        // threshold or more neighbors in the intersection, collapse this cluster
                        numCollapses++;
                        // drop j
                        table.remove(j);
                        // replace j's index with i's index everywhere else.
                        // BUGFIX: the old loop started at l = j, so rows before j
                        // that referenced bit j were left stale; scan all rows.
                        for (int l = 0; l < table.size(); l++) {
                            BitSet set = table.get(l);
                            if (set.get(j)) {
                                set.clear(j);
                                set.set(i);
                            }
                        }
                        // BUGFIX: after remove(j) the next row shifts into position j;
                        // step back so it is not skipped in this pass.
                        j--;
                        // NOTE(review): row positions and bit indices diverge once rows
                        // have been removed; the original FIXME about the replacement
                        // strategy still applies -- verify with a regression test.
                    }
                }
            }
            LOG.debug("Iteration {} took {}ms", iteration, (System.currentTimeMillis() - st));
            LOG.debug("Iteration {} made {} collapses, num Clusters = {}", iteration, numCollapses, table.size());
            iteration++;
        } while (numCollapses > 0 && iteration < maxIterations);
        //if you found this code interesting, the credit goes to
        // authors of paper "Clustering Using a Similarity Measure Based on Shared Near Neighbors"

        List<List<String>> clusters = makeClusters(labels, table);
        LOG.info("Formed {} clusters from {} items, in {}ms time",
                clusters.size(), labels.length, System.currentTimeMillis() - statTime);
        return clusters;
    }

    /**
     * Constructs clusters of input items.
     * @param labels item names or identifiers
     * @param table bitset table having cluster information (bits index {@code labels})
     * @return List of clusters
     */
    private List<List<String>> makeClusters(String[] labels, List<BitSet> table) {
        List<List<String>> result = new ArrayList<>();
        for (BitSet bs : table) {
            List<String> cluster = new ArrayList<>();
            for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
                // operate on index i here
                cluster.add(labels[i]);
                if (i == Integer.MAX_VALUE) {
                    break; // or (i+1) would overflow
                }
            }
            result.add(cluster);
        }
        return result;
    }

    /**
     * Finds nearest neighbors based on similarity measures.
     * @param similarity similarity measure of an item with all possible nodes
     * @param simThreshold cut off similarity to make the computations faster.
     *                     Anything below this will be ignored
     * @param k number of neighbors to be picked at max
     * @return bit sequence representation of nearest neighbors
     */
    public BitSet findNearestNeighbors(double[] similarity, double simThreshold, int k) {

        SortedSet<Tuple2<Double, Integer>> nearests = new TreeSet<>(descendingComparator);
        int n = similarity.length;
        // the given node itself will have 1.0 score which is the highest similarity,
        // so no need to add it at the zeroth position explicitly
        for (int i = 0; i < n; i++) {
            if (similarity[i] >= simThreshold) {
                nearests.add(new Tuple2<>(similarity[i], i));
            }
        }
        BitSet nearestNeighbors = new BitSet();
        int count = 0;
        for (Tuple2<Double, Integer> nearest : nearests) {
            nearestNeighbors.set(nearest.pos1);
            count++;
            if (count >= k) {
                //pick nearest k
                break;
            }
        }
        return nearestNeighbors;
    }

    /**
     * Demo entry point: clusters all HTML files found directly under the given
     * directory and prints the resulting clusters.
     */
    public static void main(String[] args) throws IOException, SAXException {
        if (args.length != 1) {
            System.err.println("Invalid Args!");
            System.err.println("Usage : ");
            System.exit(1);
            return;
        }
        String dir = args[0];
        File[] files = new File(dir).listFiles();
        // BUGFIX: listFiles() returns null when 'dir' does not exist or is not readable
        if (files == null) {
            System.err.println("Not a readable directory: " + dir);
            System.exit(1);
            return;
        }
        // BUGFIX: derive names from the same listing; the old separate list() call
        // was a second directory scan whose ordering is not guaranteed to match
        // listFiles(), which could mislabel every cluster member
        String[] fileNames = new String[files.length];
        for (int i = 0; i < files.length; i++) {
            fileNames[i] = files[i].getName();
        }

        StructureSimComputer computer = new StructureSimComputer(new ZSTEDComputer());
        List<TreeNode> nodes = new ArrayList<>();
        for (File file : files) {
            Document doc = ParseUtils.parseFile(file.getAbsolutePath());
            nodes.add(new TreeNode(doc, null));
        }
        System.out.println("Number of trees found :" + nodes.size());
        double[][] sims = computer.compute(nodes);
        SharedNeighborClusterer clusterer = new SharedNeighborClusterer();
        List<List<String>> list = clusterer.cluster(sims, fileNames, 0.75, 100);
        System.out.println(list);
    }
}
package edu.usc.irds.autoext.tree;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

import java.io.FileReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * TreeNode is a wrapper for {@link Node} which keeps only element nodes and
 * pre-computes the bookkeeping (post-order index, subtree size, leftmost
 * descendant, key roots) needed for efficiently computing the tree edit
 * distance.
 */
public class TreeNode implements Serializable {

    private static final long serialVersionUID = 3679413437699206690L;

    // caller-supplied identifier for the whole tree (e.g. a file path); may be null
    protected String externalId;
    protected String nodeName;
    // the wrapped DOM node
    protected Node innerNode;
    protected TreeNode parent;
    // element children only; null when the wrapped node had no child nodes
    protected List<TreeNode> children;
    protected TreeNode leftMostDescendant;

    // post-order index; assigned when the root runs postOrderIndex()
    protected int index;
    // number of nodes in the subtree rooted here; valid only after indexing
    protected int size;

    /** Creates a label-only node with no backing DOM node (see {@link #createDummyNode(String)}). */
    private TreeNode(String nodeName) {
        this.nodeName = nodeName;
        this.leftMostDescendant = findLeftMostDescendant();
    }

    /**
     * Creates a tree node object, recursively wrapping all ELEMENT children.
     * @param innerNode the DOM API node
     * @param parent the parent node. For the root node, set to {@code null}
     */
    public TreeNode(Node innerNode, TreeNode parent) {
        this.innerNode = innerNode;
        this.parent = parent;
        if (innerNode.hasChildNodes()) {
            children = new ArrayList<>();
            NodeList childNodes = innerNode.getChildNodes();
            for (int i = 0; i < childNodes.getLength(); i++) {
                Node ithNode = childNodes.item(i);
                if (ithNode.getNodeType() != Node.ELEMENT_NODE) {
                    //skip all other (text, comment, ...) nodes
                    continue;
                }
                TreeNode child = new TreeNode(ithNode, this);
                children.add(child);
            }
        }
        this.leftMostDescendant = findLeftMostDescendant();
        this.nodeName = innerNode.getNodeName();
        if (parent == null) {
            //index only for the root node!
            this.postOrderIndex(new AtomicInteger(0));
        }
    }

    /**
     * Creates a root tree node object with an external identifier attached.
     * @param innerNode the DOM API node
     * @param externalId The external Id for this tree
     * @return the newly constructed root node
     */
    public static TreeNode create(Node innerNode, String externalId) {
        TreeNode tree = new TreeNode(innerNode, null);
        tree.setExternalId(externalId);
        return tree;
    }

    public String getNodeName() {
        return nodeName;
    }

    public int getIndex() {
        return index;
    }

    /**
     * @return true if this node has children; returns false otherwise
     */
    public boolean hasChildNodes() {
        return children != null && !children.isEmpty();
    }

    /**
     * gets the inner DOM API node to which this node is a wrapper
     * @return inner node
     */
    public Node getInnerNode() {
        return innerNode;
    }

    /**
     * gets parent node
     * @return gets parent node ; may return null
     * especially for the root node which doesnt have a parent.
     */
    public TreeNode getParent() {
        return parent;
    }

    /**
     * gets all children.
     * @return list of element children. May be {@code null} when the wrapped
     *         node had no child nodes at all, or an empty list when it had
     *         only non-element children -- use {@link #hasChildNodes()} to test
     *         for presence rather than a bare null check.
     */
    public List<TreeNode> getChildren() {
        return children;
    }

    /**
     * gets lowest leftmost descendant node.
     * This is same as {@link #findLeftMostDescendant()} except one:
     * this method returns cached state variable instead of finding on request
     * @return lowest leftmost descendant node
     */
    public TreeNode getLeftMostDescendant() {
        return leftMostDescendant;
    }

    /**
     * finds the lowest left most descendant (the node itself when it is a leaf)
     * @return lowest left most
     * @see #getLeftMostDescendant()
     */
    public TreeNode findLeftMostDescendant() {
        return hasChildNodes() ? children.get(0).getLeftMostDescendant() : this;
    }

    /**
     * pretty prints the tree
     */
    public void prettyPrint() {
        prettyPrint("", true);
    }

    // recursive helper: prints "[index:size] name desc:[leftmost index]" per node
    private void prettyPrint(String prefix, boolean isTail) {
        String name = String.format("[%d:%d] %s desc:[%s]", index, size,
                innerNode.getNodeName(), leftMostDescendant.index);
        System.out.println(prefix + (isTail ? "└── " : "├── ") + name);
        if (hasChildNodes()) {
            for (int i = 0; i < children.size() - 1; i++) {
                children.get(i).prettyPrint(prefix + (isTail ? "    " : "│   "), false);
            }
            children.get(children.size() - 1).prettyPrint(prefix + (isTail ? "    " : "│   "), true);
        }
    }

    /**
     * Traverses the tree in post order
     * @param traversedNodes List of nodes to which the new nodes are to be appended
     */
    private void postOrderTraverse(List<TreeNode> traversedNodes) {
        if (hasChildNodes()) {
            for (TreeNode child : children) {
                child.postOrderTraverse(traversedNodes);
            }
        }
        traversedNodes.add(this);
    }

    /**
     * Traverses the Tree in post order
     * @return list of nodes visited along the post order traversal
     */
    public List<TreeNode> postOrderTraverse() {
        List<TreeNode> elements = new ArrayList<>();
        postOrderTraverse(elements);
        return elements;
    }

    /**
     * Indexes the tree nodes in post order; also records each subtree's size
     * (this node's index minus the index counter value on entry, plus one).
     * @param startIndex the starting index
     */
    public void postOrderIndex(AtomicInteger startIndex) {
        int offset = startIndex.get();
        if (hasChildNodes()) {
            for (TreeNode child : children) {
                child.postOrderIndex(startIndex);
                startIndex.incrementAndGet();
            }
        }
        this.index = startIndex.get();
        this.size = this.index - offset + 1;
    }

    @Override
    public String toString() {
        return String.format("[%d]%s", index, nodeName);
    }

    /**
     * gets key roots of the tree rooted at this tree. Key root is one whose leftmost descendant
     * is different than its immediate parent
     * @param keyRootsBuffer buffer for updating the key roots
     */
    private void getKeyRoots(List<TreeNode> keyRootsBuffer) {
        if (hasChildNodes()) {
            for (TreeNode child : children) {
                child.getKeyRoots(keyRootsBuffer);
            }
        }
        if (this.parent == null || //root node wont have parent
                this.parent.leftMostDescendant.index != this.leftMostDescendant.index) {
            //left descendant is not same as parent's left descendant => its a key mode
            keyRootsBuffer.add(this);
        }
    }

    /**
     * gets key roots of the tree rooted at this tree. Key root is one whose leftmost descendant
     * is different than its immediate parent
     * @return list of all key root nodes
     */
    public List<TreeNode> getKeyRoots() {
        List<TreeNode> keyRoots = new ArrayList<>();
        getKeyRoots(keyRoots);
        return keyRoots;
    }

    /**
     * Get number of nodes (this node + all nodes under this node)
     * @return number of nodes in the tree
     */
    public int getSize() {
        return size;
    }

    /** Creates a childless, label-only node; useful for tests and edit-cost computations. */
    public static TreeNode createDummyNode(String name) {
        return new TreeNode(name);
    }

    public String getExternalId() {
        return externalId;
    }

    public void setExternalId(String externalId) {
        this.externalId = externalId;
    }

    /**
     * Copies the tree labels into bracket notation tree
     * @param builder the string builder to copy the labels
     */
    protected void toBracketNotation(StringBuilder builder) {
        builder.append("{").append(getNodeName());
        if (hasChildNodes()) {
            for (TreeNode child : getChildren()) {
                child.toBracketNotation(builder);
            }
        }
        builder.append("}");
    }

    /**
     * Converts the tree rooted at this node to bracket notation
     * @return String containing bracket notation of string labels
     */
    public String toBracketNotation() {
        StringBuilder builder = new StringBuilder();
        toBracketNotation(builder);
        return builder.toString();
    }

    /** Demo: parses a sample HTML file with NekoHTML and prints tree diagnostics. */
    public static void main(String[] args) throws Exception {
        DOMParser parser = new DOMParser();

        String pathname = "src/test/resources/html/simple/3.html";
        parser.parse(new InputSource(new FileReader(pathname)));
        Document document = parser.getDocument();

        TreeNode node = new TreeNode(document, null);
        node.postOrderIndex(new AtomicInteger(1));
        node.prettyPrint();

        List<TreeNode> nodes = node.postOrderTraverse();
        System.out.println(nodes);

        System.out.println(node.getKeyRoots());
    }
}
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

package edu.usc.irds.ted.apted;

import edu.usc.irds.ted.apted.util.LabelDictionary;
import edu.usc.irds.ted.apted.util.LblTree;

import java.io.Serializable;
import java.util.*;

/**
 * Stores and serves various information about the tree, subtrees and nodes.
 *
 * <p>On construction this class performs a single recursive traversal of the
 * input {@link LblTree} ({@link #gatherInfo}) followed by one linear pass
 * ({@link #postTraversalProcessing}), and materializes a family of
 * node-indexed lookup arrays (sizes, parents, children, label ids, and
 * mappings between left/right preorder and left/right postorder numberings).
 * Nodes are identified throughout by their left-to-right preorder index
 * unless a method name says otherwise.
 *
 * <p>NOTE(review): several arrays are exposed as public mutable fields;
 * callers elsewhere in the APTED implementation appear to index them
 * directly, so their layout is effectively part of this class's contract.
 *
 * @author Mateusz Pawlik
 *
 */
public class InfoTree_PLUS implements Serializable {

    private static final long serialVersionUID = 7601954909338612571L;

    /**
     * Builds all lookup tables for the given tree.
     *
     * @param aInputTree the tree to index; its node count fixes the length of
     *                   every array allocated here
     * @param aLd shared label dictionary; labels are interned into it by
     *            {@link #gatherInfo} via {@code ld.store(...)}
     */
    public InfoTree_PLUS(LblTree aInputTree, LabelDictionary aLd) {
        // Reset the per-traversal accumulators that gatherInfo() threads
        // through its recursion as instance fields.
        sizeTmp = 0;
        descSizesTmp = 0;
        krSizesSumTmp = 0;
        revkrSizesSumTmp = 0;
        preorderTmp = 0;
        currentNode = -1;
        switched = false;
        leafCount = 0;
        treeSize = 0;
        inputTree = aInputTree;
        treeSize = inputTree.getNodeCount();
        // All arrays are indexed 0..treeSize-1 by a node numbering; which
        // numbering each array uses is encoded in its name (preL = left
        // preorder, preR = right preorder, postL/postR analogously).
        sizes = new int[treeSize];
        parents = new int[treeSize];
        preL_to_preR = new int[treeSize];
        preR_to_preL = new int[treeSize];

        preL_to_postL = new int[treeSize];
        postL_to_preL = new int[treeSize];

        preL_to_postR = new int[treeSize];
        postR_to_preL = new int[treeSize];

        labels = new int[treeSize];
        preL_to_ln = new int[treeSize];
        preR_to_ln = new int[treeSize];
        preL_to_kr_sum = new int[treeSize];
        preL_to_rev_kr_sum = new int[treeSize];
        preL_to_desc_sum = new int[treeSize];
        // -1 marks "no parent"; only the root keeps it (see gatherInfo).
        Arrays.fill(parents, -1);
        children = new int[treeSize][];
        nodeType_L = new boolean[treeSize];
        nodeType_R = new boolean[treeSize];
        nodeType_H = new boolean[treeSize];
        ld = aLd;
        currentNode = 0;

        // depthTmp starts at -1 so the root ends up with depth 0 after the
        // first increment inside gatherInfo.
        depthTmp = -1;
        depths = new int[treeSize];

        gatherInfo(inputTree, -1);
        postTraversalProcessing();
    }

    /** @return number of nodes in the indexed tree. */
    public int getSize()
    {
        return treeSize;
    }

    /**
     * NOTE(review): {@code leafCount} is initialized to 0 in the constructor
     * and never incremented anywhere in this class, so this always returns
     * 0 — confirm whether any caller relies on it.
     */
    public int getLeafCount()
    {
        return leafCount;
    }

    /**
     * Tests whether the node carries the given type flag.
     *
     * @param postorder node index (parameter name notwithstanding, this
     *                  indexes the nodeType arrays, which gatherInfo fills
     *                  by left-preorder index — TODO confirm against callers)
     * @param type 0 = L-flag, 1 = R-flag, 2 = H-flag (matching the LEFT /
     *             RIGHT / HEAVY constants declared at the bottom)
     * @return the flag value, or false for an unknown type
     */
    public boolean ifNodeOfType(int postorder, int type)
    {
        switch(type)
        {
        case 0: // LEFT
            return nodeType_L[postorder];

        case 1: // RIGHT
            return nodeType_R[postorder];

        case 2: // HEAVY
            return nodeType_H[postorder];
        }
        return false;
    }

    /** @return child preorder indices of {@code node}, in left-to-right order. */
    public int[] getChildren(int node)
    {
        return children[node];
    }

    /** @return subtree size (node count, including the node itself). */
    public int getSizes(int node)
    {
        return sizes[node];
    }

    /** @return parent preorder index, or -1 for the root. */
    public int getParents(int node)
    {
        return parents[node];
    }

    /** @return right-to-left preorder index of a node given in left-to-right preorder. */
    public int getPreL_to_PreR(int node)
    {
        return preL_to_preR[node];
    }

    /** @return left-to-right preorder index of a node given in right-to-left preorder. */
    public int getPreR_to_PreL(int node)
    {
        return preR_to_preL[node];
    }

    /** @return integer label id assigned by the {@link LabelDictionary}. */
    public int getLabels(int node)
    {
        return labels[node];
    }

    /** @return index of the nearest leaf preceding {@code node} in left preorder, or -1. */
    public int getPreL_to_LN(int node)
    {
        return preL_to_ln[node];
    }

    /** @return index of the nearest leaf preceding {@code node} in right preorder, or -1. */
    public int getPreR_to_LN(int node)
    {
        return preR_to_ln[node];
    }

    /** @return accumulated key-root subtree-size sum for the node's subtree. */
    public int getPreL_to_KR_Sum(int node)
    {
        return preL_to_kr_sum[node];
    }

    /** @return accumulated reversed (right) key-root subtree-size sum for the node's subtree. */
    public int getPreL_to_Rev_KR_Sum(int node)
    {
        return preL_to_rev_kr_sum[node];
    }

    /** @return descendant-sum statistic computed in gatherInfo (used for cost estimation). */
    public int getPreL_to_Desc_Sum(int node)
    {
        return preL_to_desc_sum[node];
    }

    /** @return the "current node" cursor (left preorder index). */
    public int getCurrentNode()
    {
        return currentNode;
    }

    /** Sets the "current node" cursor to the given left preorder index. */
    public void setCurrentNode(int preorderL)
    {
        currentNode = preorderL;
    }

    /**
     * Recursive preorder walk that fills every per-node array.
     *
     * <p>For each node it records: parent/children links, subtree size,
     * label id, depth, both preorder<->postorder mappings, the key-root
     * size sums (left and reversed), the descendant-sum statistic, and the
     * node-type flags (L = first child, R = last child, H = heaviest child
     * by subtree size, ties going to the rightmost).
     *
     * <p>Results for the just-finished subtree are passed back to the parent
     * invocation through the instance fields {@code sizeTmp},
     * {@code descSizesTmp}, {@code krSizesSumTmp}, {@code revkrSizesSumTmp}
     * — which is why this method is NOT reentrant or thread-safe.
     *
     * @param aT subtree root being visited
     * @param postorder left postorder index of the previously finished node
     *                  (-1 on the initial call)
     * @return left postorder index assigned to {@code aT}
     */
    private int gatherInfo(LblTree aT, int postorder)
    {
        depthTmp++;
        int currentSize = 0;
        int childrenCount = 0;
        int descSizes = 0;
        int krSizesSum = 0;
        int revkrSizesSum = 0;
        int preorder = preorderTmp;
        int preorderR = 0;
        int heavyChild = -1;
        int weight = -1;
        int maxWeight = -1;
        int currentPreorder = -1;
        ArrayList childrenPreorders = new ArrayList();
        preorderTmp++;
        for(Enumeration e = aT.children(); e.hasMoreElements();)
        {
            childrenCount++;
            currentPreorder = preorderTmp;
            parents[currentPreorder] = preorder;
            // Recurse first: after this call the *Tmp fields hold the
            // child's subtree statistics.
            postorder = gatherInfo((LblTree)e.nextElement(), postorder);
            childrenPreorders.add(Integer.valueOf(currentPreorder));
            weight = sizeTmp + 1;
            // >= means a later child wins ties for the heavy-child flag.
            if(weight >= maxWeight)
            {
                maxWeight = weight;
                heavyChild = currentPreorder;
            }
            currentSize += 1 + sizeTmp;
            descSizes += descSizesTmp;
            if(childrenCount > 1)
            {
                // Non-first children contribute their full subtree size to
                // the key-root sum.
                krSizesSum += krSizesSumTmp + sizeTmp + 1;
            } else
            {
                krSizesSum += krSizesSumTmp;
                nodeType_L[currentPreorder] = true; // first (leftmost) child
            }
            if(e.hasMoreElements())
            {
                revkrSizesSum += revkrSizesSumTmp + sizeTmp + 1;
            } else
            {
                revkrSizesSum += revkrSizesSumTmp;
                nodeType_R[currentPreorder] = true; // last (rightmost) child
            }
        }

        postorder++;
        // Stash this node's preorder index on the tree node itself so other
        // code holding the LblTree can map back into these arrays.
        aT.setTmpData(Integer.valueOf(preorder));
        int currentDescSizes = descSizes + currentSize + 1;
        preL_to_desc_sum[preorder] = ((currentSize + 1) * (currentSize + 1 + 3)) / 2 - currentDescSizes;
        preL_to_kr_sum[preorder] = krSizesSum + currentSize + 1;
        preL_to_rev_kr_sum[preorder] = revkrSizesSum + currentSize + 1;
        labels[preorder] = ld.store(aT.getLabel());
        sizes[preorder] = currentSize + 1;
        // Right preorder is derived from the left postorder just assigned.
        preorderR = treeSize - 1 - postorder;
        preL_to_preR[preorder] = preorderR;
        preR_to_preL[preorderR] = preorder;
        if(heavyChild != -1)
            nodeType_H[heavyChild] = true;
        children[preorder] = toIntArray(childrenPreorders);
        // Publish this subtree's statistics for the parent invocation.
        descSizesTmp = currentDescSizes;
        sizeTmp = currentSize;
        krSizesSumTmp = krSizesSum;
        revkrSizesSumTmp = revkrSizesSum;


        postL_to_preL[postorder] = preorder;
        preL_to_postL[preorder] = postorder;

        // Right postorder of a node equals treeSize-1 minus its left preorder.
        preL_to_postR[preorder] = treeSize-1-preorder;
        postR_to_preL[treeSize-1-preorder] = preorder;
        // postR to postL : info[13][treeSize - 1 - preorder] = postorder;

        depths[preorder] = depthTmp;
        depthTmp--;
        return postorder;
    }

    /**
     * @param nodeInPreorderL node in left preorder
     * @return true iff the node's subtree contains only itself
     */
    public boolean isLeaf(int nodeInPreorderL)
    {
        return sizes[nodeInPreorderL] == 1;
    }

    /**
     * Linear pass after gatherInfo: fills {@code preL_to_ln} /
     * {@code preR_to_ln} (for each node, the index of the nearest leaf that
     * precedes it in the respective order, or -1 if none), and counts
     * {@code lchl} / {@code rchl} — the number of leaves that are leftmost
     * (resp. rightmost) children, detected by the child's preorder index
     * immediately following its parent's in the corresponding numbering.
     * Note {@code sizes[0]} is the root's subtree size, i.e. the whole tree.
     */
    private void postTraversalProcessing()
    {
        int currentLeaf = -1;
        for(int i = 0; i < sizes[0]; i++)
        {
            preL_to_ln[i] = currentLeaf;
            if(isLeaf(i)) {
                currentLeaf = i;
            }

            //lchl and rchl TODO: there are no values for parent node
            if (sizes[i] == 1) {
                int parent = parents[i];
                if (parent > -1) {
                    if (parent+1 == i) {
                        // leaf is its parent's first child in left preorder
                        lchl++;
                    } else
                    if (preL_to_preR[parent]+1 == preL_to_preR[i]) {
                        // leaf is its parent's first child in right preorder
                        rchl++;
                    }
                }
            }
        }

        currentLeaf = -1;
        // Same nearest-preceding-leaf computation, in right preorder.
        for(int i = 0; i < sizes[0]; i++)
        {
            preR_to_ln[i] = currentLeaf;
            if(isLeaf(preR_to_preL[i])) {
                currentLeaf = i;
            }
        }

    }

    /**
     * Unboxes a list of {@link Integer} into an {@code int[]}.
     * NOTE(review): raw {@code List} — elements are cast to Integer, so a
     * list holding anything else throws ClassCastException.
     */
    public static int[] toIntArray(List integers)
    {
        int ints[] = new int[integers.size()];
        int i = 0;
        for(Iterator iterator = integers.iterator(); iterator.hasNext();)
        {
            Integer n = (Integer)iterator.next();
            ints[i++] = n.intValue();
        }

        return ints;
    }

    /** Marks whether this tree plays the "switched" role in the distance computation. */
    public void setSwitched(boolean value)
    {
        switched = value;
    }

    /** @return the flag set by {@link #setSwitched(boolean)} (initially false). */
    public boolean isSwitched()
    {
        return switched;
    }

    private LblTree inputTree;
    // Type codes accepted by ifNodeOfType(int, int).
    private static final byte LEFT = 0;
    private static final byte RIGHT = 1;
    private static final byte HEAVY = 2;
    public int sizes[];              // subtree size per node (left preorder)
    public int parents[];            // parent preorder index, -1 for root
    public int preL_to_preR[];       // left preorder -> right preorder
    public int preR_to_preL[];       // right preorder -> left preorder
    public int labels[];             // label ids from the LabelDictionary
    public int preL_to_ln[];         // nearest preceding leaf, left preorder
    public int preR_to_ln[];         // nearest preceding leaf, right preorder
    public int preL_to_kr_sum[];     // key-root size sums
    public int preL_to_rev_kr_sum[]; // reversed key-root size sums
    public int preL_to_desc_sum[];   // descendant-sum statistic

    public int preL_to_postL[];      // left preorder -> left postorder
    public int postL_to_preL[];      // left postorder -> left preorder

    public int preL_to_postR[];      // left preorder -> right postorder
    public int postR_to_preL[];      // right postorder -> left preorder

    private LabelDictionary ld;
    public boolean nodeType_L[];     // node is a leftmost child
    public boolean nodeType_R[];     // node is a rightmost child
    public boolean nodeType_H[];     // node is its parent's heavy child
    public int children[][];         // child preorder indices per node
    // Recursion "return values" of gatherInfo — see that method's javadoc.
    private int sizeTmp;
    private int descSizesTmp;
    private int krSizesSumTmp;
    private int revkrSizesSumTmp;
    private int preorderTmp;
    private int currentNode;
    private boolean switched;
    private int leafCount;           // never updated — see getLeafCount()
    private int treeSize;

    private int depthTmp;
    public int depths[];             // depth per node, root = 0

    public int lchl;                 // count of leaves that are leftmost children
    public int rchl;                 // count of leaves that are rightmost children
}