12 |
13 |
--------------------------------------------------------------------------------
/OPENSOURCE-LICENCES.md:
--------------------------------------------------------------------------------
1 | ## CyberNeko HTML Parser
2 | http://nekohtml.sourceforge.net/
3 | Apache 2.0 license
4 |
5 | ## Gson
6 | https://github.com/google/gson
7 | Apache 2.0 license
8 |
9 | ## Args4j
10 | http://args4j.kohsuke.org/
11 | The MIT License (MIT)
12 |
13 | ## SLF4j
14 | http://www.slf4j.org/
15 | identical to MIT License
16 |
17 | ## JUnit
18 | http://junit.org/
19 | Eclipse Public License - v 1.0
20 |
--------------------------------------------------------------------------------
/autoext/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
9 | log4j.logger.org.apache.spark = WARN
10 | log4j.logger.org.spark-project.jetty.server.handler = WARN
11 | log4j.logger.edu.usc.irds = DEBUG
--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Auto-Extractor
2 | ==============
3 | Copyright 2016 Information Retrieval and Data Science (IRDS) Group,
4 |
5 | This product includes software developed at
6 | Information Retrieval and Data Science Group, University of Southern California (USC), Los Angeles, CA (http://irds.usc.edu)
7 | and
8 | NASA Jet Propulsion Laboratory, Pasadena, CA (http://www.jpl.nasa.gov/)
9 |
10 | This product uses:
11 | * APTED :
12 | The MIT License (MIT) , Copyright (c) 2016 Mateusz Pawlik and Nikolaus Augsten
13 |
14 | * Many other open source tools with Apache Licence 2.0
15 |
--------------------------------------------------------------------------------
/autoext/src/test/resources/html/simple/1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | This is my page 1
4 |
5 |
6 |
7 |
8 | Table 1
9 |
10 |
Name
11 |
Email
12 |
13 |
14 |
Thamme Gowda
15 |
tgowdan at Gmail.com
16 |
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/autoext/src/test/resources/html/simple/2.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | This is my page 2
4 |
5 |
6 |
7 |
8 | Table 2
9 |
10 |
Name
11 |
Email
12 |
13 |
14 |
Thamme Gowda
15 |
thammegowda.n at usc.edu
16 |
17 |
18 |
CS Dept
19 |
cs at usc.edu
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/lang/Function.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.lang;
18 |
/**
 * Backport of JDK8's {@code java.util.function.Function}: a function that
 * accepts one argument and produces a result.
 *
 * @param <T> type of the input to the function
 * @param <R> type of the result of the function
 */
public interface Function<T, R> {

    /**
     * Applies this function to the given argument.
     *
     * @param obj the function argument
     * @return the function result
     */
    R apply(T obj);
}
26 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/lang/BiFunction.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.lang;
18 |
/**
 * Backport of JDK8's {@code java.util.function.BiFunction}: a function that
 * accepts two arguments and produces a result.
 *
 * @param <T> type of the first argument
 * @param <U> type of the second argument
 * @param <R> type of the result
 */
public interface BiFunction<T, U, R> {

    /**
     * Applies this function to the given arguments.
     *
     * @param obj1 the first function argument
     * @param obj2 the second function argument
     * @return the function result
     */
    R apply(T obj1, U obj2);
}
27 |
--------------------------------------------------------------------------------
/apted/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | edu.usc.irds.ted
6 | apted
7 | 0.1.1
8 | jar
9 |
10 | apted
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | junit
20 | junit
21 | 4.12
22 | test
23 |
24 |
25 |
26 |
27 |
28 | maven-compiler-plugin
29 |
30 | 1.7
31 | 1.7
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/Utils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.conf.Configuration
20 | import org.apache.nutch.protocol.Content
21 |
/**
  * Helper routines shared by the Spark jobs in this package.
  */
object Utils {

  /**
    * Builds a new [[Content]] from the fields read off an existing one.
    *
    * The URL, base URL, content bytes, content type and metadata are taken
    * from `in` as returned by its getters; a fresh Hadoop [[Configuration]]
    * is supplied to the new instance.
    * NOTE(review): the metadata object is handed over as-is; whether the
    * `Content` constructor copies it is not visible here.
    *
    * @param in the content record to copy from
    * @return a newly constructed content record
    */
  def cloneContent(in: Content): Content = {
    val url = in.getUrl
    val baseUrl = in.getBaseUrl
    val payload = in.getContent
    val contentType = in.getContentType
    val metadata = in.getMetadata
    new Content(url, baseUrl, payload, contentType, metadata, new Configuration())
  }

}
33 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/ContentFilter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import java.lang.Boolean
20 |
21 | import edu.usc.irds.lang.Function
22 |
/**
  * Predicate that accepts a string when it contains a fixed substring.
  *
  * @param subString the text that must occur in an input for it to pass
  */
@SerialVersionUID(100L)
class ContentFilter(subString: String)
  extends Function[String, Boolean]
    with scala.Serializable {

  /**
    * Reports whether the given text contains the configured substring.
    *
    * @param t the text to inspect
    * @return true when `subString` occurs somewhere in `t`
    */
  override def apply(t: String): Boolean = {
    val position = t.indexOf(subString)
    position >= 0
  }
}
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Auto Extractor
2 | An intelligent extractor library which learns the structures of the input web pages and then figures out a strategy for scraping the structured content.
3 |
4 | ## Links
5 | + [Build Instructions](https://github.com/USCDataScience/autoextractor/wiki/Build-Instructions)
6 | + [Clustering Web Pages using Apache Spark](https://github.com/USCDataScience/autoextractor/wiki/Clustering-Tutorial)
7 |
8 |
9 | # Developers:
10 | * [Thamme Gowda, USC](mailto:tgowdan@gmail.com)
11 | * [Chris Mattmann, USC & NASA JPL]()
12 |
13 | # Citation:
14 |
15 | If you use this work, please cite:
16 | https://ieeexplore.ieee.org/abstract/document/7785739
17 |
18 | ```
19 | @inproceedings{gowda2016clustering,
20 | title={Clustering Web Pages Based on Structure and Style Similarity (Application Paper)},
21 | author={Gowda, Thamme and Mattmann, Chris A},
22 | booktitle={Information Reuse and Integration (IRI), 2016 IEEE 17th International Conference on},
23 | pages={175--180},
24 | year={2016},
25 | organization={IEEE}
26 | }
27 | ```
28 |
29 |
30 | # References :
31 | + K. Zhang and D. Shasha. 1989. "Simple fast algorithms for the editing distance between trees and related problems". SIAM J. Comput. 18, 6 (December 1989), 1245-1262.
32 | + Jarvis, R.A.; Patrick, Edward A., "Clustering Using a Similarity Measure Based on Shared Near Neighbors," in Computers, IEEE Transactions on , vol.C-22, no.11, pp.1025-1034, Nov. 1973
33 |
34 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/Tuple2.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
/**
 * An immutable pair of values.
 *
 * @param <F> type of the first value
 * @param <S> type of the second value
 */
public class Tuple2<F, S> {
    /** The first value of the pair. */
    public final F pos0;
    /** The second value of the pair. */
    public final S pos1;

    /**
     * Creates a pair holding the two given values.
     *
     * @param pos0 the first value
     * @param pos1 the second value
     */
    public Tuple2(F pos0, S pos1) {
        this.pos0 = pos0;
        this.pos1 = pos1;
    }

    /**
     * @return the first value of the pair
     */
    public F getPos0() {
        return pos0;
    }

    /**
     * @return the second value of the pair
     */
    public S getPos1() {
        return pos1;
    }

    /**
     * @return the pair rendered as {@code (first, second)}
     */
    @Override
    public String toString() {
        return "(" + pos0 + ", " + pos1 + ")";
    }
}
44 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/CliTool.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.kohsuke.args4j.{CmdLineException, CmdLineParser}
20 |
21 | import scala.collection.JavaConversions._
22 |
/**
  * Mix-in for tools that take their settings from the command line.
  */
trait CliTool {

  /**
    * Parses the command line arguments into this instance using args4j.
    *
    * When parsing fails, the error message and the usage text are written
    * to standard error and the JVM exits with status 1.
    *
    * @param args raw command line arguments
    */
  def parseArgs(args: Array[String]): Unit = {
    val cliParser = new CmdLineParser(this)
    try {
      cliParser.parseArgument(args.toList)
    } catch {
      case failure: CmdLineException =>
        System.err.println(failure.getMessage)
        cliParser.printUsage(System.err)
        System.exit(1)
    }
  }
}
40 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/KeyDumper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.io.Writable
20 |
/**
  * Job that writes the keys of the input sequence files out as text.
  */
class KeyDumper extends IOSparkJob {

  /**
    * Reads every input path as a sequence file, keeps only the string form
    * of each record's key, and saves the result as a text file at `outPath`.
    */
  def run(): Unit = {
    val inputs = getInputPaths().map(path =>
      sc.sequenceFile(path, classOf[Writable], classOf[Writable]))
    val keys = sc.union(inputs).map(record => record._1.toString)
    keys.saveAsTextFile(outPath)
    LOG.info(s"Stored the output at $outPath")
  }
}
34 |
/**
  * Command line entry point for [[KeyDumper]].
  */
object KeyDumper {

  /**
    * Creates a [[KeyDumper]] and runs it with the given arguments.
    *
    * @param args raw command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new KeyDumper()
    job.run(args)
  }
}
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/base/EditDistanceComputer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.base;
18 |
19 | /**
20 | * Defines a contract for edit distance computer
21 | *
22 | * @author Thamme Gowda
23 | */
24 | public interface EditDistanceComputer {
25 |
26 | /**
27 | * Computes edit distance between two similar objects
28 | * @param object1 the first object
29 | * @param object2 the second object
30 | * @return the edit distance measure
31 | */
32 | double computeDistance(T object1, T object2);
33 |
34 |
35 | /**
36 | * Gets cost metric used for computing the edit distance
37 | * @return edit cost metric
38 | */
39 | EditCost getCostMetric();
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/visuals/webapp/css/style.css:
--------------------------------------------------------------------------------
/* Page canvas: fixed 960x640 stage with generous outer margins. */
body {
  /* Generic fallbacks keep the page sans-serif where "Helvetica Neue" is missing. */
  font: 300 36px "Helvetica Neue", Helvetica, Arial, sans-serif;
  height: 640px;
  margin: 80px 160px;
  overflow: hidden;
  position: relative;
  width: 960px;
}

/* Links */
a:link, a:visited {
  color: #777;
  text-decoration: none;
}

a:hover {
  color: #666;
}

/* Block quotes are wrapped in typographic quotation marks. */
blockquote {
  margin: 0;
}

blockquote:before {
  content: "“";
  position: absolute;
  left: -.4em;
}

blockquote:after {
  content: "”";
  position: absolute;
}

body > ul {
  margin: 0;
  padding: 0;
}

/* Headings: h1 is the title; h2 and h3 are pinned to the bottom-right corner. */
h1 {
  font-size: 64px;
}

h1, h2, h3 {
  font-weight: inherit;
  margin: 0;
}

h2, h3 {
  text-align: right;
  font-size: inherit;
  position: absolute;
  bottom: 0;
  right: 0;
}

/* position: absolute is already applied by the shared h2, h3 rule. */
h2 {
  font-size: 24px;
}

h3 {
  bottom: -20px;
  font-size: 18px;
}

/* Dark colour scheme */
.invert {
  background: #1f1f1f;
  color: #dcdccc;
}

.invert h2, .invert h3 {
  color: #7f9f7f;
}

/* Syntax highlighting classes */
.string, .regexp {
  color: #f39;
}

.keyword {
  color: #00c;
}

.comment {
  color: #777;
  font-style: oblique;
}

.number {
  color: #369;
}

.class, .special {
  color: #1181B8;
}

/* The offsets cancel the body margins so the SVG starts at the page's top-left corner. */
body > svg {
  position: absolute;
  top: -80px;
  left: -160px;
}
101 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/DeDuplicator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.io.Text
20 | import org.apache.hadoop.mapred.SequenceFileOutputFormat
21 | import org.apache.nutch.protocol.Content
22 |
/**
  * Job that keeps a single record per key from the input sequence files.
  */
class DeDuplicator extends IOSparkJob {

  /**
    * Unions all input sequence files, groups the records by key, keeps the
    * first value of every group and writes the result to `outPath` as a
    * sequence file.
    */
  def run(): Unit = {
    // club all parts
    val parts = getInputPaths().map(path =>
      sc.sequenceFile(path, classOf[Text], classOf[Content]))
    val records = sc.union(parts)

    // Key and value are copied before grouping.
    // NOTE(review): presumably because Hadoop readers reuse Writable instances - confirm.
    val copied = records.map { case (key, value) =>
      (new Text(key), Utils.cloneContent(value))
    }
    val unique = copied.groupByKey().map { case (key, values) =>
      (key, values.iterator.next())
    }

    // save it
    unique.saveAsHadoopFile(outPath, classOf[Text],
      classOf[Content], classOf[SequenceFileOutputFormat[Text, Content]])

    LOG.info(s"Done. Saved output at $outPath")
  }
}
42 |
/**
  * Command line entry point for [[DeDuplicator]].
  */
object DeDuplicator {

  /**
    * Creates a [[DeDuplicator]] and runs it with the given arguments.
    *
    * @param args raw command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new DeDuplicator()
    job.run(args)
  }
}
49 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/Checks.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
/**
 * Precondition helpers that fail fast with an unchecked exception.
 */
public class Checks {

    /**
     * Unchecked exception signalling that a checked condition did not hold.
     */
    public static class CheckFailedException extends RuntimeException {
        /**
         * Creates the exception.
         *
         * @param message explanation of why the check failed
         */
        public CheckFailedException(String message) {
            super(message);
        }
    }

    /**
     * Verifies a boolean condition and raises {@link CheckFailedException}
     * when it does not hold.
     *
     * @param condition the predicate that is expected to be true
     * @param message   error message to help debugging when the condition fails
     */
    public static void check(boolean condition, String message) {
        if (condition) {
            return;
        }
        throw new CheckFailedException(message);
    }
}
48 |
--------------------------------------------------------------------------------
/apted/README.md:
--------------------------------------------------------------------------------
1 | # README : APTED
2 | This project has an implementation of Tree Edit Distance (TED).
3 |
4 | APTED is originally developed by Mateusz Pawlik and Nikolaus Augsten.
5 | The original implementation can be found at http://tree-edit-distance.dbresearch.uni-salzburg.at/#download
6 |
7 | The source code from the original implementation has been reorganized to make it a reusable library package for
8 | Maven-based builds.
9 |
10 | ### Requirements
11 | + Newer version of Maven (Tested on 3.0.5)
12 | + Newer version of JDK (Tested on 1.7.0_95)
13 |
14 | ## Build instructions
15 | + `mvn clean test package` to test and package. The jar will be at `target/apted-*.jar`
16 | + `mvn install` to use it as a maven library for other projects. Then add the following as dependency to your project
17 |
18 | ```xml
19 |
20 | edu.usc.irds.ted
21 | apted
22 | 0.1.1
23 |
24 | ```
25 |
26 | ## LICENCE
27 | The original project is distributed under MIT licence,
28 | so this project is available under MIT licence. Find the licence header in the files.
29 |
30 |
31 | ---
32 |
33 | ### (original) README
34 | This is an implementation of the APTED algorithm from [2]. It builds up on the
35 | works in [1] and [3].
36 |
37 | The source code is published under the MIT licence found in the header of each
38 | source file.
39 |
40 | To build, do the following steps from within the root directory:
41 | mkdir build
42 | cd build
43 | cmake ..
44 | make
45 |
46 | [1] M. Pawlik and N. Augsten. Efficient Computation of the Tree Edit
47 | Distance. ACM Transactions on Database Systems (TODS) 40(1). 2015.
48 | [2] M. Pawlik and N. Augsten. Tree edit distance: Robust and memory-
49 | efficient. Information Systems 56. 2016.
50 | [3] M. Pawlik and N. Augsten. RTED: A Robust Algorithm for the Tree Edit
51 | Distance. PVLDB 5(4). 2011.
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | edu.usc.irds.autoext
6 | autoext-parent
7 | 0.2-SNAPSHOT
8 |
9 | autoext
10 | apted
11 | autoext-spark
12 |
13 | pom
14 |
15 | autoext-parent
16 | http://irds.usc.edu
17 |
18 |
19 | UTF-8
20 | 1.7
21 | 1.7
22 | 1.7.12
23 | 4.12
24 |
25 |
26 |
27 |
28 | org.slf4j
29 | slf4j-log4j12
30 | ${slf4j.version}
31 |
32 |
33 |
34 | junit
35 | junit
36 | ${junit.version}
37 | test
38 |
39 |
40 |
41 |
42 |
43 | maven-compiler-plugin
44 | org.apache.maven.plugins
45 | 3.3
46 |
47 | ${source.version}
48 | ${target.version}
49 |
50 |
51 |
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/ReflectionUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import edu.usc.irds.autoext.base.EditDistanceComputer;
20 | import edu.usc.irds.autoext.tree.TreeNode;
21 |
22 | /**
23 | * Created by tg on 2/29/16.
24 | */
25 | public class ReflectionUtils {
26 |
27 | /**
28 | * this method instantiates a class
29 | * @param clsName name of class
30 | * @return instance
31 | *
32 | */
33 | public static T instantiate(String clsName) {
34 | try {
35 | Class> aClass = Class.forName(clsName, true, ReflectionUtils.class.getClassLoader());
36 | Object instance = aClass.newInstance();
37 | return (T) instance;
38 | } catch (Exception e) {
39 | throw new RuntimeException(e);
40 | }
41 | }
42 |
43 |
44 | /**
45 | * this method instantiates an instance of edit distance computer
46 | * @param clsName name of class
47 | * @return
48 | */
49 | public static EditDistanceComputer intantiateEDComputer(String clsName){
50 | return instantiate(clsName);
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/autoext/src/test/java/edu/usc/irds/autoext/tree/ZSTEDComputerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import org.junit.Test;
20 |
21 | import java.io.File;
22 |
23 | import static org.junit.Assert.assertEquals;
24 |
25 | /**
26 | * Created by tg on 12/29/15.
27 | */
28 | public class ZSTEDComputerTest {
29 |
30 | @Test
31 | public void testMain() throws Exception {
32 | ClassLoader resLoader = getClass().getClassLoader();
33 | String file1 = resLoader.getResource("html/simple/1.html").getPath();
34 | String file2 = resLoader.getResource("html/simple/2.html").getPath();
35 | String file3 = resLoader.getResource("html/simple/3.html").getPath();
36 | double distance;
37 | //same file
38 | distance = ZSTEDComputer.computeDistance(new File(file1), new File(file1));
39 | assertEquals(0.0, distance, 0.00);
40 |
41 | //almost same
42 | distance = ZSTEDComputer.computeDistance(new File(file1), new File(file2));
43 | assertEquals(3.0, distance, 0.00);
44 | //if(true) return;
45 | //dissimilar
46 | distance = ZSTEDComputer.computeDistance(new File(file1), new File(file3));
47 | assertEquals(10.0, distance, 0.00);
48 |
49 | }
50 | }
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/tree/DefaultEditCost.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import edu.usc.irds.autoext.base.EditCost;
20 |
21 | import java.io.Serializable;
22 |
23 | /**
24 | * Default unit Costs for edit operations
25 | */
26 | public class DefaultEditCost implements EditCost, Serializable{
27 |
28 | private static final long serialVersionUID = -4846293473238639407L;
29 | private int insertCost = 1;
30 | private int removeCost = 1;
31 | private int replaceCost = 1;
32 | private int noEditCost = 0;
33 | private int maxEditCost = replaceCost;
34 |
35 | @Override
36 | public double getInsertCost(TreeNode node) {
37 | return insertCost;
38 | }
39 |
40 | @Override
41 | public double getRemoveCost(TreeNode node) {
42 | return removeCost;
43 | }
44 |
45 | @Override
46 | public double getReplaceCost(TreeNode node1, TreeNode node2) {
47 | return replaceCost;
48 | }
49 |
50 | @Override
51 | public double getNoEditCost() {
52 | return noEditCost;
53 | }
54 |
55 | @Override
56 | public double getMaxUnitCost() {
57 | return maxEditCost;
58 | }
59 |
60 | @Override
61 | public boolean isSymmetric() {
62 | return true;
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/ParseUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import org.cyberneko.html.parsers.DOMParser;
20 | import org.w3c.dom.Document;
21 | import org.xml.sax.InputSource;
22 | import org.xml.sax.SAXException;
23 |
24 | import java.io.FileInputStream;
25 | import java.io.IOException;
26 | import java.io.InputStream;
27 | import java.net.URL;
28 |
29 | /**
30 | * Created by tg on 1/5/16.
31 | */
32 | public class ParseUtils {
33 |
34 | private static final DOMParser domParser = new DOMParser();
35 |
36 | public static Document parseFile(String path) throws IOException, SAXException {
37 | synchronized (domParser) {
38 | domParser.parse(new InputSource(new FileInputStream(path)));
39 | Document document = domParser.getDocument();
40 | domParser.reset();
41 | return document;
42 | }
43 | }
44 |
45 | public static Document parseURL(URL url) throws IOException, SAXException {
46 | try (InputStream stream = url.openStream()) {
47 | synchronized (domParser) {
48 | domParser.parse(new InputSource(stream));
49 | Document document = domParser.getDocument();
50 | domParser.reset();
51 | return document;
52 | }
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/Config.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext;
18 |
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 | import java.util.Properties;
22 |
/**
 * The configuration framework.
 *
 * <p>Defaults are read once, at class initialization, from the classpath
 * resource named by {@link #CONFIG_FILE}. Class initialization fails when that
 * resource is missing or unreadable.
 */
public class Config {

    public static final String CONFIG_FILE = "autoext.properties";
    public static final Properties DEF_PROPS = new Properties();
    public static final Config INSTANCE;

    static {
        try (InputStream stream = Config.class.getClassLoader().getResourceAsStream(CONFIG_FILE)) {
            if (stream == null) {
                // getResourceAsStream returns null (it does not throw) when the resource is absent
                throw new IllegalStateException(
                        "Configuration resource '" + CONFIG_FILE + "' was not found on the classpath");
            }
            DEF_PROPS.load(stream);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        INSTANCE = new Config(DEF_PROPS);
    }

    /**
     * @return the shared instance built from the default properties
     */
    public static Config getInstance() {
        return INSTANCE;
    }

    private final String tedImpl;
    private final double simWeight;

    /**
     * Creates a configuration from the default properties.
     */
    public Config() {
        this(DEF_PROPS);
    }

    /**
     * Creates a configuration from the given properties.
     *
     * @param props must define {@code ted.impl} and {@code sim.weight}
     * @throws NullPointerException when a required property is missing
     * @throws NumberFormatException when {@code sim.weight} is not a number
     */
    public Config(Properties props) {
        this.tedImpl = required(props, "ted.impl");
        this.simWeight = Double.parseDouble(required(props, "sim.weight"));
    }

    /**
     * Looks up a mandatory property and trims it, naming the key when it is absent.
     */
    private static String required(Properties props, String key) {
        String value = java.util.Objects.requireNonNull(props.getProperty(key),
                "Missing required configuration property '" + key + "'");
        return value.trim();
    }

    /**
     * @return the trimmed value of the {@code ted.impl} property
     */
    public String getTedImpl() {
        return tedImpl;
    }

    /**
     * @return the value of the {@code sim.weight} property
     */
    public double getSimWeight() {
        return simWeight;
    }

}
66 |
67 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/base/EditCost.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.base;
18 |
19 | import edu.usc.irds.autoext.tree.DefaultEditCost;
20 |
/**
 * Defines the contract for edit costs used by the edit cost computer.
 *
 * @param <T> the type of node the costs are defined for
 * @see DefaultEditCost
 */
public interface EditCost<T> {

    /**
     * Cost of an insertion operation.
     * @param node node to be inserted
     * @return the cost of insertion
     */
    double getInsertCost(T node);

    /**
     * Cost of a remove operation.
     * @param node node to be removed
     * @return the cost of removal
     */
    double getRemoveCost(T node);

    /**
     * Cost of a replacement.
     * @param node1 node to be removed
     * @param node2 node to be inserted
     * @return the cost of the replacement
     */
    double getReplaceCost(T node1, T node2);

    /**
     * Cost of leaving a node unedited.
     * @return the cost of no operation
     */
    double getNoEditCost();

    /**
     * Maximum cost of any single edit operation.
     * @return the upper bound on the unit edit cost
     */
    double getMaxUnitCost();

    /**
     * Tells whether the edit costs are symmetric.
     * @return {@code true} when the costs are symmetric, {@code false} otherwise
     */
    boolean isSymmetric();
}
69 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/base/SimilarityComputer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.base;
18 |
19 | import edu.usc.irds.autoext.tree.StructureSimComputer;
20 | import edu.usc.irds.autoext.tree.StyleSimComputer;
21 | import edu.usc.irds.lang.BiFunction;
22 |
23 |
/**
 * Generic similarity computer contract. Look into the implementations for specific details.
 *
 * @param <T> the type of the objects being compared
 * @see StructureSimComputer
 * @see StyleSimComputer
 * @author Thamme Gowda
 */
public abstract class SimilarityComputer<T> implements BiFunction<T, T, Double> {

    /**
     * Computes the similarity between two objects on the inclusive scale [0.0, 1.0].
     * A score of 1.0 means {@code obj1} and {@code obj2} are extremely similar;
     * a score of 0.0 means they are extremely dissimilar.
     * @param obj1 the first object
     * @param obj2 the second object
     * @return the similarity score in [0.0, 1.0]
     */
    public abstract double compute(T obj1, T obj2);

    /**
     * Adapts this contract to the functional interface by delegating to
     * {@link #compute(Object, Object)} and boxing its result.
     * @param obj1 the first object
     * @param obj2 the second object
     * @return the similarity between first and second
     * @see #compute(Object, Object)
     */
    @Override
    public Double apply(T obj1, T obj2) {
        return this.compute(obj1, obj2);
    }
}
55 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/ContentMerge.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.io.Text
20 | import org.apache.hadoop.mapred.SequenceFileOutputFormat
21 | import org.apache.nutch.protocol.Content
22 | import org.apache.spark.rdd.RDD
23 | import org.kohsuke.args4j.Option
24 | import org.slf4j.LoggerFactory
25 |
/**
  * Reads every input sequence file, unions the records and writes them back
  * as one sequence file whose number of parts can be set from the command line.
  */
class ContentMerge extends IOSparkJob {

  @Option(name = "-numparts", usage = "Number of parts in the output. Ex: 1, 2, 3.... Optional => default")
  var partitions:Integer = null

  def run(): Unit = {

    val inputs = getInputPaths()
    LOG.info(s"Found ${inputs.length} input paths")
    // one RDD per input path, combined by a single union
    val pieces = inputs.map(p => sc.sequenceFile(p, classOf[Text], classOf[Content]))
    val merged = sc.union(pieces)
    // shrink the partition count only when -numparts was supplied
    val result = if (partitions == null) merged else merged.coalesce(partitions)
    result.saveAsHadoopFile(outPath, classOf[Text], classOf[Content],
      classOf[SequenceFileOutputFormat[_,_]])
    LOG.info(s"Done. Saved output at $outPath")
  }
}
51 |
/**
  * Command line entry point for [[ContentMerge]].
  */
object ContentMerge {
  // logger is named after this tool, not after another job
  val LOG = LoggerFactory.getLogger(ContentMerge.getClass)

  /**
    * Parses the arguments and runs the merge job.
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    new ContentMerge().run(args)
  }
}
58 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/D3Export.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import java.util
20 |
21 | import edu.usc.irds.autoext.utils.D3JsFormat
22 | import org.kohsuke.args4j.Option
23 |
24 | import scala.collection.JavaConverters._
25 |
/**
  * CLI tool that exports clusters into the format most commonly used by d3js charts.
  */
class D3Export extends IOSparkJob {
  @Option(name="-ids", usage = "Path to directory/file having index to id mapping. Optional.")
  var idsFile:String = null

  override def run(): Unit = {
    val lines = sc.union(getInputPaths().map(sc.textFile(_)))

    // Each line is a comma separated list of integers: field 0 becomes the key,
    // field 1 is the number of values that follow from field 2 onwards.
    // NOTE(review): presumably "clusterId, size, member..." - confirm against the producer.
    val keyed = lines.map { line =>
      val numbers = line.split(",").map(_.trim.toInt)
      val key = numbers(0)
      val howMany = numbers(1)
      (key, numbers.slice(2, 2 + howMany).toSeq.asJava)
    }
    val clusters = keyed.collectAsMap().asJava
      .asInstanceOf[util.Map[Integer, util.List[Integer]]]

    // the index -> id mapping stays null when no -ids path was given
    val idsMap: util.Map[Integer, String] =
      if (idsFile == null) {
        null
      } else {
        sc.textFile(idsFile)
          .map { line =>
            val fields = line.split(",")
            (fields(0).trim.toInt, fields(1).trim)
          }
          .collectAsMap().asJava.asInstanceOf[util.Map[Integer, String]]
      }
    LOG.info("Num Clusters : {} ", clusters.size())
    D3JsFormat.storeClusters(outPath, "Clusters 1", clusters, idsMap, 10.0f)
    LOG.info("All done")
  }
}
54 |
/**
  * Command line entry point for [[D3Export]].
  */
object D3Export {

  /**
    * Creates the export job and runs it with the given arguments.
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new D3Export()
    job.run(args)
  }
}
61 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/ContentPartitioner.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import java.net.URL
20 |
21 | import org.apache.hadoop.io.Text
22 | import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
23 | import org.apache.nutch.protocol.Content
24 | import org.apache.spark.rdd.RDD
25 |
/**
  * Tool that partitions the data by host name and content type.
  */
class ContentPartitioner extends IOSparkJob {

  def run(): Unit = {
    val inputs = getInputPaths()
    val first: RDD[(Text, Content)] = sc.sequenceFile(inputs(0), classOf[Text], classOf[Content])
    // fold the remaining inputs onto the first one, one union per path
    val all = inputs.drop(1).foldLeft(first) { (acc, path) =>
      sc.union(acc, sc.sequenceFile(path, classOf[Text], classOf[Content]))
    }
    // new key is "<host>/<content type reduced to lower case letters>"
    val rekeyed = all.map { case (url, content) =>
      val host = new URL(url.toString).getHost
      val kind = content.getContentType.replaceAll("[^a-zA-Z]", "").toLowerCase
      (new Text(host + "/" + kind), content)
    }
    rekeyed.saveAsHadoopFile(outPath, classOf[Text], classOf[Content],
      classOf[SplitOutputFormat])
  }
}
44 |
/**
  * Output format that writes each record under a directory named after its key.
  */
class SplitOutputFormat extends MultipleSequenceFileOutputFormat[Text, Content] {

  /** The key that is written out is the url stored in the content record. */
  override def generateActualKey(key: Text, value: Content): Text = {
    val url = value.getUrl
    new Text(url)
  }

  /** The file name is the incoming key, a slash, then the default leaf name. */
  override def generateFileNameForKeyValue(key: Text, value: Content, name: String): String = {
    val directory = key.toString
    directory + "/" + name
  }
}
54 |
/**
  * Command line entry point for [[ContentPartitioner]].
  */
object ContentPartitioner {

  /**
    * Creates the partitioner job and runs it with the given arguments.
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new ContentPartitioner()
    job.run(args)
  }
}
60 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/MatrixUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import edu.usc.irds.lang.BiFunction;
20 |
21 | import java.util.List;
22 |
23 | /**
24 | * Utilities related to matrix operations
25 | */
26 | public class MatrixUtils {
27 |
28 | /**
29 | * Computes the symmetrical matrix.
30 | * @param function the function that can be applied to a pair of objects and returns a double
31 | * @param objs list of objects
32 | * @param the object type
33 | * @return 2D matrix computed by applying function on pairs of objects.
34 | */
35 | public static double[][] computeSymmetricMatrix(BiFunction function, List objs){
36 | int n = objs.size();
37 | double[][] table = new double[n][n];
38 | for (int i = 0; i < n; i++) {
39 | T objI = objs.get(i);
40 | table[i][i] = function.apply(objI, objI); // the principal diagonal element
41 | for (int j = i + 1; j < objs.size(); j++) {
42 | table[i][j] = function.apply(objI, objs.get(j)); // the upper diagonal
43 | table[j][i] = table[i][j]; // the lower diagonal
44 | }
45 | }
46 | return table;
47 | }
48 |
49 |
50 | /**
51 | * Prints the matrix to STDOUT
52 | * @param matrix the matrix
53 | */
54 | public static void printMatrix(double[][] matrix) {
55 | for (int i = 0; i < matrix.length; i++) {
56 | for (int j = 0; j < matrix[i].length; j++) {
57 | System.out.printf("%5.2f\t", matrix[i][j]);
58 | }
59 | System.out.println();
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/autoext/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | autoext-parent
5 | edu.usc.irds.autoext
6 | 0.2-SNAPSHOT
7 |
8 | 4.0.0
9 |
10 | autoext
11 | jar
12 |
13 | autoext
14 | http://maven.apache.org
15 |
16 |
17 | UTF-8
18 | 2.32
19 | 1.9.22
20 | 1.7.12
21 | 2.5
22 |
23 |
24 |
25 |
26 | net.sourceforge.nekohtml
27 | nekohtml
28 | ${nekohtml.version}
29 |
30 |
31 | com.google.code.gson
32 | gson
33 | ${gson.version}
34 |
35 |
36 | args4j
37 | args4j
38 | ${args4j.version}
39 |
40 |
41 | edu.usc.irds.ted
42 | apted
43 | 0.1.1
44 |
45 |
46 |
47 |
48 |
49 | maven-assembly-plugin
50 |
51 |
52 | package
53 |
54 | single
55 |
56 |
57 |
58 |
59 |
60 | jar-with-dependencies
61 |
62 |
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/ContentGrep.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.io.Text
20 | import org.apache.hadoop.mapred.SequenceFileOutputFormat
21 | import org.apache.nutch.protocol.Content
22 | import org.apache.spark.rdd.RDD
23 | import org.kohsuke.args4j.Option
24 |
/**
  * Keeps only the records whose url and content type contain the given
  * sub strings and saves them as a sequence file.
  */
class ContentGrep extends IOSparkJob {

  @Option(name = "-urlfilter", usage = "Url filter substring", required = true)
  var urlFilter: String = null

  @Option(name = "-contentfilter", usage = "Content type filter substring")
  var contentFilter:String = null

  def run(): Unit = {
    println("Initializing spark context")
    val inputs = getInputPaths()
    println(s"Found ${inputs.length} paths")

    // one RDD per input path, combined by a single union
    val pieces = inputs.map(p => sc.sequenceFile(p, classOf[Text], classOf[Content]))
    val everything = sc.union(pieces)
    // Copy the fields into locals before building the closure;
    // presumably so the closure captures two strings rather than this job instance.
    val typeNeedle = this.contentFilter
    val urlNeedle = this.urlFilter
    // a null filter matches every record
    val matched = everything.filter { case (_, content) =>
      val urlMatches = urlNeedle == null || content.getUrl.contains(urlNeedle)
      urlMatches && (typeNeedle == null || content.getContentType.contains(typeNeedle))
    }
    LOG.info("Saving output at {}", outPath)
    matched.saveAsHadoopFile(outPath, classOf[Text], classOf[Content], classOf[SequenceFileOutputFormat[_,_]])
    LOG.info("Done. Stopping spark context")
    sc.stop()
  }
}
56 |
/**
  * Command line entry point for [[ContentGrep]].
  */
object ContentGrep {

  /**
    * Creates the grep job and runs it with the given arguments.
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new ContentGrep()
    job.run(args)
  }
}
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/SparkJob.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.hadoop.io.Text
20 | import org.apache.nutch.protocol.Content
21 | import org.apache.spark.{SparkConf, SparkContext}
22 | import org.kohsuke.args4j.Option
23 | import org.slf4j.LoggerFactory
24 |
/**
  * Base for all spark jobs: holds the spark related command line options and
  * the spark context, and drives the parse -> init -> run -> stop sequence.
  */
trait SparkJob extends CliTool {

  val LOG = LoggerFactory.getLogger(getClass)

  @Option(name = "-master", aliases = Array("--master"),
    usage = "Spark master. This is not required when job is started with spark-submit")
  var sparkMaster: String = null

  @Option(name = "-app", aliases= Array("--app-name"),
    usage = "Name for spark context.")
  var appName: String = getClass.getSimpleName

  var sc: SparkContext = null

  /**
    * Initializes the spark context if it is not already initialized.
    * The master is set only when one was supplied on the command line.
    */
  def initSpark(): Unit = {
    if (sc == null) {
      LOG.info("Initializing Spark Context ")
      val conf = new SparkConf().setAppName(appName)
        .registerKryoClasses(Array(classOf[Text], classOf[Content]))
      if (sparkMaster != null) {
        LOG.info("Spark Master {}", sparkMaster)
        conf.setMaster(sparkMaster)
      }
      sc = new SparkContext(conf)
    }
  }

  /**
    * Stops the spark context when one exists.
    */
  def stopSpark(): Unit = {
    if (sc != null){
      LOG.info("Stopping spark.")
      sc.stop()
    }
  }

  /**
    * Abstract method which has the actual job description.
    */
  def run(): Unit

  /**
    * Parses the arguments, initializes spark, runs the job and stops spark.
    * Spark is stopped even when the job throws; the exception still propagates.
    *
    * @param args command line arguments
    */
  def run(args:Array[String]): Unit = {
    parseArgs(args)
    initSpark()
    try {
      run()
    } finally {
      // release the context on failure too, not only on the success path
      stopSpark()
    }
  }
}
77 |
--------------------------------------------------------------------------------
/autoext/src/test/java/edu/usc/irds/autoext/utils/XPathEvaluatorTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import org.junit.Test;
20 | import org.w3c.dom.Element;
21 | import org.w3c.dom.NodeList;
22 |
23 | import javax.xml.parsers.DocumentBuilder;
24 | import javax.xml.parsers.DocumentBuilderFactory;
25 | import javax.xml.xpath.XPathExpression;
26 | import java.util.Set;
27 |
28 | import static org.junit.Assert.assertEquals;
29 | import static org.junit.Assert.assertTrue;
30 |
31 | /**
32 | * Created by tg on 1/16/16.
33 | */
34 | public class XPathEvaluatorTest {
35 |
36 | XPathEvaluator instance = new XPathEvaluator();
37 | Element docRoot ;
38 | {
39 | try {
40 | DocumentBuilder b = DocumentBuilderFactory.newInstance().newDocumentBuilder();
41 | docRoot = b.parse(getClass().getClassLoader()
42 | .getResourceAsStream("html/simple/1.html")).getDocumentElement();
43 | } catch (Exception e) {
44 | throw new RuntimeException(e);
45 | }
46 | }
47 |
48 | @Test
49 | public void testEval() throws Exception {
50 | XPathExpression titleExpr = instance.compile("//title/text()");
51 | NodeList list = instance.eval(docRoot, titleExpr);
52 | assertEquals(1, list.getLength());
53 | assertEquals("This is my page 1", list.item(0).getTextContent());
54 | }
55 |
56 | @Test
57 | public void testFindUniqueClassNames() throws Exception {
58 | Set names = instance.findUniqueClassNames(docRoot);
59 | assertEquals(6, names.size());
60 | assertTrue(names.contains("header"));
61 | assertTrue(names.contains("row"));
62 | assertTrue(names.contains("cell"));
63 | assertTrue(names.contains("col1"));
64 | assertTrue(names.contains("col2"));
65 | assertTrue(names.contains("table"));
66 | }
67 | }
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/hdfs/RawToSeq.scala:
--------------------------------------------------------------------------------
1 | package edu.usc.irds.autoext.hdfs
2 |
3 | import java.io.File
4 | import java.nio.file
5 | import java.nio.file.Paths
6 | import java.util.concurrent.atomic.AtomicInteger
7 |
8 | import edu.usc.irds.autoext.hdfs.RawToSeq.LOG
9 | import edu.usc.irds.autoext.spark.CliTool
10 | import edu.usc.irds.autoext.utils.Timer
11 | import org.apache.commons.io.FileUtils
12 | import org.apache.hadoop.conf.Configuration
13 | import org.apache.hadoop.fs.{FileSystem, Path}
14 | import org.apache.hadoop.io.{SequenceFile, Text}
15 | import org.apache.nutch.metadata.Metadata
16 | import org.apache.nutch.protocol.Content
17 | import org.kohsuke.args4j.Option
18 | import org.slf4j.LoggerFactory
19 |
/**
  * This tool creates a sequence file from raw HTML files.
  *
  * Every regular, non-hidden file found recursively under the input directory
  * is appended as one record keyed by its path, with content type "text/html".
  */
class RawToSeq extends CliTool {

  @Option(name = "-in", required = true, usage = "path to directory having html pages")
  var in: String = null

  @Option(name = "-out", required = true, usage = "path to output Sequence File")
  var output: String = null

  /**
    * Walks the input directory and writes the sequence file.
    * The writer is closed even when reading or appending a file fails.
    */
  def run(): Unit = {
    val config = new Configuration()
    val inDir = new File(in)

    val files = FileUtils.listFiles(inDir, null, true).iterator()
    val outPath = new Path(output)
    val writer = SequenceFile.createWriter(config, SequenceFile.Writer.keyClass(classOf[Text]),
      SequenceFile.Writer.valueClass(classOf[Content]), SequenceFile.Writer.file(outPath))

    val timer = new Timer
    val delay = 2000 // milliseconds between two progress log lines
    val count = new AtomicInteger()
    try {
      while (files.hasNext) {
        val nextFile = files.next()
        if (nextFile.isDirectory || nextFile.getName.startsWith(".")){
          //that's fine, skip it
        } else if (nextFile.isFile) {
          // the file path serves as both the record key and the content url
          val id = nextFile.getPath
          val allBytes: Array[Byte] = file.Files.readAllBytes(Paths.get(nextFile.getAbsolutePath))
          val content = new Content(id, id, allBytes, "text/html", new Metadata(), config)
          writer.append(new Text(id), content)
          count.incrementAndGet()
        } else {
          LOG.warn(s"Skip : $nextFile" )
        }
        if (timer.read() >= delay ){
          LOG.info(s"Count = $count, Last=$nextFile")
          timer.reset()
        }
      }
    } finally {
      // always release the output file, also when a read or append failed
      writer.close()
    }
    LOG.info(s"Done.. $count")
  }
}
67 |
/**
  * Command line entry point for [[RawToSeq]].
  */
object RawToSeq {

  val LOG = LoggerFactory.getLogger(RawToSeq.getClass)

  /**
    * Parses the arguments into a new tool instance and runs it.
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val tool = new RawToSeq()
    tool.parseArgs(args)
    tool.run()
  }
}
77 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/Timer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
/**
 * A simple reusable stopwatch for benchmarking code snippets.
 * All values are wall-clock milliseconds taken from {@link System#currentTimeMillis()}.
 *
 * @author Thamme Gowda
 */
public class Timer {

    private long start;
    private long end;
    /** {@code true} from a call to {@link #stop()} until the next {@link #start()} or {@link #reset()}. */
    private boolean stopped;

    /**
     * Creates a timer and also marks the start
     */
    public Timer() {
        restart();
    }

    /**
     * Starts the timer, discarding any previously recorded stop.
     * @see #reset() to reuse the timer
     */
    public void start() {
        restart();
    }

    /**
     * Resets the timer and returns the value before the reset
     * @return the previous value of the timer
     */
    public long reset() {
        long old = read();
        restart();
        return old;
    }

    /**
     * Stops the timer
     * @return the timer value at the stop
     * @see #read() to retrieve it later time
     */
    public long stop() {
        this.end = System.currentTimeMillis();
        this.stopped = true;
        return this.end - this.start;
    }

    /**
     * reads the timer value.
     * @return the time between start and stop when the timer is stopped,
     * otherwise the time elapsed since the start
     */
    public long read() {
        long reference = stopped ? this.end : System.currentTimeMillis();
        return reference - this.start;
    }

    /**
     * Gets the timestamp when this timer was started
     * @return start timestamp
     */
    public long getStart() {
        return start;
    }

    /**
     * Gets the timestamp when this timer was stopped.
     * @return stop timestamp. 0 if the timer was not stopped
     */
    public long getEnd() {
        return end;
    }

    /**
     * Marks "now" as the start and forgets any earlier stop, so that a stale
     * end timestamp can neither leak through {@link #getEnd()} nor freeze {@link #read()}.
     */
    private void restart() {
        this.start = System.currentTimeMillis();
        this.end = 0L;
        this.stopped = false;
    }
}
88 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/Main.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import edu.usc.irds.autoext.hdfs.RawToSeq
20 |
/**
  * Command line dispatcher: the first argument selects a sub command and the
  * remaining arguments are handed to the `main` method of the class registered for it.
  */
object Main {

  /** Sub command name -> (class whose static `main` is invoked, one line description). */
  val cmds = Map[String, (Class[_], String)](
    "help" -> (null, "Prints this help message."),
    "partition" -> (classOf[ContentPartitioner], "Partitions Nutch Content based on host names."),
    "keydump" -> (classOf[KeyDumper], "Dumps all the keys of sequence files(s)."),
    "grep" -> (classOf[ContentGrep], "Greps for the records which contains url and content type filters."),
    "merge" -> (classOf[ContentMerge], "Merges (smaller) part files into one large sequence file."),
    "similarity" -> (classOf[ContentSimilarityComputer], "Computes similarity between documents."),
    "sncluster" -> (classOf[SharedNeighborCuster], "Cluster using Shared near neighbor algorithm."),
    "simcombine" -> (classOf[SimilarityCombiner], "Combines two similarity measures on a linear scale."),
    "dedup" -> (classOf[DeDuplicator], "Removes duplicate documents (exact url matches)."),
    "d3export" -> (classOf[D3Export], "Exports clusters into most popular d3js format for clusters."),
    "createseq" -> (classOf[RawToSeq], "Creates a sequence file (compatible with Nutch Segment) from raw HTML files.")
  )

  /**
    * Prints a message followed by the list of known commands, then terminates the JVM.
    *
    * @param exitCode process exit status
    * @param msg first line to print
    */
  def printAndExit(exitCode: Int = 0, msg: String = "Usage "): Unit = {
    println(msg)
    println("Commands::")
    cmds.foreach { case (cmd, (_, desc)) => println(String.format(" %-9s - %s", cmd, desc)) }
    System.exit(exitCode)
  }

  /**
    * Entry point. An explicit `help` request exits with status 0; missing or
    * unknown commands print an error plus the command list and exit with status 1.
    *
    * @param args sub command name followed by that command's own arguments
    */
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      printAndExit(1, "Error: Invalid args")
    } else if (args(0).equalsIgnoreCase("help")) {
      // asking for help is not a failure
      printAndExit(0)
    } else {
      cmds.get(args(0)) match {
        case Some((cls, _)) if cls != null =>
          val method = cls.getDeclaredMethod("main", classOf[Array[String]])
          // the array is passed as the single argument of the target's main(String[])
          method.invoke(null, args.drop(1))
        case _ =>
          printAndExit(1, s"Error: Unknown command '${args(0)}'")
      }
    }
  }
}
55 |
--------------------------------------------------------------------------------
/apted/src/main/java/edu/usc/irds/ted/apted/util/LabelDictionary.java:
--------------------------------------------------------------------------------
1 | // The MIT License (MIT)
2 | // Copyright (c) 2016 Mateusz Pawlik and Nikolaus Augsten
3 | //
4 | // Permission is hereby granted, free of charge, to any person obtaining a copy
5 | // of this software and associated documentation files (the "Software"), to deal
6 | // in the Software without restriction, including without limitation the rights
7 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | // copies of the Software, and to permit persons to whom the Software is
9 | // furnished to do so, subject to the following conditions:
10 | //
11 | // The above copyright notice and this permission notice shall be included in
12 | // all copies or substantial portions of the Software.
13 | //
14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | // SOFTWARE.
21 |
22 | package edu.usc.irds.ted.apted.util;
23 |
24 | import java.io.Serializable;
25 | import java.util.Hashtable;
26 | import java.util.Map;
27 |
/**
 * Dictionary to store labels to integers mappings.
 * Ids are handed out sequentially starting at 0, in the order labels are first stored.
 *
 * @author Nikolaus Augsten
 *
 */
public class LabelDictionary implements Serializable {

    private static final long serialVersionUID = -5657129208276560195L;

    /** Id returned by {@link #store(String)} for an unknown label when new labels are not allowed. */
    public static final int KEY_DUMMY_LABEL = -1;

    // Field names are left untouched: they are part of the default serialized form.
    private int count;
    private Map<String, Integer> StrInt;
    private Map<Integer, String> IntStr;
    private boolean newLabelsAllowed;

    /**
     * Creates an empty dictionary which accepts new labels.
     */
    public LabelDictionary() {
        newLabelsAllowed = true;
        count = 0;
        StrInt = new Hashtable<>();
        IntStr = new Hashtable<>();
    }

    /**
     * Looks up the id of a label, registering the label first if it is unknown
     * and new labels are allowed.
     *
     * @param label the label to look up or register
     * @return the id of the label, or {@link #KEY_DUMMY_LABEL} when the label is
     * unknown and new labels are not allowed
     */
    public int store(String label) {
        Integer known = StrInt.get(label);
        if (known != null) {
            return known;
        }
        if (!newLabelsAllowed) {
            return KEY_DUMMY_LABEL;
        }
        Integer intKey = Integer.valueOf(count++);
        StrInt.put(label, intKey);
        IntStr.put(intKey, label);
        return intKey;
    }

    /**
     * Reverse lookup of a label by its id.
     *
     * @param labelID id previously returned by {@link #store(String)}
     * @return the label, or {@code null} when no label has this id
     */
    public String read(int labelID) {
        return IntStr.get(Integer.valueOf(labelID));
    }

    /**
     * @return {@code true} when unknown labels get registered by {@link #store(String)}
     */
    public boolean isNewLabelsAllowed() {
        return newLabelsAllowed;
    }

    /**
     * @param newLabelsAllowed whether {@link #store(String)} may register unknown labels
     */
    public void setNewLabelsAllowed(boolean newLabelsAllowed) {
        this.newLabelsAllowed = newLabelsAllowed;
    }
}
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/IOSparkJob.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.kohsuke.args4j.{CmdLineParser, Option}
20 |
/**
  * Base trait for Spark jobs that read from input path(s) and write to an output path.
  * The input is given either directly (`-in`) or as a file that lists paths (`-list`).
  */
trait IOSparkJob extends SparkJob {

  @Option(name = "-in", forbids = Array("-list"),
    usage = "path to a file/folder having input data")
  var inputPath: String = null

  @Option(name = "-list", forbids=Array("-in"),
    usage = "path to a file which contains many input paths (one path per line).")
  var listFilePath: String = null

  @Option(name = "-out", required = true, usage = "Path to file/folder where the output shall be stored")
  var outPath: String = null

  @Option(name = "-locallist", forbids = Array("-in"), depends = Array("-list"),
    usage = "When this flag is set the -list is forced to treat as local file." +
      " By default the list is read from distributed filesystem when applicable")
  var localList: Boolean = false

  /**
    * Parses the arguments via the parent job, then insists that at least one
    * input source was given; otherwise prints the usage and exits with status 1.
    *
    * @param args command line arguments
    */
  override def parseArgs(args: Array[String]): Unit = {
    super.parseArgs(args)
    val noInputGiven = inputPath == null && listFilePath == null
    if (noInputGiven) {
      System.err.println("Either -in or -list is required.")
      new CmdLineParser(this).printUsage(System.err)
      System.exit(1)
    }
  }

  /**
    * Gets input paths to this io job.
    * With `-in` this is that single path. With `-list` these are the trimmed
    * lines of the list file, without blank lines and lines starting with `#`.
    *
    * @return paths to job
    */
  def getInputPaths(): Array[String] = {

    // reads the list file, locally or through the spark context, and drops blanks and comments
    def readListedPaths(): Array[String] = {
      val rawLines: Array[String] =
        if (!localList) {
          sc.textFile(listFilePath).collect()
        } else {
          val source = scala.io.Source.fromFile(listFilePath)
          try source.getLines().toArray finally source.close()
        }
      rawLines.map(_.trim).filter(line => !(line.startsWith("#") || line.isEmpty))
    }

    if (inputPath != null) {
      Array(inputPath)
    } else if (listFilePath == null) {
      throw new RuntimeException("No input specified")
    } else {
      readListedPaths()
    }
  }

}
73 |
74 |
--------------------------------------------------------------------------------
/autoext-spark/src/main/scala/edu/usc/irds/autoext/spark/SimilarityCombiner.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.spark
18 |
19 | import org.apache.spark.mllib.linalg.distributed.MatrixEntry
20 | import org.kohsuke.args4j.Option
21 |
/**
  * Spark job that blends two similarity matrices cell by cell:
  * `out = in1 * weight + (1.0 - weight) * in2`.
  * Only cells present in both inputs appear in the output, because the
  * matrices are combined with a join on the (row, column) index.
  */
class SimilarityCombiner extends SparkJob {

  @Option(name = "-in1", required = true, usage = "Path to similarity Matrix 1 (Expected : saved MatrixEntry RDD).")
  var in1Path: String = null

  @Option(name = "-in2", required = true, usage = "Path to Similarity Matrix 2 (Expected : saved MatrixEntry RDD)")
  var in2Path: String = null

  @Option(name = "-out", required = true, usage = "Path to output file/folder where the result similarity matrix shall be stored.")
  var outPath: String = null

  @Option(name = "-weight", required = true,
    usage = "Weight/Scale for combining the similarities. The expected is [0.0, 1.0]. " +
      "The combining step is \n out = in1 * weight + (1.0 - weight) * in2")
  var weight: Double = -1.0

  /**
    * Validates the weight, loads both matrices, combines them and saves the result.
    *
    * @throws IllegalArgumentException when the weight is below 0 or above 1
    */
  override def run(): Unit = {
    // local copy: the functions below then refer to a plain value instead of this job's field
    val weight = this.weight
    if (weight < 0 || weight > 1) {
      throw new IllegalArgumentException(s"Weight $weight is out of bound. expected in range [0.0, 1.0]")
    }
    LOG.info(s"Combining $in1Path with $in2Path with scale $weight")

    // re-keys a matrix entry by its (row, column) index so that the matrices can be joined
    val byCell = (entry: MatrixEntry) => ((entry.i, entry.j), entry.value)
    val left = sc.objectFile[MatrixEntry](in1Path).map(byCell)
    val right = sc.objectFile[MatrixEntry](in2Path).map(byCell)

    val combined = left.join(right).map {
      case ((row, col), (sim1, sim2)) => MatrixEntry(row, col, weight * sim1 + (1 - weight) * sim2)
    }
    combined.saveAsObjectFile(outPath)
    LOG.info(s"Saved output at $outPath")
  }
}
58 |
/**
  * Command line entry point of the similarity combiner job.
  */
object SimilarityCombiner {

  /**
    * Creates a job instance and runs it with the given arguments.
    *
    * @param args command line arguments
    */
  def main(args: Array[String]): Unit = {
    val job = new SimilarityCombiner()
    job.run(args)
  }
}
64 |
65 |
66 |
--------------------------------------------------------------------------------
/autoext/src/test/java/edu/usc/irds/autoext/tree/StyleSimComputerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import edu.usc.irds.autoext.utils.ParseUtils;
20 | import org.junit.Test;
21 | import org.w3c.dom.Document;
22 |
23 | import java.util.Arrays;
24 | import java.util.HashSet;
25 | import java.util.Set;
26 |
27 | import static org.junit.Assert.*;
28 |
29 | /**
30 | * Created by tg on 1/16/16.
31 | */
32 | public class StyleSimComputerTest {
33 |
34 | StyleSimComputer instance = new StyleSimComputer();
35 |
36 | @Test
37 | public void testCountIntersection() throws Exception {
38 |
39 | Set a = new HashSet<>(Arrays.asList(1,2,3));
40 | Set b = new HashSet<>(Arrays.asList(3, 4, 5));
41 | assertEquals(1, instance.countIntersection(a, b));
42 | b.clear();
43 | assertEquals(0, instance.countIntersection(a, b));
44 | b.addAll(a);
45 | assertEquals(3, instance.countIntersection(a, b));
46 | }
47 |
48 | @Test
49 | public void testCompute() throws Exception {
50 | Document doc1 = ParseUtils.parseFile("src/test/resources/html/simple/1.html");
51 | Document doc2 = ParseUtils.parseFile("src/test/resources/html/simple/2.html");
52 | Document doc3 = ParseUtils.parseFile("src/test/resources/html/simple/3.html");
53 |
54 | TreeNode tree1 = new TreeNode(doc1.getDocumentElement(), null);
55 | TreeNode tree2 = new TreeNode(doc2.getDocumentElement(), null);
56 | TreeNode tree3 = new TreeNode(doc3.getDocumentElement(), null);
57 |
58 | assertEquals(1.0, instance.compute(tree1, tree1), 0.001);
59 | assertEquals(1.0, instance.compute(tree2, tree2), 0.001);
60 | assertEquals(1.0, instance.compute(tree3, tree3), 0.001);
61 | assertEquals(instance.compute(tree1, tree2), instance.compute(tree2, tree1), 0.001);
62 | assertEquals(instance.compute(tree1, tree3), instance.compute(tree3, tree1), 0.001);
63 | assertEquals(instance.compute(tree2, tree3), instance.compute(tree3, tree2), 0.001);
64 |
65 | assertEquals(0.9, instance.compute(tree1, tree2), 0.25);
66 | assertEquals(0.0, instance.compute(tree1, tree3), 0.25);
67 | assertEquals(0.0, instance.compute(tree2, tree3), 0.25);
68 |
69 | }
70 | }
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/BracketTreeGen.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import edu.usc.irds.autoext.tree.TreeNode;
20 | import org.kohsuke.args4j.CmdLineException;
21 | import org.kohsuke.args4j.CmdLineParser;
22 | import org.kohsuke.args4j.Option;
23 | import org.w3c.dom.Document;
24 | import org.xml.sax.SAXException;
25 |
26 | import java.io.File;
27 | import java.io.IOException;
28 | import java.net.URL;
29 | import java.nio.file.Files;
30 |
31 | /**
32 | * This is a CLI utility for converting an HTML file into
33 | * bracket notation labelled tree structure
34 | *
35 | */
36 | public class BracketTreeGen {
37 |
38 | @Option(name = "-in", usage = "Path to HTML file", forbids = {"-url"})
39 | private File htmlFile;
40 |
41 | @Option(name = "-url", usage = "URL of HTML doc", forbids = {"-in"})
42 | private URL htmlURL;
43 |
44 | @Option(name = "-out", usage = "Path to output file to store bracket notation tree")
45 | private File output;
46 |
47 | public static void main(String[] args) throws IOException, SAXException {
48 | //args = "-out sample.tree -url https://www.youtube.com/".split(" ");
49 | BracketTreeGen treeGen = new BracketTreeGen();
50 | CmdLineParser parser = new CmdLineParser(treeGen);
51 | try {
52 | parser.parseArgument(args);
53 | if (treeGen.htmlFile == null && treeGen.htmlURL == null){
54 | throw new CmdLineException("Either '-in' or '-url' is required");
55 | }
56 | } catch (CmdLineException e) {
57 | System.out.println(e.getLocalizedMessage());
58 | parser.printUsage(System.out);
59 | System.exit(-1);
60 | }
61 | Document doc;
62 | if (treeGen.htmlFile != null) {
63 | doc = ParseUtils.parseFile(treeGen.htmlFile.getPath());
64 |
65 | } else {
66 | doc = ParseUtils.parseURL(treeGen.htmlURL);
67 | }
68 | TreeNode node = new TreeNode(doc.getDocumentElement(), null);
69 | String bracketNotation = node.toBracketNotation();
70 | if (treeGen.output != null) {
71 | treeGen.output.getAbsoluteFile().getParentFile().mkdirs();
72 | Files.write(treeGen.output.toPath(), bracketNotation.getBytes("UTF-8"));
73 | System.out.println("Output stored in " + treeGen.output);
74 | } else {
75 | // dump to STDOUT
76 | System.out.println(bracketNotation);
77 | }
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/autoext/src/test/java/edu/usc/irds/autoext/tree/GrossSimComputerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import edu.usc.irds.autoext.base.SimilarityComputer;
20 | import edu.usc.irds.autoext.utils.ParseUtils;
21 | import org.junit.Test;
22 | import org.w3c.dom.Document;
23 |
24 | import java.util.Arrays;
25 |
26 | import static org.junit.Assert.assertEquals;
27 |
28 | /**
29 | * Created by tg on 1/16/16.
30 | */
31 | public class GrossSimComputerTest {
32 |
33 | @Test
34 | public void testCompute() throws Exception {
35 |
36 | SimilarityComputer caseSensitiveComputer = new SimilarityComputer() {
37 | @Override
38 | public double compute(String obj1, String obj2) {
39 | return obj1.equals(obj2) ? 1.0 : 0.0;
40 | }
41 | };
42 |
43 | SimilarityComputer caseInsensitiveComputer = new SimilarityComputer() {
44 | @Override
45 | public double compute(String obj1, String obj2) {
46 | return obj1.toLowerCase().equals(obj2.toLowerCase()) ? 1.0 : 0.0;
47 | }
48 | };
49 |
50 | GrossSimComputer computer = new GrossSimComputer<>(Arrays.asList(caseSensitiveComputer, caseInsensitiveComputer), Arrays.asList(0.5, 0.5));
51 | assertEquals(1.0, computer.compute("abcd", "abcd"), 0.00001);
52 | assertEquals(0.5, computer.compute("abcd", "ABCD"), 0.00001);
53 | assertEquals(0.0, computer.compute("aaa", "bbbb"), 0.00001);
54 | }
55 |
56 | @Test
57 | public void testCreateWebSimilarityComputer() throws Exception {
58 | GrossSimComputer simComputer = GrossSimComputer.createWebSimilarityComputer();
59 |
60 | Document doc1 = ParseUtils.parseFile("src/test/resources/html/simple/1.html");
61 | Document doc2 = ParseUtils.parseFile("src/test/resources/html/simple/2.html");
62 | Document doc3 = ParseUtils.parseFile("src/test/resources/html/simple/3.html");
63 |
64 | TreeNode tree1 = new TreeNode(doc1.getDocumentElement(), null);
65 | TreeNode tree2 = new TreeNode(doc2.getDocumentElement(), null);
66 | TreeNode tree3 = new TreeNode(doc3.getDocumentElement(), null);
67 | assertEquals(1.0, simComputer.compute(tree1, tree1), 0.0001);
68 | assertEquals(1.0, simComputer.compute(tree2, tree2), 0.0001);
69 | assertEquals(1.0, simComputer.compute(tree3, tree3), 0.0001);
70 | assertEquals(0.9, simComputer.compute(tree1, tree2), 0.1);
71 | assertEquals(0.3, simComputer.compute(tree1, tree3), 0.1);
72 | assertEquals(0.3, simComputer.compute(tree2, tree3), 0.1);
73 | }
74 | }
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/tree/StyleSimComputer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import edu.usc.irds.autoext.base.SimilarityComputer;
20 | import edu.usc.irds.autoext.utils.XPathEvaluator;
21 | import org.w3c.dom.Element;
22 |
23 | import java.io.Serializable;
24 | import java.util.Set;
25 |
26 | /**
27 | * Computes CSS style Similarity between two DOM trees
28 | */
29 | public class StyleSimComputer extends SimilarityComputer implements Serializable {
30 |
31 | private static final long serialVersionUID = 6680072428272456472L;
32 | private static XPathEvaluator xPathUtil = new XPathEvaluator();
33 |
34 | /**
35 | * Computes the stylistic similarity
36 | * @param elem1 first element
37 | * @param elem2 second element
38 | * @returnt the style similarity
39 | */
40 | public double compute(Element elem1, Element elem2) {
41 | Set setA = xPathUtil.findUniqueClassNames(elem1);
42 | Set setB = xPathUtil.findUniqueClassNames(elem2);
43 | int modA = setA.size();
44 | int modB = setB.size();
45 | if (modA == 0 && modB == 0) {
46 | //Cant be determined by jaccards similarity;
47 | // however, by definition, they are very similar in empty style
48 | return 1.0;
49 | }
50 | int intersectSize = countIntersection(setA, setB);
51 | // the jaccards similarity
52 | return (double) intersectSize / (modA + modB - intersectSize);
53 | }
54 |
55 | /**
56 | * Computes the size of intersection of two sets
57 | * @param small first set. preferably smaller than the second argument
58 | * @param large second set;
59 | * @param the type
60 | * @return size of intersection of sets
61 | */
62 | public int countIntersection(Set small, Set large){
63 | //assuming first argument to be smaller than the later;
64 | //however double checking to be sure
65 | if (small.size() > large.size()) {
66 | //swap the references;
67 | Set tmp = small;
68 | small = large;
69 | large = tmp;
70 | }
71 | int result = 0;
72 | for (T item : small) {
73 | if (large.contains(item)){
74 | //item found in both the sets
75 | result++;
76 | }
77 | }
78 | return result;
79 | }
80 |
81 |
82 | @Override
83 | public double compute(TreeNode obj1, TreeNode obj2) {
84 | //TODO: resolve the casts.. This could cause type cast errors
85 | return compute((Element) obj1.innerNode, (Element) obj2.innerNode);
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/XPathEvaluator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import org.slf4j.Logger;
20 | import org.slf4j.LoggerFactory;
21 | import org.w3c.dom.Element;
22 | import org.w3c.dom.NodeList;
23 |
24 | import javax.xml.xpath.XPathConstants;
25 | import javax.xml.xpath.XPathExpression;
26 | import javax.xml.xpath.XPathExpressionException;
27 | import javax.xml.xpath.XPathFactory;
28 | import java.io.Serializable;
29 | import java.util.Collections;
30 | import java.util.HashSet;
31 | import java.util.Set;
32 |
33 | /**
34 | * An utility for evaluating XPath expressions on Documents
35 | * @author Thamme Gowda
36 | * @since Jan 16, 2016
37 | */
38 | public class XPathEvaluator implements Serializable {
39 |
40 | public static final Logger LOG = LoggerFactory.getLogger(XPathEvaluator.class.getName());
41 | private static final String CLASS_VAL_XPATH = "//*[@class]/@class";
42 | private static final long serialVersionUID = -4886553689128529323L;
43 |
44 | private XPathFactory xPathFactory;
45 | private XPathExpression cssClassValExprsn;
46 |
47 | public XPathEvaluator() {
48 | xPathFactory = XPathFactory.newInstance();
49 | try {
50 | cssClassValExprsn = compile(CLASS_VAL_XPATH);
51 | } catch (XPathExpressionException e) {
52 | LOG.error(e.getMessage(), e);
53 | throw new RuntimeException(e);
54 | }
55 | }
56 |
57 | public XPathExpression compile(String expression) throws XPathExpressionException {
58 | return xPathFactory.newXPath().compile(expression);
59 | }
60 |
61 | /**
62 | * Evaluates the given xpath expression on input DOM Element
63 | * @param element Root element
64 | * @param expression Xpath expression
65 | * @return List of Nodes obtained by evaluating the nodes
66 | * @throws XPathExpressionException when the xpath expression is invalid
67 | */
68 | public NodeList eval(Element element, XPathExpression expression)
69 | throws XPathExpressionException {
70 | return (NodeList) expression.evaluate(element, XPathConstants.NODESET);
71 | }
72 |
73 |
74 | /**
75 | * Finds all unique class names from a DOM tree rooted at given element
76 | * @param element the root element of the DOM tree
77 | * @return Set of class names
78 | */
79 | public Set findUniqueClassNames(Element element){
80 | try {
81 | NodeList list = eval(element, cssClassValExprsn);
82 | Set cssClasses = new HashSet<>();
83 | for (int i = 0; i < list.getLength(); i++) {
84 | Collections.addAll(cssClasses,
85 | list.item(i).getTextContent().trim().split("\\s+"));
86 | }
87 | return cssClasses;
88 | } catch (XPathExpressionException e) {
89 | LOG.error(e.getMessage(), e);
90 | throw new RuntimeException(e);
91 | }
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/apted/StringToIntMapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.apted;
18 |
19 | import edu.usc.irds.lang.Function;
20 |
21 | import java.io.BufferedWriter;
22 | import java.io.File;
23 | import java.io.FileNotFoundException;
24 | import java.io.IOException;
25 | import java.io.PrintWriter;
26 | import java.io.Serializable;
27 | import java.util.HashMap;
28 | import java.util.Map;
29 | import java.util.concurrent.atomic.AtomicInteger;
30 |
31 | /**
32 | * Mapper for converting strings to integer.
33 | * Uses counters for mapping.
34 | * Optionally writes contents to file
35 | */
36 | public class StringToIntMapper implements Function, AutoCloseable, Serializable {
37 |
38 | private Map map = new HashMap<>();
39 | private Map reverseMap = new HashMap<>();
40 | private boolean persist = false;
41 | private AtomicInteger counter = new AtomicInteger(0);
42 | private BufferedWriter writer;
43 |
44 | /**
45 | * creates a mapper instance which uses counters.
46 | * For persistent based mapper see {@link #StringToIntMapper(File)}
47 | */
48 | public StringToIntMapper(){
49 | }
50 |
51 | /**
52 | * This instance writes the mapping to given file.
53 | * Should be closed at the end to flush the contents to file
54 | * @param file file instance
55 | * @throws FileNotFoundException
56 | */
57 | public StringToIntMapper(File file) throws FileNotFoundException {
58 | this();
59 | this.persist = true;
60 | this.writer = new BufferedWriter(new PrintWriter(file));
61 | }
62 |
63 | @Override
64 | public Integer apply(String obj) {
65 | return this.map(obj);
66 | }
67 |
68 | /**
69 | * Maps a string to integer
70 | * @param obj the object which requires mapping
71 | * @return integer obtained after mapping
72 | */
73 | public Integer map(String obj){
74 | Integer mapped = map.get(obj);
75 | if (mapped == null) {
76 | mapped = counter.incrementAndGet();
77 | map.put(obj, mapped);
78 | reverseMap.put(mapped, obj);
79 | if (persist){
80 | try {
81 | writer.write(obj);
82 | writer.write("\n");
83 | } catch (IOException e) {
84 | e.printStackTrace();
85 | }
86 | }
87 | }
88 | return mapped;
89 | }
90 |
91 | /**
92 | * returns key that was mapped to this value
93 | * @param val the value for reverse lookup
94 | * @return String if present, null if not present
95 | */
96 | public String reverseMap(Integer val){
97 | return this.reverseMap.get(val);
98 | }
99 |
100 | @Override
101 | public void close() throws Exception {
102 | if (writer != null) {
103 | writer.close();
104 | }
105 | }
106 |
107 | }
108 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/tree/GrossSimComputer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.tree;
18 |
19 | import edu.usc.irds.autoext.Config;
20 | import edu.usc.irds.autoext.base.SimilarityComputer;
21 | import edu.usc.irds.autoext.utils.Checks;
22 |
23 | import java.io.Serializable;
24 | import java.util.ArrayList;
25 | import java.util.Arrays;
26 | import java.util.List;
27 |
28 | /**
29 | * Aggregates the similarities from several similarity computers
30 | * @author Thamme Gowda N
31 | * @since Jan 16, 2016
32 | */
33 | public class GrossSimComputer extends SimilarityComputer implements Serializable {
34 |
35 | private static final long serialVersionUID = -6461871245307945046L;
36 | private final List> computers;
37 | private final List weights;
38 | private int n;
39 |
40 | /**
41 | * Creates a similarity aggregator
42 | * @param computers list of similarity computers
43 | * @param weights list of weights to the computers.
44 | * The weight at the index i in this array specifies the weight for similaritycomputer at i in the argument 1.
45 | * The sum of all weights should add to 1.0
46 | */
47 | public GrossSimComputer(List> computers,
48 | List weights) {
49 | this.computers = computers;
50 | this.weights = weights;
51 | Checks.check(computers.size() == weights.size(),
52 | "The size of computers and weights should match");
53 | double sum = 0.0;
54 | for (Double weight : weights) {
55 | sum += weight;
56 | }
57 | Checks.check(Math.abs(1.0 - sum) <= 0.001,
58 | "The sum of all the weights must add up to 1.0");
59 | this.n = weights.size();
60 | }
61 |
62 | @Override
63 | public double compute(T obj1, T obj2) {
64 | double result = 0.0;
65 | for (int i = 0; i < n; i++) {
66 | result += computers.get(i).compute(obj1, obj2) * weights.get(i);
67 | }
68 | return result;
69 | }
70 |
71 | /**
72 | * A factory method for creating similarity computer that aggregates structural and stylistic measures
73 | * @return the similarity computer that internally aggregates structure and style measures;
74 | */
75 | public static GrossSimComputer createWebSimilarityComputer(){
76 | double structureSimWeight = Config.getInstance().getSimWeight();
77 | Checks.check(structureSimWeight <= 1.0 && structureSimWeight >= 0.0, "The weight should be in between [0.0, 1.0]");
78 | StructureSimComputer structSimComputer = new StructureSimComputer();
79 | StyleSimComputer styleSimComputer = new StyleSimComputer();
80 | List> similarityComputers = new ArrayList<>();
81 | similarityComputers.add(structSimComputer);
82 | similarityComputers.add(styleSimComputer);
83 | List weights = Arrays.asList(structureSimWeight, 1.0 - structureSimWeight);
84 | return new GrossSimComputer<>(similarityComputers, weights);
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/apted/APTEDComputer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.apted;
18 |
19 | import edu.usc.irds.autoext.base.EditCost;
20 | import edu.usc.irds.autoext.base.EditDistanceComputer;
21 | import edu.usc.irds.autoext.tree.TreeNode;
22 | import edu.usc.irds.lang.Function;
23 | import edu.usc.irds.ted.apted.APTED;
24 | import edu.usc.irds.ted.apted.util.LblTree;
25 |
26 | import java.io.Serializable;
27 | import java.util.List;
28 |
29 | /**
30 | *
31 | * This TED is based on AP-TED algorithm of Mateusz Pawlik and Nikolaus Augsten.
32 | * Refer to http://tree-edit-distance.dbresearch.uni-salzburg.at for more details
33 | *
34 | * @see APTED
35 | */
36 | public class APTEDComputer
37 | implements EditDistanceComputer, Serializable {
38 |
39 | public static final float INSERT_COST = 1;
40 | public static final float DELETE_COST = 1;
41 | public static final float REPLACE_COST = 1;
42 | public static final float MAX_UNIT = Math.max(Math.max(INSERT_COST, DELETE_COST), REPLACE_COST);
43 |
44 | public static class APTEDMetric implements EditCost, Serializable{
45 |
46 | @Override
47 | public double getInsertCost(Object node) {
48 | return INSERT_COST;
49 | }
50 |
51 | @Override
52 | public double getRemoveCost(Object node) {
53 | return DELETE_COST;
54 | }
55 |
56 | @Override
57 | public double getReplaceCost(Object node1, Object node2) {
58 | return REPLACE_COST;
59 | }
60 |
61 | @Override
62 | public double getNoEditCost() {
63 | return 0;
64 | }
65 |
66 | @Override
67 | public double getMaxUnitCost() {
68 | return MAX_UNIT;
69 | }
70 |
71 | @Override
72 | public boolean isSymmetric() {
73 | return true;
74 | }
75 | }
76 |
77 | private APTEDMetric cost = new APTEDMetric();
78 | private StringToIntMapper idMapper = new StringToIntMapper();
79 |
80 | @Override
81 | public double computeDistance(TreeNode object1, TreeNode object2) {
82 | APTED ted = new APTED(DELETE_COST, INSERT_COST, REPLACE_COST);
83 | LblTree tree1 = transform(object1, idMapper);
84 | LblTree tree2 = transform(object2, idMapper);
85 | return ted.nonNormalizedTreeDist(tree1, tree2);
86 | }
87 |
88 | @Override
89 | public EditCost getCostMetric() {
90 | return cost;
91 | }
92 |
93 |
94 | /**
95 | * Transforms TreeNode to LblNode
96 | * @param node TreeNode
97 | * @param idMapper mapper function that converts string id to integer id
98 | * @return an instance of LblTree
99 | */
100 | public static LblTree transform(TreeNode node, Function idMapper){
101 | int treeID = idMapper != null ? idMapper.apply(node.getExternalId()) : -1;
102 | LblTree result = new LblTree(node.getNodeName(), treeID);
103 | List children = node.getChildren();
104 | if (children != null) {
105 | for (TreeNode child : children) {
106 | result.add(transform(child, idMapper));
107 | }
108 | }
109 | return result;
110 | }
111 |
112 |
113 | }
114 |
--------------------------------------------------------------------------------
/autoext/src/main/java/edu/usc/irds/autoext/utils/D3JsFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package edu.usc.irds.autoext.utils;
18 |
19 | import com.google.gson.Gson;
20 |
21 | import java.io.FileWriter;
22 | import java.io.IOException;
23 | import java.io.Writer;
24 | import java.util.ArrayList;
25 | import java.util.HashMap;
26 | import java.util.List;
27 | import java.util.Map;
28 |
29 | /**
30 | * Utilities for transforming data to d3js format
31 | */
32 | public class D3JsFormat {
33 |
34 | public static final String INDEX_KEY = "index";
35 | public static final String CHILDREN_KEY = "children";
36 | public static final String SIZE = "size";
37 | public static final String NAME_KEY = "name";
38 | public static final String CREATED_AT = "createdAt";
39 |
40 | /**
41 | *
42 | * @param name name for top level cluster
43 | * @param clusters cluster details
44 | * @param nameMap mapping indices back to labels
45 | * @param scaleFactor scale factor for magnifying the cluster size
46 | */
47 | public static String formatClusters(String name,
48 | Map> clusters,
49 | Map nameMap,
50 | final double scaleFactor){
51 |
52 | final Map nameMapFinal = nameMap == null ?
53 | new HashMap() : nameMap;
54 |
55 | Map result = new HashMap<>();
56 | result.put(NAME_KEY, name);
57 | result.put(INDEX_KEY, -1);
58 | result.put(SIZE, clusters.size() * scaleFactor);
59 | result.put(CREATED_AT, System.currentTimeMillis());
60 |
61 | List