├── .classpath
├── .gitignore
├── .project
├── .settings
└── org.eclipse.jdt.ui.prefs
├── README.textile
├── build.xml
├── lib
├── guava-r09.jar
├── guava-src-r09.zip
├── junit-4.8.2-src.jar
├── junit-4.8.2.jar
├── nekohtml.jar
└── xercesImpl.jar
├── pg100.txt
└── src
└── spinfo
├── Collation.java
├── CollectionsGenerics.java
├── Crawling.java
├── EditDistance.java
├── HashTables.java
├── Index.java
├── Lists.java
├── Quicksort.java
├── SortSearch.java
├── TestSuite.java
├── Trees.java
└── package-info.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bin
2 | build
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | java
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | #Mon May 23 17:35:24 CEST 2011
2 | eclipse.preferences.version=1
3 | editor_save_participant_org.eclipse.jdt.ui.postsavelistener.cleanup=true
4 | formatter_settings_version=12
5 | sp_cleanup.add_default_serial_version_id=true
6 | sp_cleanup.add_generated_serial_version_id=false
7 | sp_cleanup.add_missing_annotations=true
8 | sp_cleanup.add_missing_deprecated_annotations=true
9 | sp_cleanup.add_missing_methods=false
10 | sp_cleanup.add_missing_nls_tags=false
11 | sp_cleanup.add_missing_override_annotations=true
12 | sp_cleanup.add_missing_override_annotations_interface_methods=true
13 | sp_cleanup.add_serial_version_id=false
14 | sp_cleanup.always_use_blocks=true
15 | sp_cleanup.always_use_parentheses_in_expressions=false
16 | sp_cleanup.always_use_this_for_non_static_field_access=false
17 | sp_cleanup.always_use_this_for_non_static_method_access=false
18 | sp_cleanup.convert_to_enhanced_for_loop=false
19 | sp_cleanup.correct_indentation=false
20 | sp_cleanup.format_source_code=true
21 | sp_cleanup.format_source_code_changes_only=false
22 | sp_cleanup.make_local_variable_final=false
23 | sp_cleanup.make_parameters_final=false
24 | sp_cleanup.make_private_fields_final=true
25 | sp_cleanup.make_type_abstract_if_missing_method=false
26 | sp_cleanup.make_variable_declarations_final=true
27 | sp_cleanup.never_use_blocks=false
28 | sp_cleanup.never_use_parentheses_in_expressions=true
29 | sp_cleanup.on_save_use_additional_actions=false
30 | sp_cleanup.organize_imports=true
31 | sp_cleanup.qualify_static_field_accesses_with_declaring_class=false
32 | sp_cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true
33 | sp_cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true
34 | sp_cleanup.qualify_static_member_accesses_with_declaring_class=false
35 | sp_cleanup.qualify_static_method_accesses_with_declaring_class=false
36 | sp_cleanup.remove_private_constructors=true
37 | sp_cleanup.remove_trailing_whitespaces=false
38 | sp_cleanup.remove_trailing_whitespaces_all=true
39 | sp_cleanup.remove_trailing_whitespaces_ignore_empty=false
40 | sp_cleanup.remove_unnecessary_casts=true
41 | sp_cleanup.remove_unnecessary_nls_tags=false
42 | sp_cleanup.remove_unused_imports=false
43 | sp_cleanup.remove_unused_local_variables=false
44 | sp_cleanup.remove_unused_private_fields=true
45 | sp_cleanup.remove_unused_private_members=false
46 | sp_cleanup.remove_unused_private_methods=true
47 | sp_cleanup.remove_unused_private_types=true
48 | sp_cleanup.sort_members=false
49 | sp_cleanup.sort_members_all=false
50 | sp_cleanup.use_blocks=false
51 | sp_cleanup.use_blocks_only_for_return_and_throw=false
52 | sp_cleanup.use_parentheses_in_expressions=false
53 | sp_cleanup.use_this_for_non_static_field_access=false
54 | sp_cleanup.use_this_for_non_static_field_access_only_if_necessary=true
55 | sp_cleanup.use_this_for_non_static_method_access=false
56 | sp_cleanup.use_this_for_non_static_method_access_only_if_necessary=true
57 |
--------------------------------------------------------------------------------
/README.textile:
--------------------------------------------------------------------------------
1 | "Java at the University of Cologne, Department of Linguistics (Sprachliche Informationsverarbeitung)":http://spinfo.uni-koeln.de/spinfo-java.html
--------------------------------------------------------------------------------
/build.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
--------------------------------------------------------------------------------
/lib/guava-r09.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-r09.jar
--------------------------------------------------------------------------------
/lib/guava-src-r09.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-src-r09.zip
--------------------------------------------------------------------------------
/lib/junit-4.8.2-src.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2-src.jar
--------------------------------------------------------------------------------
/lib/junit-4.8.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2.jar
--------------------------------------------------------------------------------
/lib/nekohtml.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/nekohtml.jar
--------------------------------------------------------------------------------
/lib/xercesImpl.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/xercesImpl.jar
--------------------------------------------------------------------------------
/src/spinfo/Collation.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static org.junit.Assert.assertEquals;
6 | import static org.junit.Assert.assertFalse;
7 |
8 | import java.text.CollationKey;
9 | import java.text.Collator;
10 | import java.text.ParseException;
11 | import java.text.RuleBasedCollator;
12 | import java.util.Arrays;
13 | import java.util.Collections;
14 | import java.util.Comparator;
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Locale;
18 | import java.util.Map;
19 |
20 | import org.junit.Test;
21 |
22 | /** Lexicographic sorting and collation in Java. */
23 | public class Collation {
24 |
25 | List words = Arrays.asList("Very", "Über", "very", "ultra", "über");
26 |
27 | @Test
28 | public void basicProblem() {
29 | /*
30 | * The basic problem: Java's default sorting for strings (based on the
31 | * character's Unicode position) is insufficient for lexicographic ordering:
32 | */
33 | Collections.sort(words);
34 | /*
35 | * Because it sorts all upper case letters before all lowercase letters and
36 | * sorts all letters with diacritics behind all standard letters:
37 | */
38 | assertEquals(Arrays.asList("Very", "ultra", "very", "Über", "über"), words);
39 | /* Which is not what we would expect (upper, lower, diacritics together): */
40 | assertFalse(Arrays.asList("ultra", "Über", "über", "Very", "very").equals(
41 | words));
42 | }
43 |
44 | @Test
45 | public void basicSolution() {
46 | /* Idea: map the chars to their correct position, and sort by that mapping: */
47 | final Map<Character, Integer> collationKeys = new HashMap<Character, Integer>();
48 | collationKeys.put('U', 1); // or lower-level, with array: char['U'] = 1;
49 | collationKeys.put('u', 2);
50 | collationKeys.put('\u00dc', 3); // Ü
51 | collationKeys.put('\u00fc', 4); // ü
52 | collationKeys.put('V', 5);
53 | collationKeys.put('v', 6);
54 | /* We pass a custom sorting strategy (typed for strings) to the sort method: */
55 | Collections.sort(words, new Comparator<String>() {
56 | @Override
57 | public int compare(String s1, String s2) {
58 | /* For this sample, we only look at the first letter: */
59 | Character c1 = s1.charAt(0);
60 | Character c2 = s2.charAt(0);
61 | /* We don't compare the chars, but their (Integer) collation keys: */
62 | return collationKeys.get(c1).compareTo(collationKeys.get(c2));
63 | }
64 | });
65 | /* For our specific case, this results in a somewhat correct order: */
66 | assertEquals(Arrays.asList("ultra", "Über", "über", "Very", "very"), words);
67 | }
68 |
69 | @Test
70 | public void collator() {
71 | /* Java contains region-specific collation rules, via Collator: */
72 | final Collator collator = Collator.getInstance(Locale.GERMAN);
73 | Collections.sort(words, new Comparator<String>() {
74 | @Override
75 | public int compare(String s1, String s2) {
76 | return collator.compare(s1, s2); // delegate to the German rules
77 | }
78 | });
79 | /* Which gets the details right, e.g. sort umlauts like their standards: */
80 | assertEquals(Arrays.asList("über", "Über", "ultra", "very", "Very"), words);
81 | }
82 |
83 | @Test
84 | public void comparable() {
85 | /*
86 | * If we control the objects sorted (unlike strings), and the sorting does
87 | * not depend on something external to the objects (unlike above, where we
88 | * sort chars by their keys), we can define the order inside our objects:
89 | */
90 | List words = Arrays.asList(new Word("Very"), new Word("ultra"),
91 | new Word("über"), new Word("Super"));
92 | /* From the usage side, it now looks like the default sorting just works: */
93 | Collections.sort(words);
94 | assertEquals(Arrays.asList(new Word("Super"), new Word("über"), new Word(
95 | "ultra"), new Word("Very")), words);
96 | }
97 |
98 | static class Word implements Comparable<Word> {
99 | /* Shared by all words: only keys made by the same collator are comparable. */
100 | private static final Collator COLLATOR = Collator.getInstance(); // uses system locale
101 | private final String val;
102 | private final CollationKey key;
103 |
104 | public Word(String val) {
105 | this.val = val;
106 | this.key = COLLATOR.getCollationKey(val); // precompute the key
107 | }
108 |
109 | @Override
110 | public int compareTo(Word that) {
111 | // return this.val.compareTo(that.val); // naive, not sufficient
112 | /* Instead of comparing the vals, we can pass them to the collator: */
113 | // return COLLATOR.compare(this.val, that.val); // always computes keys
114 | /* To improve performance, we precompute the keys, and compare these: */
115 | return this.key.compareTo(that.key);
116 | }
117 |
118 | /* Java standard method implementations below, needed for testing here: */
119 |
120 | @Override
121 | public String toString() {
122 | return val;
123 | }
124 |
125 | @Override
126 | public boolean equals(Object that) {
127 | return that instanceof Word && ((Word) that).val.equals(this.val);
128 | }
129 |
130 | @Override
131 | public int hashCode() {
132 | return val.hashCode(); // mandatory if equals, consistent with equals
133 | }
134 | }
135 |
136 | @Test
137 | public void customRules() throws ParseException {
138 | List w = Arrays.asList("Löss", "Lee", "Luv", "Löß");
139 | /* Default collator: ß after ss */
140 | sortWithCollator(w, Collator.getInstance(Locale.GERMAN)); // default german
141 | assertEquals(Arrays.asList("Lee", "Löss", "Löß", "Luv"), w);
142 | /* Custom requirement: sort ß before ss (old German spelling rules) */
143 | String defaultRules = ((RuleBasedCollator) RuleBasedCollator
144 | .getInstance(Locale.GERMAN)).getRules();
145 | String customRules = "ß < ss"; // additional custom rule, replaces default
146 | final Collator collator = new RuleBasedCollator(defaultRules + customRules);
147 | sortWithCollator(w, collator);
148 | assertEquals(Arrays.asList("Lee", "Löß", "Löss", "Luv"), w);
149 | }
150 |
151 | private void sortWithCollator(List<String> words, final Collator collator) {
152 | /* A Collator is a Comparator itself, so it is passed to sort directly: */
153 | Collections.sort(words, collator);
154 | }
159 |
160 | }
--------------------------------------------------------------------------------
/src/spinfo/CollectionsGenerics.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static org.junit.Assert.assertEquals;
6 | import static org.junit.Assert.assertTrue;
7 |
8 | import java.util.ArrayList;
9 | import java.util.Arrays;
10 | import java.util.Collection;
11 | import java.util.Collections;
12 | import java.util.Deque;
13 | import java.util.HashMap;
14 | import java.util.HashSet;
15 | import java.util.LinkedList;
16 | import java.util.List;
17 | import java.util.Map;
18 | import java.util.Set;
19 | import java.util.SortedMap;
20 | import java.util.SortedSet;
21 | import java.util.TreeMap;
22 | import java.util.TreeSet;
23 |
24 | import org.junit.Test;
25 |
26 | /** Generic data structures and algorithms: Java generics and collections. */
27 | public class CollectionsGenerics {
28 |
29 | /* Generics */
30 |
31 | @Test
32 | public void basicGenerics() {
33 | /*
34 | * A List can contain different types of elements. We specify the type as a
35 | * parameter for the List class, e.g. Integer:
36 | */
37 | List ints = Arrays.asList(1, 2, 3, 4, 5); // auto-boxed
38 | assertTrue(ints.get(0) instanceof Integer);
39 | /* Or String: */
40 | List strings = Arrays.asList("one", "two");
41 | assertTrue(strings.get(0) instanceof String);
42 | /* Since List can be used with different types, it's called a generic class. */
43 | }
44 |
45 | @Test
46 | public void genericMethods() {
47 | /* As method used above, but implemented below (T is inferred as String): */
48 | List<String> s = asList(new String[] { "one", "two" });
49 | /* Optional here: explicit type parameter: */
50 | s = CollectionsGenerics.<String> asList(new String[] { "one", "two" });
51 | assertEquals("one", s.get(0));
52 | assertEquals("two", s.get(1));
53 | }
54 |
55 | /* A generic method: declares a type parameter T (inferred or explicit) */
56 | private static <T> List<T> asList(T[] ts) {
57 | List<T> result = new ArrayList<T>(); // choose List impl. on creation
58 | for (T t : ts)
59 | result.add(t); // copies the elements, in array order
60 | return result;
61 | }
62 |
63 | @Test
64 | public void genericClasses() {
65 | /* Like for List above, we can also use generics on our classes: */
66 | Tree tree = new Tree();
67 | tree.root = new Node("value");
68 | assertEquals("value", tree.root.value);
69 | assertTrue(tree.root.value instanceof String);
70 | }
71 |
72 | static class Tree<T> { // T becomes concrete on creation, e.g. String,
73 | // Integer, etc.
74 | Node<T> root;
75 | }
76 |
77 | static class Node<T> { // a node holds a value of the tree's element type
78 | T value;
79 | Node<T> left;
80 | Node<T> right;
81 |
82 | public Node(T value) {
83 | this.value = value;
84 | }
85 | }
86 |
87 | /* Collections */
88 |
89 | @Test
90 | public void collections() {
91 | List list = new ArrayList(); // refer by interface
92 | Collection coll = list; // List is a Collection
93 | Iterable iter = coll; // Collection is Iterable
94 | assertTrue(iter instanceof Iterable);
95 | assertTrue(iter instanceof Collection);
96 | assertTrue(iter instanceof List);
97 | assertTrue(iter instanceof ArrayList);
98 | /* The Collection Interface defines 4 kinds of methods: */
99 | coll.add("hi"); // 1. methods for adding elements (also addAll, ...)
100 | coll.remove("hi"); // 2. methods for removing elements (also removeAll, ...)
101 | coll.contains("hi"); // 3. methods for querying (also containsAll, ...)
102 | coll.toArray(new String[0]); // 4. methods for conversion (iterator, ...)
103 | }
104 |
105 | @Test
106 | public void sets() {
107 | Set set = new HashSet(); // no duplicates, no order
108 | set = new TreeSet(); // change impl: no duplicates, sorted, tree
109 | assertTrue(set instanceof Collection);
110 | assertTrue(set instanceof Set);
111 | assertTrue(set instanceof TreeSet);
112 | assertTrue(set instanceof SortedSet); // additional interface
113 | set.add("hi"); // O(1) for HashSet, O(log n) for TreeSet
114 | assertTrue(set.contains("hi")); // O(1) for HashSet, O(log n) for TreeSet
115 | set.add("hi"); // add existing value, should not be added
116 | assertEquals(1, set.size()); // no duplicates
117 | }
118 |
119 | @Test
120 | public void lists() {
121 | List list = new ArrayList(); // array-based impl.
122 | list = new LinkedList(); // linked list impl.
123 | assertTrue(list instanceof Collection);
124 | assertTrue(list instanceof List);
125 | assertTrue(list instanceof LinkedList);
126 | assertTrue(list instanceof Deque); // additional interface
127 | list.add("hi"); // O(1) for ArrayList and LinkedList (add at end)
128 | String s = list.get(0); // O(1) for ArrayList, O(n) for LinkedList
129 | list.remove(0); // O(1) for LinkedList (front), O(n) for ArrayList
130 | assertEquals("hi", s);
131 | }
132 |
133 | @Test
134 | public void maps() {
135 | Map map = new HashMap(); // hash table
136 | map = new TreeMap(); // change impl: sorted keys, tree
137 | assertTrue(map instanceof Map);
138 | assertTrue(map instanceof TreeMap);
139 | assertTrue(map instanceof SortedMap); // additional interface
140 | map.put("hi", 5); // O(1) for HashMap, O(log n) for TreeMap
141 | int i = map.get("hi"); // O(1) for HashMap, O(log n) for TreeMap
142 | assertEquals(5, i);
143 | }
144 |
145 | @Test
146 | public void algorithms() {
147 | /* Generic methods for working with collections, e.g. sorting and searching: */
148 | List vals = Arrays.asList(91, 23, 88, 93, 20, 37);
149 | Collections.sort(vals); // merge sort, O(n log n)
150 | assertEquals(Arrays.asList(20, 23, 37, 88, 91, 93), vals);
151 | assertEquals(2, Collections.binarySearch(vals, 37)); // binsearch, O(log n)
152 | assertEquals(5, Collections.binarySearch(vals, 93));
153 | }
154 |
155 | @Test
156 | public void wrappers() {
157 | /* We can convert collections by passing them to the constructor: */
158 | List list = Arrays.asList("one", "one", "two", "two");
159 | assertEquals(4, list.size());
160 | /* Remove duplicates by wrapping the list in a set: */
161 | Set set = new HashSet(list);
162 | assertEquals(2, set.size());
163 | }
164 | }
--------------------------------------------------------------------------------
/src/spinfo/Crawling.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static org.junit.Assert.assertTrue;
6 |
7 | import java.io.IOException;
8 | import java.net.MalformedURLException;
9 | import java.net.URL;
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 | import java.util.Collections;
13 | import java.util.HashSet;
14 | import java.util.List;
15 | import java.util.Set;
16 | import java.util.concurrent.ExecutorService;
17 | import java.util.concurrent.Executors;
18 | import java.util.concurrent.TimeUnit;
19 |
20 | import org.cyberneko.html.parsers.DOMParser;
21 | import org.junit.Test;
22 | import org.w3c.dom.Node;
23 | import org.xml.sax.SAXException;
24 |
25 | import com.google.common.base.Charsets;
26 | import com.google.common.io.Resources;
27 |
28 | /** Basic web crawling, HTML processing, and concurrency. */
29 | public class Crawling {
30 |
31 | /** Test simple content loading via URL. */
32 | @Test
33 | public void load() throws IOException {
34 | /*
35 | * Simple loading of content from a URL, using Guava (passing the charset
36 | * specified by the site), but the result is not very useful as it is (e.g.
37 | * for indexing):
38 | */
39 | System.out.println(Resources.toString(new URL("http://www.zeit.de/"),
40 | Charsets.UTF_8));
41 | }
42 |
43 | /** Test web site parsing. */
44 | @Test
45 | public void parse() throws SAXException, IOException {
46 | /* What we need is a structured processing of content and links: */
47 | WebDocument doc = Parser.parse("http://www.zeit.de/");
48 | String text = doc.text;
49 | Set links = doc.links;
50 | assertTrue("Document content should exist", text.length() > 0);
51 | assertTrue("Outgoing links should exist", links.size() > 0);
52 | System.out.println("Text: " + text);
53 | System.out.println("Links: " + links);
54 | }
55 |
56 | /** A web document representation consisting of text and links. */
57 | static class WebDocument {
58 | String text;
59 | Set<String> links; // link targets, as strings
60 | URL url;
61 |
62 | WebDocument(String url, String text, Set<String> links)
63 | throws MalformedURLException {
64 | this.text = text;
65 | this.links = links;
66 | this.url = new URL(url); // rejects strings that are no valid URL
67 | }
68 | }
69 |
70 | /** A parser that transforms a URL into a web document representation. */
71 | static class Parser {
72 | private static Set links;
73 | private static StringBuilder builder;
74 |
75 | static WebDocument parse(String url) throws SAXException, IOException {
76 | /* We parse with NekoHTML, an error-correcting parser based on Xerces: */
77 | DOMParser parser = new DOMParser();
78 | parser.parse(url);
79 | builder = new StringBuilder();
80 | links = new HashSet();
81 | /* We start at the first element: */
82 | process(parser.getDocument().getFirstChild());
83 | /* At the end we create our resulting document object: */
84 | return new WebDocument(url, builder.toString().trim(), links);
85 | }
86 |
87 | private static void process(Node node) throws MalformedURLException {
88 | /*
89 | * We get elements by their names. We could use instanceof, and e.g. test
90 | * if something is a HTMLParagraphElement, but this is less robust, since
91 | * e.g. XHTML documents are made of elements in a different namespace.
92 | */
93 | String elementName = node.getNodeName().toLowerCase().trim();
94 | /* We treat as content here only text within a p-tag: */
95 | if (elementName.equals("p")) {
96 | String text = node.getTextContent().trim();
97 | if (text.length() > 0) {
98 | builder.append(text).append("\n\n"); // make it a paragraph
99 | }
100 | } else if (elementName.equals("a")) {
101 | if (node.hasAttributes()) {
102 | /* If the a-tag has a href attribute with http, add it to the links: */
103 | Node href = node.getAttributes().getNamedItem("href");
104 | if (href != null && href.getNodeValue().trim().startsWith("http://")) {
105 | links.add(href.getNodeValue().trim());
106 | }
107 | }
108 | }
109 | /* Done with current node, recurse on same level (if there is more): */
110 | Node sibling = node.getNextSibling();
111 | if (sibling != null) {
112 | process(sibling);
113 | }
114 | /* Done with current level, recurse to next level (if there is more): */
115 | Node child = node.getFirstChild();
116 | if (child != null) {
117 | process(child);
118 | }
119 | }
120 | }
121 |
122 | /** Test the actual crawling, quick sample. */
123 | @Test
124 | public void crawl() throws InterruptedException {
125 | /* Now that we have a way to process a single web site, we can crawl: */
126 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/",
127 | "http://www.zeit.de");
128 | /* Process the seed only: */
129 | assertTrue(Crawler.crawl(seed, 0).size() == seed.size());
130 | }
131 |
132 | /** Test the actual crawling, long-running sample. */
133 | // @Test // (long-running task, comment in to run)
134 | public void crawlMore() throws InterruptedException {
135 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/",
136 | "http://www.zeit.de");
137 | /* Process seed and one level down: */
138 | int linksPerSite = 5; // estimation: > 5 links / site
139 | assertTrue(Crawler.crawl(seed, 1).size() > seed.size() * linksPerSite);
140 | }
141 |
142 | /** A simple crawler that processes the seed concurrently. */
143 | static class Crawler {
144 | public static List<WebDocument> crawl(List<String> seed, int depth)
145 | throws InterruptedException {
146 | /*
147 | * The result of crawling will be a list of web documents. To avoid
148 | * concurrent modification of the list, we use a synchronized wrapper:
149 | */
150 | List<WebDocument> result = Collections
151 | .synchronizedList(new ArrayList<WebDocument>());
152 | /*
153 | * We separate the unit of work (a Runnable) and the concurrent execution
154 | * (ExecutorService), cf. Effective Java, Second Edition, Chapter 10:
155 | */
156 | ExecutorService exec = Executors.newCachedThreadPool(); // newFixedThreadPool(1);
157 | for (String url : seed) {
158 | /* For every seed URL we create and execute a runnable: */
159 | exec.execute(new CrawlerRunnable(result, url, depth));
160 | }
161 | /* We passed all work to be done: */
162 | exec.shutdown();
163 | /* Now running in the background - we don't want to go on, but wait: */
164 | boolean done = exec.awaitTermination(5, TimeUnit.HOURS);
165 | /* Print some info on the result (done is false if we hit the timeout): */
166 | System.out.printf("Crawled %s docs, in time: %s\n", result.size(), done);
167 | return result;
168 | }
169 | }
170 |
171 | /** A crawler runnable that crawls from a given starting point. */
172 | static class CrawlerRunnable implements Runnable {
173 | private final int depth;
174 | private final String url;
175 | private final List<WebDocument> result;
176 |
177 | public CrawlerRunnable(List<WebDocument> result, String url, int depth) {
178 | this.result = result;
179 | this.url = url;
180 | this.depth = depth;
181 | }
182 |
183 | @Override
184 | /* Top-level entry point (called by the executor service): */
185 | public void run() {
186 | try {
187 | crawl(url, 0); // start crawling, and catch all that can go wrong here
188 | } catch (InterruptedException e) {
189 | Thread.currentThread().interrupt(); // stop, but keep the interrupt visible
190 | } catch (SAXException e) {
191 | e.printStackTrace();
192 | }
193 | }
194 |
195 | /*
196 | * The recursive crawling method: parse current page, add result, and if
197 | * below the depth limit, call itself with the outgoing links of the page.
198 | */
199 | private void crawl(final String url, final int current)
200 | throws InterruptedException, SAXException {
201 | WebDocument doc = null;
202 | try {
203 | doc = Parser.parse(url);
204 | } catch (IOException e) {
205 | System.out.println("Crawl error: " + e.getMessage());
206 | }
207 | if (doc != null) {
208 | result.add(doc);
209 | System.out.println("Crawled: " + url);
210 | Thread.sleep(300); // delay for politeness (no server request flood)
211 | if (current < depth) {
212 | for (String link : doc.links) {
213 | crawl(link, current + 1);
214 | }
215 | }
216 | }
217 | }
218 |
219 | }
220 | }
221 |
--------------------------------------------------------------------------------
/src/spinfo/EditDistance.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static org.junit.Assert.assertEquals;
6 |
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
10 | import org.junit.Test;
11 |
12 | /** Edit distance with recursion, memoization, and dynamic programming. */
13 | public class EditDistance {
14 |
15 | /** Test the correctness of the different implementations. */
16 |
17 | @Test
18 | public void correctness() {
19 | runResultTest(new RecursiveEditDistance());
20 | runResultTest(new MemoizedEditDistance());
21 | runResultTest(new DynamicProgrammingEditDistance());
22 | }
23 |
24 | /** Test the performance of the different implementations. */
25 |
26 | @Test
27 | public void performance() {
28 | runPerformanceTest(new DynamicProgrammingEditDistance());
29 | runPerformanceTest(new MemoizedEditDistance());
30 | // runPerformanceTest(new RecursiveEditDistance()); /* long-running */
31 | }
32 |
33 | /** Edit distance interface: number of operations to change s1 into s2. */
34 |
35 | interface Edit {
36 | int distance(String s1, String s2);
37 | }
38 |
39 | /** Plain recursive implementation, without any caching of sub-results. */
40 |
41 | static class RecursiveEditDistance implements Edit {
42 | private String s1;
43 | private String s2;
44 |
45 | @Override
46 | public int distance(final String s1, final String s2) {
47 | this.s1 = s1;
48 | this.s2 = s2;
49 | /* The full problem is D(i,j) with i = |S1| and j = |S2|: */
50 | final int i = s1.length();
51 | final int j = s2.length();
52 | return distance(i, j);
53 | }
54 |
55 | /* Edit distance between the first i chars of s1 and the first j chars of s2 */
56 | protected int distance(final int i, final int j) {
57 | /* Uncomment to see redundant sub-solution computation: */
58 | // System.out.println(String.format("Checking pair: %s, %s", i, j));
59 | /* Base condition: against an empty prefix, d(0,j) is j and d(i,0) is i */
60 | if (i == 0 || j == 0) {
61 | return i == 0 ? j : i;
62 | }
63 | /* Recurrence relation: equal last chars cost nothing */
64 | final boolean sameLastChar = s1.charAt(i - 1) == s2.charAt(j - 1);
65 | if (sameLastChar) {
66 | return distance(i - 1, j - 1);
67 | }
68 | /*
69 | * Otherwise one edit plus the cheapest of three recursive descents
70 | * (delete, insert, replace), i.e. for x edits exp. runtime: O(3^x)
71 | */
72 | int cheapest = distance(i - 1, j);
73 | cheapest = Math.min(cheapest, distance(i, j - 1));
74 | cheapest = Math.min(cheapest, distance(i - 1, j - 1));
75 | return cheapest + 1;
76 | }
77 | }
77 |
78 | /** Implementation based on memoized recursion. */
79 |
80 | static class MemoizedEditDistance extends RecursiveEditDistance {
81 | private final Map<String, Integer> map = new HashMap<String, Integer>();
82 |
83 | @Override
84 | public int distance(final String s1, final String s2) {
85 | map.clear(); // forget memoized solution for new pair of strings
86 | return super.distance(s1, s2);
87 | }
88 |
89 | @Override
90 | protected int distance(final int i, final int j) {
91 | String pair = i + ", " + j;
92 | /* Only if we have not seen the pair before, we delegate to superclass
93 | * (its recursive calls dispatch back to this method, so every
94 | * sub-solution is memoized as well): */
95 | if (!map.containsKey(pair)) {
96 | map.put(pair, super.distance(i, j));
97 | }
98 | return map.get(pair); // return the memoized sub-solution
99 | }
100 | }
101 |
102 | /** Implementation based on dynamic programming. */
103 |
104 | static class DynamicProgrammingEditDistance implements Edit {
105 | @Override
106 | public int distance(final String s1, final String s2) {
107 | /* We fill each table cell once, i.e. quadratic runtime: O((i + 1) * (j + 1)) */
108 | int[][] table = new int[s1.length() + 1][s2.length() + 1];
109 | for (int i = 0; i < table.length; i++) {
110 | for (int j = 0; j < table[i].length; j++) {
111 | /* "Base Condition": d(0,j) is j and d(i,0) is i */
112 | if (i == 0) {
113 | table[i][j] = j;
114 | } else if (j == 0) {
115 | table[i][j] = i;
116 | } else {
117 | int del = table[i - 1][j] + 1;
118 | int ins = table[i][j - 1] + 1;
119 | int rep = table[i - 1][j - 1]
120 | + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1);
121 | table[i][j] = Math.min(del, Math.min(ins, rep));
122 | }
123 | }
124 | }
125 | /*
126 | * After having started "bottom" at 0,0, at the end we are "up" (at the
127 | * position indicating the distance of the full strings, at the lower
128 | * right corner of the table) and have our result: D(i, j):
129 | */
130 | return table[s1.length()][s2.length()];
131 | }
132 | }
133 |
134 | private void runResultTest(final Edit distance) {
135 | assertEquals(2, distance.distance("ehe", "reh"));
136 | assertEquals(2, distance.distance("eber", "leder"));
137 | assertEquals(0, distance.distance("ehe", "ehe"));
138 | assertEquals(0, distance.distance("", ""));
139 | assertEquals(1, distance.distance("ehe", "eher"));
140 | assertEquals(2, distance.distance("he", ""));
141 | assertEquals(2, distance.distance("", "he"));
142 | assertEquals(0, distance.distance("rechtschaffen", "rechtschaffen"));
143 | }
144 |
145 | private void runPerformanceTest(final Edit distance) {
146 | System.out.print("Running performance test for: "
147 | + distance.getClass().getSimpleName() + "...");
148 | long start = System.currentTimeMillis();
149 | for (int i = 0; i < 50; i++) {
150 | distance.distance("nacktschnecke", "rechtschaffen");
151 | }
152 | System.out.println(String.format(" %s ms.", System.currentTimeMillis()
153 | - start)); // typical result: 3, 200, 60000 ms. for rec., memo.,
154 | // dp
155 | }
156 |
157 | }
--------------------------------------------------------------------------------
/src/spinfo/HashTables.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import junit.framework.Assert;
6 |
7 | import org.junit.Test;
8 |
9 | /** Hash Tables: a useful, efficient data structure. */
10 | public class HashTables {
11 |
12 | /* The basic idea: a direct-address table */
13 |
14 | @Test
15 | public void direct() {
16 | DirectAddressTable t = new DirectAddressTable();
17 | Person tom = new Person("tom");
18 | Person jim = new Person("jim");
19 | t.put(50, tom); // e.g. student ID = 50
20 | t.put(75, jim); // e.g. student ID = 75
21 | Assert.assertEquals(tom, t.get(50));
22 | Assert.assertEquals(jim, t.get(75));
23 | }
24 |
25 | static class DirectAddressTable {
26 | Object[] table = new Object[100]; // lots of space wasted
27 |
28 | public void put(int key, Object person) { // only numeric key supported
29 | table[key] = person;
30 | }
31 |
32 | public Object get(int key) {
33 | return table[key];
34 | }
35 | }
36 |
37 | static class Person {
38 |
39 | private String name;
40 |
41 | public Person(String name) {
42 | this.name = name;
43 | }
44 |
45 | @Override
46 | public String toString() {
47 | return name;
48 | }
49 | }
50 |
51 | /* A simple hash table using chaining for collisions: */
52 |
53 | @Test
54 | public void hashed() {
55 | HashTable t = new HashTable();
56 | Person tom = new Person("tom");
57 | Person jim = new Person("jim");
58 | Person joe = new Person("joe");
59 | t.put(50, tom); // e.g. student ID = 50
60 | t.put(75, jim); // e.g. student ID = 75
61 | t.put(85, joe); // e.g. student ID = 85, hashes to same as 75 here
62 | Assert.assertEquals(tom, t.get(50));
63 | Assert.assertEquals(jim, t.get(75));
64 | Assert.assertEquals(joe, t.get(85));
65 | }
66 |
67 | static class HashTable {
68 |
69 | Element[] table = new Element[10]; // scale down
70 |
71 | static class Element {
72 | Element next;
73 | Object key; // key can be of any type
74 | Object value;
75 |
76 | public Element(Object key, Object value) {
77 | this.key = key;
78 | this.value = value;
79 | }
80 | }
81 |
82 | public void put(Object key, Object value) {
83 | Element newElement = new Element(key, value);
84 | int slot = hash(key);
85 | Element e = table[slot];
86 | table[slot] = newElement; // place new element in table
87 | /* Handle previous element in the slot with a different key: */
88 | if (e != null && !e.key.equals(newElement.key)) {
89 | newElement.next = e; // add new in front
90 | }
91 | }
92 |
93 | public Object get(Object key) {
94 | Element e = table[hash(key)];
95 | if (e == null) // no value in slot
96 | return null;
97 | /* Find element with correct key in list: */
98 | while (!(e.key.equals(key)) && e.next != null) {
99 | e = e.next;
100 | }
101 | return e.key.equals(key) ? e.value : null;
102 | }
103 |
104 | private int hash(Object key) {
105 | // simple demo hash: map key to table length
106 | if (key instanceof Integer) {
107 | return ((Integer) key) % table.length;
108 | }
109 | if (key instanceof String) {
110 | return ((String) key).length() % table.length;
111 | }
112 | return key.hashCode() % table.length;
113 | }
114 | }
115 |
116 | /* Hashing in practice: equality for custom objects */
117 |
118 | @Test
119 | public void equality() {
120 | Student s1 = new Student(5, "John", "Doe");
121 | Student s2 = new Student(8, "Jim", "Jones");
122 | Student s3 = new Student(8, "Jim", "Jones");
123 | Student s4 = new Student(5, "John", "Doe");
124 | /* hashCode has to be implemented consistent with equals: */
125 | Assert.assertEquals(s1, s4);
126 | Assert.assertEquals(s2, s3);
127 | Assert.assertEquals(s1.hashCode(), s4.hashCode());
128 | Assert.assertEquals(s2.hashCode(), s3.hashCode());
129 | Assert.assertFalse(s1.equals(s2));
130 | Assert.assertFalse(s1.hashCode() == s2.hashCode());
131 | }
132 |
133 | static class Student {
134 | int id;
135 | String first;
136 | String last;
137 |
138 | public Student(int id, String first, String last) {
139 | this.id = id;
140 | this.first = first;
141 | this.last = last;
142 | }
143 |
144 | @Override
145 | public String toString() {
146 | return String.format("%s %s (%s)", first, last, id);
147 | }
148 |
149 | @Override
150 | public int hashCode() { // use same values as in equals
151 | int result = 17;
152 | result = 31 * result + id;
153 | result = 31 * result + first.hashCode();
154 | result = 31 * result + last.hashCode();
155 | return result;
156 | }
157 |
158 | @Override
159 | public boolean equals(Object that) { // use same values as in hashCode
160 | return (that instanceof Student) && ((Student) that).id == this.id
161 | && ((Student) that).first.equals(this.first)
162 | && ((Student) that).last.equals(this.last);
163 | }
164 | }
165 |
166 | /* Hash table sample usage: counting words */
167 |
168 | @Test
169 | public void usage() {
170 | String text = "hi there hi everybody hi there again";
171 | HashTable t = count(text);
172 | Assert.assertEquals(3, t.get("hi"));
173 | Assert.assertEquals(2, t.get("there"));
174 | Assert.assertEquals(1, t.get("everybody"));
175 | }
176 |
177 | private HashTable count(String text) {
178 | HashTable t = new HashTable();
179 | String[] words = text.split(" ");
180 | for (String w : words) {
181 | Integer v = (Integer) t.get(w);
182 | if (v == null) // first occurrence
183 | v = 0;
184 | t.put(w, v + 1); // count up
185 | }
186 | return t;
187 | }
188 | }
--------------------------------------------------------------------------------
/src/spinfo/Index.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static java.util.Arrays.asList;
6 | import static org.junit.Assert.assertEquals;
7 | import static org.junit.Assert.assertTrue;
8 |
9 | import java.io.File;
10 | import java.io.FileNotFoundException;
11 | import java.io.IOException;
12 | import java.net.MalformedURLException;
13 | import java.util.ArrayList;
14 | import java.util.Arrays;
15 | import java.util.Collections;
16 | import java.util.Comparator;
17 | import java.util.HashMap;
18 | import java.util.Iterator;
19 | import java.util.List;
20 | import java.util.Map;
21 | import java.util.Scanner;
22 | import java.util.Set;
23 | import java.util.SortedSet;
24 | import java.util.TreeSet;
25 | import java.util.regex.Matcher;
26 | import java.util.regex.Pattern;
27 |
28 | import org.junit.Assert;
29 | import org.junit.Test;
30 |
31 | /**
32 | * Basic indexing and preprocessing with regular expressions. Requires file
33 | * "pg100.txt", The Complete Works of William Shakespeare
34 | * (http://www.gutenberg.org/ebooks/100.txt.utf8)
35 | */
36 | public class Index {
37 |
38 | /* Before indexing, we need to determine what elements to index. */
39 |
40 | private static final Preprocessor PREPROCESSOR = new Preprocessor();
41 |
42 | @Test
43 | public void tokenization() {
44 | assertEquals(asList("hello", "world"), process("hello, world!"));
45 | assertEquals(asList("123", "test"), process("test 123, 123 test, test"));
46 | assertEquals(asList("0221-123123", "test"), process("0221-123123, test"));
47 | assertEquals(asList("123", "köln", "test"), process("test - köln - 123"));
48 | }
49 |
50 | private List process(String string) {
51 | // some wrapping for the tests (compare with sorted list)
52 | return new ArrayList(new TreeSet(
53 | PREPROCESSOR.tokenize(string)));
54 | }
55 |
56 | @Test
57 | public void patterns() {
58 | assertTrue("0221-470".matches(SpecialCase.COMPOUND.regex));
59 | assertTrue(!"Meine Nummer: 0221-470.".matches(SpecialCase.COMPOUND.regex));
60 | assertTrue(!"4711".matches(SpecialCase.COMPOUND.regex));
61 | assertTrue(!"Daimler-Benz".matches(SpecialCase.COMPOUND.regex));
62 | assertTrue("8.04".matches(SpecialCase.COMPOUND.regex));
63 | assertTrue("15:10".matches(SpecialCase.COMPOUND.regex));
64 | assertTrue("3,50".matches(SpecialCase.COMPOUND.regex));
65 | assertTrue("fabian.steeg@uni-koeln.de".matches(SpecialCase.EMAIL.regex));
66 | assertTrue("fsteeg@spinfo.uni-koeln.de".matches(SpecialCase.EMAIL.regex));
67 | assertTrue(!"fabian@home".matches(SpecialCase.EMAIL.regex));
68 | }
69 |
70 | /*
71 | * Available patterns for extraction. Uses enum instead of constants to
72 | * iterate over all patterns in constructor of Preprocessor.
73 | */
74 | enum SpecialCase {
75 | /* Phone (0221-4701751), versions (8.04), money (3,50) and time (15:15) */
76 | COMPOUND("\\d+[-.,:]\\d+"),
77 | /* Simple numbers */
78 | NUMBER("\\d+"),
79 | /* Some simple email adresses */
80 | EMAIL("[^@\\s]+@.+?\\.(de|com|eu|org|net)");
81 |
82 | String regex;
83 |
84 | SpecialCase(final String regularExpression) {
85 | this.regex = regularExpression;
86 | }
87 | }
88 |
89 | /**
90 | * A preprocessor based on regular expressions: first extracts custom
91 | * patterns, then splits on a given delimiter.
92 | */
93 | static class Preprocessor {
94 | /* Unicode-aware "non-letter" delimiter, ASCII version is \\W */
95 | private static final String UNICODE_AWARE_DELIMITER = "[^\\p{L}]";
96 | private List specialCases = new ArrayList();
97 | private String delimiter;
98 |
99 | public Preprocessor() {
100 | delimiter = UNICODE_AWARE_DELIMITER;
101 | for (SpecialCase p : SpecialCase.values()) {
102 | specialCases.add(p);
103 | }
104 | }
105 |
106 | public List tokenize(final String input) {
107 | String text = input.toLowerCase();
108 | List result = new ArrayList();
109 | text = extractSpecialCases(text, result);
110 | tokenizeStandard(text, result);
111 | return result;
112 | }
113 |
114 | private String extractSpecialCases(String text, List result) {
115 | for (SpecialCase p : specialCases) {
116 | Pattern pattern = Pattern.compile(p.regex);
117 | Matcher matcher = pattern.matcher(text);
118 | while (matcher.find()) {
119 | String group = matcher.group();
120 | result.add(group); // add special case
121 | text = text.replace(group, ""); // don't treat group as regex
122 | }
123 | }
124 | return text;
125 | }
126 |
127 | private void tokenizeStandard(String text, List result) {
128 | List list = Arrays.asList(text.split(delimiter));
129 | for (String s : list)
130 | if (s.trim().length() > 0) // filter empty strings
131 | result.add(s.trim());
132 | }
133 |
134 | }
135 |
136 | /* Once we can preprocess our corpus, we can build an index and search it: */
137 |
138 | private static final InvertedIndex INDEX = buildIndex();
139 |
140 | /** Test searching the corpus for a single term. */
141 | @Test
142 | public final void testSearch() throws MalformedURLException, IOException {
143 | long start = System.currentTimeMillis();
144 | String query = "Brutus";
145 | Set list = INDEX.search(query);
146 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list,
147 | (System.currentTimeMillis() - start));
148 | Assert.assertTrue("Search should find a single term", list.size() > 0);
149 | }
150 |
151 | /** Test searching the corpus for multiple search terms. */
152 | @Test
153 | public final void testMulti() throws MalformedURLException, IOException {
154 | long start = System.currentTimeMillis();
155 | String query = "Brutus Caesar"; // = Brutus AND Caesar
156 | Set list = INDEX.search(query);
157 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list,
158 | (System.currentTimeMillis() - start));
159 | Assert.assertTrue("Search should find multiple terms", list.size() > 0);
160 | }
161 |
162 | static class InvertedIndex {
163 |
164 | private Map> index = new HashMap>();
165 |
166 | public InvertedIndex(final List corpus) {
167 | index = index(corpus);
168 | }
169 |
170 | private Map> index(final List works) {
171 | Map> index = new HashMap>();
172 | // for each document, and each of its token, add it to the index
173 | for (int i = 0; i < works.size(); i++) {
174 | List tokens = PREPROCESSOR.tokenize(works.get(i));
175 | for (String token : tokens) {
176 | SortedSet postings = index.get(token);
177 | if (postings == null) { // first time
178 | postings = new TreeSet();
179 | index.put(token, postings);
180 | }
181 | postings.add(i); // document i contains token
182 | }
183 | }
184 | return index;
185 | }
186 |
187 | public Set search(final String query) {
188 | /* We treat all entries as AND-linked... */
189 | List queries = PREPROCESSOR.tokenize(query);
190 | /* We get the results for each query term: */
191 | List> allPostings = new ArrayList>();
192 | for (String q : queries) {
193 | SortedSet postings = index.get(q);
194 | if (postings != null)
195 | allPostings.add(postings);
196 | }
197 | /* For efficient intersection computation: sort lists by length */
198 | sortByLength(allPostings);
199 | /* Intersection of postings for all query terms is our result: */
200 | return intersectionOf(allPostings);
201 | }
202 |
203 | private void sortByLength(List> all) {
204 | Collections.sort(all, new Comparator>() {
205 | public int compare(final SortedSet o1,
206 | final SortedSet o2) {
207 | return Integer.valueOf(o1.size()).compareTo(o2.size());
208 | }
209 | });
210 | }
211 |
212 | private Set intersectionOf(List> all) {
213 | /* The result set is the intersection of the first list with all others: */
214 | SortedSet result = all.get(0);
215 | for (SortedSet set : all.subList(1, all.size())) {
216 | result = intersection(result.iterator(), set.iterator());
217 | }
218 | return result;
219 | }
220 |
221 | }
222 |
223 | /* Implementation and tests for the intersection algorithm: */
224 |
225 | @Test
226 | public void intersection() {
227 | /* Test intersection computation for AND-queries: */
228 | TreeSet PL1 = new TreeSet(Arrays.asList(4, 3, 2, 1));
229 | TreeSet PL2 = new TreeSet(Arrays.asList(2, 4, 6, 8));
230 | Assert.assertEquals(Arrays.asList(2, 4), new ArrayList(
231 | intersection(PL1.iterator(), PL2.iterator())));
232 | }
233 |
234 | public static SortedSet intersection(final Iterator i1,
235 | final Iterator i2) {
236 | SortedSet result = new TreeSet();
237 | Integer p1 = next(i1);
238 | Integer p2 = next(i2);
239 | while (p1 != null && p2 != null) {
240 | if (p1.equals(p2)) {
241 | result.add(p1);
242 | p1 = next(i1);
243 | p2 = next(i2);
244 | } else if (p1 < p2)
245 | p1 = next(i1);
246 | else
247 | p2 = next(i2);
248 | }
249 | return result;
250 | }
251 |
252 | /* A little oddity to stay close to Manning et al. 2008, p. 11: */
253 | private static Integer next(final Iterator i1) {
254 | return i1.hasNext() ? i1.next() : null;
255 | }
256 |
257 | /* Utilities: load data, build index: */
258 |
259 | private static InvertedIndex buildIndex() {
260 | List corpus = corpus();
261 | long start = System.currentTimeMillis();
262 | System.out.printf("Building index for %s texts... ", corpus.size());
263 | InvertedIndex invertedIndex = new InvertedIndex(corpus);
264 | System.out
265 | .printf("done, took %s ms.\n", System.currentTimeMillis() - start);
266 | return invertedIndex;
267 | }
268 |
269 | private static List corpus() {
270 | try {
271 | Scanner s = new Scanner(new File("pg100.txt"), "UTF-8");
272 | StringBuilder builder = new StringBuilder();
273 | while (s.hasNextLine()) {
274 | builder.append(s.nextLine()).append("\n");
275 | }
276 | /* Each work is delimited by a line ending with a year: */
277 | return Arrays.asList(builder.toString().split("1[56][0-9]{2}\n"));
278 | } catch (FileNotFoundException e) {
279 | e.printStackTrace();
280 | }
281 | return Collections.emptyList();
282 | }
283 |
284 | }
--------------------------------------------------------------------------------
/src/spinfo/Lists.java:
--------------------------------------------------------------------------------
1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */
2 |
3 | package spinfo;
4 |
5 | import static org.junit.Assert.assertEquals;
6 |
7 | import java.util.Iterator;
8 | import java.util.NoSuchElementException;
9 |
10 | import org.junit.Before;
11 | import org.junit.Test;
12 |
13 | /** Lists: elementary data structures. */
14 | public class Lists {
15 |
16 | /** Low-level, non-OOP list implementation simulating a tuple/record/struct. */
17 | @Test
18 | public void tuple() {
19 | /* Independent nodes: */
20 | Object[] first = new Object[2];
21 | Object[] second = new Object[2];
22 | Object[] third = new Object[2];
23 | /* Containing values: */
24 | first[0] = "first";
25 | second[0] = "second";
26 | third[0] = "third";
27 | /* Linked with pointers: */
28 | first[1] = second;
29 | second[1] = third;
30 | /* Can be traversed: */
31 | System.out.println("List traversal: ");
32 | Object[] current = first;
33 | while (current != null) {
34 | System.out.println(current[0]);
35 | current = (Object[]) current[1];
36 | }
37 | }
38 |
39 | /** OOP implementation of a queue, a FIFO list (first in, first out). */
40 | @Test
41 | public void queue() {
42 | Queue queue = new Queue();
43 | /* Enqueue at end: */
44 | queue.enqueue("first");
45 | queue.enqueue("second");
46 | queue.enqueue("third");
47 | /* Iterate: */
48 | System.out.println("Queue traversal: ");
49 | Node current = queue.first;
50 | while (current != null) {
51 | System.out.println(current.value);
52 | current = current.next;
53 | }
54 | /* Dequeue from front: */
55 | assertEquals("first", queue.dequeue());
56 | assertEquals("second", queue.dequeue());
57 | assertEquals("third", queue.dequeue());
58 | assertEquals(null, queue.dequeue());
59 | }
60 |
61 | /** A list element: wraps a value and a reference to the next element. */
62 | static class Node {
63 | Object value;
64 | Node next;
65 |
66 | Node(Object value) {
67 | this.value = value;
68 | }
69 | }
70 |
71 | /** The queue class enforces the restricted FIFO access. */
72 | static class Queue /**/implements Iterable