├── .classpath ├── .gitignore ├── .project ├── .settings └── org.eclipse.jdt.ui.prefs ├── README.textile ├── build.xml ├── lib ├── guava-r09.jar ├── guava-src-r09.zip ├── junit-4.8.2-src.jar ├── junit-4.8.2.jar ├── nekohtml.jar └── xercesImpl.jar ├── pg100.txt └── src └── spinfo ├── Collation.java ├── CollectionsGenerics.java ├── Crawling.java ├── EditDistance.java ├── HashTables.java ├── Index.java ├── Lists.java ├── Quicksort.java ├── SortSearch.java ├── TestSuite.java ├── Trees.java └── package-info.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | build -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | java 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | #Mon May 23 17:35:24 CEST 2011 2 | eclipse.preferences.version=1 3 | editor_save_participant_org.eclipse.jdt.ui.postsavelistener.cleanup=true 4 | formatter_settings_version=12 5 | sp_cleanup.add_default_serial_version_id=true 6 | sp_cleanup.add_generated_serial_version_id=false 7 | sp_cleanup.add_missing_annotations=true 8 | sp_cleanup.add_missing_deprecated_annotations=true 9 | sp_cleanup.add_missing_methods=false 10 | sp_cleanup.add_missing_nls_tags=false 11 | sp_cleanup.add_missing_override_annotations=true 12 | sp_cleanup.add_missing_override_annotations_interface_methods=true 13 | 
sp_cleanup.add_serial_version_id=false 14 | sp_cleanup.always_use_blocks=true 15 | sp_cleanup.always_use_parentheses_in_expressions=false 16 | sp_cleanup.always_use_this_for_non_static_field_access=false 17 | sp_cleanup.always_use_this_for_non_static_method_access=false 18 | sp_cleanup.convert_to_enhanced_for_loop=false 19 | sp_cleanup.correct_indentation=false 20 | sp_cleanup.format_source_code=true 21 | sp_cleanup.format_source_code_changes_only=false 22 | sp_cleanup.make_local_variable_final=false 23 | sp_cleanup.make_parameters_final=false 24 | sp_cleanup.make_private_fields_final=true 25 | sp_cleanup.make_type_abstract_if_missing_method=false 26 | sp_cleanup.make_variable_declarations_final=true 27 | sp_cleanup.never_use_blocks=false 28 | sp_cleanup.never_use_parentheses_in_expressions=true 29 | sp_cleanup.on_save_use_additional_actions=false 30 | sp_cleanup.organize_imports=true 31 | sp_cleanup.qualify_static_field_accesses_with_declaring_class=false 32 | sp_cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true 33 | sp_cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true 34 | sp_cleanup.qualify_static_member_accesses_with_declaring_class=false 35 | sp_cleanup.qualify_static_method_accesses_with_declaring_class=false 36 | sp_cleanup.remove_private_constructors=true 37 | sp_cleanup.remove_trailing_whitespaces=false 38 | sp_cleanup.remove_trailing_whitespaces_all=true 39 | sp_cleanup.remove_trailing_whitespaces_ignore_empty=false 40 | sp_cleanup.remove_unnecessary_casts=true 41 | sp_cleanup.remove_unnecessary_nls_tags=false 42 | sp_cleanup.remove_unused_imports=false 43 | sp_cleanup.remove_unused_local_variables=false 44 | sp_cleanup.remove_unused_private_fields=true 45 | sp_cleanup.remove_unused_private_members=false 46 | sp_cleanup.remove_unused_private_methods=true 47 | sp_cleanup.remove_unused_private_types=true 48 | sp_cleanup.sort_members=false 49 | sp_cleanup.sort_members_all=false 50 | 
sp_cleanup.use_blocks=false 51 | sp_cleanup.use_blocks_only_for_return_and_throw=false 52 | sp_cleanup.use_parentheses_in_expressions=false 53 | sp_cleanup.use_this_for_non_static_field_access=false 54 | sp_cleanup.use_this_for_non_static_field_access_only_if_necessary=true 55 | sp_cleanup.use_this_for_non_static_method_access=false 56 | sp_cleanup.use_this_for_non_static_method_access_only_if_necessary=true 57 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | "Java at the University of Cologne, Department of Linguistics (Sprachliche Informationsverarbeitung)":http://spinfo.uni-koeln.de/spinfo-java.html -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /lib/guava-r09.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-r09.jar -------------------------------------------------------------------------------- /lib/guava-src-r09.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-src-r09.zip 
-------------------------------------------------------------------------------- /lib/junit-4.8.2-src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2-src.jar -------------------------------------------------------------------------------- /lib/junit-4.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2.jar -------------------------------------------------------------------------------- /lib/nekohtml.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/nekohtml.jar -------------------------------------------------------------------------------- /lib/xercesImpl.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/xercesImpl.jar -------------------------------------------------------------------------------- /src/spinfo/Collation.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | import static org.junit.Assert.assertFalse; 7 | 8 | import java.text.CollationKey; 9 | import java.text.Collator; 10 | import java.text.ParseException; 11 | import java.text.RuleBasedCollator; 12 | import java.util.Arrays; 13 | import java.util.Collections; 14 | import java.util.Comparator; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Locale; 18 | import java.util.Map; 19 | 20 | import org.junit.Test; 21 | 22 | /** Lexicographic sorting 
and collation in Java. */ 23 | public class Collation { 24 | 25 | List words = Arrays.asList("Very", "Über", "very", "ultra", "über"); 26 | 27 | @Test 28 | public void basicProblem() { 29 | /* 30 | * The basic problem: Java's default sorting for strings (based on the 31 | * character's Unicode position) is insufficient for lexicographic ordering: 32 | */ 33 | Collections.sort(words); 34 | /* 35 | * Because it sorts all upper case letters before all lowercase letters and 36 | * sorts all letters with diacritics behind all standard letters: 37 | */ 38 | assertEquals(Arrays.asList("Very", "ultra", "very", "Über", "über"), words); 39 | /* Which is not what we would expect (upper, lower, diacritics together): */ 40 | assertFalse(Arrays.asList("ultra", "Über", "über", "Very", "very").equals( 41 | words)); 42 | } 43 | 44 | @Test 45 | public void basicSolution() { 46 | /* Idea: map the chars to their correct position, and sort by that mapping: */ 47 | final Map collationKeys = new HashMap(); 48 | collationKeys.put('U', 1); // or lower-level, with array: char['U'] = 1; 49 | collationKeys.put('u', 2); 50 | collationKeys.put('\u00dc', 3); // Ü 51 | collationKeys.put('\u00fc', 4); // ü 52 | collationKeys.put('V', 5); 53 | collationKeys.put('v', 6); 54 | /* We pass a custom sorting strategy to the sort method: */ 55 | Collections.sort(words, new Comparator() { 56 | @Override 57 | public int compare(String s1, String s2) { 58 | /* For this sample, we only look at the first letter: */ 59 | Character c1 = s1.charAt(0); 60 | Character c2 = s2.charAt(0); 61 | /* We don't compare the chars, but their collation keys: */ 62 | return collationKeys.get(c1).compareTo(collationKeys.get(c2)); 63 | } 64 | }); 65 | /* For our specific case, this results in a somewhat correct order: */ 66 | assertEquals(Arrays.asList("ultra", "Über", "über", "Very", "very"), words); 67 | } 68 | 69 | @Test 70 | public void collator() { 71 | /* Java contains region-specific collation rules, via Collator: */ 72 | 
final Collator collator = Collator.getInstance(Locale.GERMAN); 73 | Collections.sort(words, new Comparator() { 74 | @Override 75 | public int compare(String s1, String s2) { 76 | return collator.compare(s1, s2); 77 | } 78 | }); 79 | /* Which gets the details right, e.g. sort umlauts like their standards: */ 80 | assertEquals(Arrays.asList("über", "Über", "ultra", "very", "Very"), words); 81 | } 82 | 83 | @Test 84 | public void comparable() { 85 | /* 86 | * If we control the objects sorted (unlike strings), and the sorting does 87 | * not depend on something external to the objects (unlike above, where we 88 | * sort chars by their keys), we can define the order inside our objects: 89 | */ 90 | List words = Arrays.asList(new Word("Very"), new Word("ultra"), 91 | new Word("über"), new Word("Super")); 92 | /* From the usage side, it now looks like the default sorting just works: */ 93 | Collections.sort(words); 94 | assertEquals(Arrays.asList(new Word("Super"), new Word("über"), new Word( 95 | "ultra"), new Word("Very")), words); 96 | } 97 | 98 | static class Word implements Comparable { 99 | 100 | private String val; 101 | private Collator collator = Collator.getInstance(); // uses system locale 102 | private CollationKey key; 103 | 104 | public Word(String val) { 105 | this.val = val; 106 | this.key = collator.getCollationKey(val); // precompute the key 107 | } 108 | 109 | @Override 110 | public int compareTo(Word that) { 111 | // return this.val.compareTo(that.val); // naive, not sufficient 112 | /* Instead of comparing the vals, we can pass them to the collator: */ 113 | // return collator.compare(this.val, that.val); // always computes keys 114 | /* To improve performance, we precompute the keys, and compare these: */ 115 | return this.key.compareTo(that.key); 116 | } 117 | 118 | /* Java standard method implementations below, needed for testing here: */ 119 | 120 | @Override 121 | public String toString() { 122 | return val; 123 | } 124 | 125 | @Override 126 | 
public boolean equals(Object that) { 127 | return that instanceof Word && ((Word) that).val.equals(this.val); 128 | } 129 | 130 | @Override 131 | public int hashCode() { 132 | return val.hashCode(); // mandatory if equals, consistent with equals 133 | } 134 | } 135 | 136 | @Test 137 | public void customRules() throws ParseException { 138 | List w = Arrays.asList("Löss", "Lee", "Luv", "Löß"); 139 | /* Default collator: ß after ss */ 140 | sortWithCollator(w, Collator.getInstance(Locale.GERMAN)); // default german 141 | assertEquals(Arrays.asList("Lee", "Löss", "Löß", "Luv"), w); 142 | /* Custom requirement: sort ß before ss (old German spelling rules) */ 143 | String defaultRules = ((RuleBasedCollator) RuleBasedCollator 144 | .getInstance(Locale.GERMAN)).getRules(); 145 | String customRules = "ß < ss"; // additional custom rule, replaces default 146 | final Collator collator = new RuleBasedCollator(defaultRules + customRules); 147 | sortWithCollator(w, collator); 148 | assertEquals(Arrays.asList("Lee", "Löß", "Löss", "Luv"), w); 149 | } 150 | 151 | private void sortWithCollator(List words, final Collator collator) { 152 | Collections.sort(words, new Comparator() { 153 | @Override 154 | public int compare(String s1, String s2) { 155 | return collator.compare(s1, s2); 156 | } 157 | }); 158 | } 159 | 160 | } -------------------------------------------------------------------------------- /src/spinfo/CollectionsGenerics.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | import static org.junit.Assert.assertTrue; 7 | 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.Collection; 11 | import java.util.Collections; 12 | import java.util.Deque; 13 | import java.util.HashMap; 14 | import java.util.HashSet; 15 | import java.util.LinkedList; 16 
| import java.util.List; 17 | import java.util.Map; 18 | import java.util.Set; 19 | import java.util.SortedMap; 20 | import java.util.SortedSet; 21 | import java.util.TreeMap; 22 | import java.util.TreeSet; 23 | 24 | import org.junit.Test; 25 | 26 | /** Generic data structures and algorithms: Java generics and collections. */ 27 | public class CollectionsGenerics { 28 | 29 | /* Generics */ 30 | 31 | @Test 32 | public void basicGenerics() { 33 | /* 34 | * A List can contain different types of elements. We specify the type as a 35 | * parameter for the List class, e.g. Integer: 36 | */ 37 | List ints = Arrays.asList(1, 2, 3, 4, 5); // auto-boxed 38 | assertTrue(ints.get(0) instanceof Integer); 39 | /* Or String: */ 40 | List strings = Arrays.asList("one", "two"); 41 | assertTrue(strings.get(0) instanceof String); 42 | /* Since List can be used with different types, it's called a generic class. */ 43 | } 44 | 45 | @Test 46 | public void genericMethods() { 47 | /* As method used above, but implemented below: */ 48 | List s = asList(new String[] { "one", "two" }); 49 | /* Optional here: explicit type parameter: */ 50 | s = CollectionsGenerics. asList(new String[] { "one", "two" }); 51 | assertEquals("one", s.get(0)); 52 | assertEquals("two", s.get(1)); 53 | } 54 | 55 | /* A generic method: has a type parameter T (inferred or explicit) */ 56 | private static List asList(T[] ts) { 57 | List result = new ArrayList(); // choose List impl. on creation 58 | for (T t : ts) 59 | result.add(t); 60 | return result; 61 | } 62 | 63 | @Test 64 | public void genericClasses() { 65 | /* Like for List above, we can also use generics on our classes: */ 66 | Tree tree = new Tree(); 67 | tree.root = new Node("value"); 68 | assertEquals("value", tree.root.value); 69 | assertTrue(tree.root.value instanceof String); 70 | } 71 | 72 | static class Tree { // T becomes concrete on creation, e.g. String, 73 | // Integer, etc. 
74 | Node root; 75 | } 76 | 77 | static class Node { 78 | T value; 79 | Node left; 80 | Node right; 81 | 82 | public Node(T value) { 83 | this.value = value; 84 | } 85 | } 86 | 87 | /* Collections */ 88 | 89 | @Test 90 | public void collections() { 91 | List list = new ArrayList(); // refer by interface 92 | Collection coll = list; // List is a Collection 93 | Iterable iter = coll; // Collection is Iterable 94 | assertTrue(iter instanceof Iterable); 95 | assertTrue(iter instanceof Collection); 96 | assertTrue(iter instanceof List); 97 | assertTrue(iter instanceof ArrayList); 98 | /* The Collection Interface defines 4 kinds of methods: */ 99 | coll.add("hi"); // 1. methods for adding elements (also addAll, ...) 100 | coll.remove("hi"); // 2. methods for removing elements (also removeAll, ...) 101 | coll.contains("hi"); // 3. methods for querying (also containsAll, ...) 102 | coll.toArray(new String[0]); // 4. methods for conversion (iterator, ...) 103 | } 104 | 105 | @Test 106 | public void sets() { 107 | Set set = new HashSet(); // no duplicates, no order 108 | set = new TreeSet(); // change impl: no duplicates, sorted, tree 109 | assertTrue(set instanceof Collection); 110 | assertTrue(set instanceof Set); 111 | assertTrue(set instanceof TreeSet); 112 | assertTrue(set instanceof SortedSet); // additional interface 113 | set.add("hi"); // O(1) for HashSet, O(log n) for TreeSet 114 | assertTrue(set.contains("hi")); // O(1) for HashSet, O(log n) for TreeSet 115 | set.add("hi"); // add existing value, should not be added 116 | assertEquals(1, set.size()); // no duplicates 117 | } 118 | 119 | @Test 120 | public void lists() { 121 | List list = new ArrayList(); // array-based impl. 122 | list = new LinkedList(); // linked list impl. 
123 | assertTrue(list instanceof Collection); 124 | assertTrue(list instanceof List); 125 | assertTrue(list instanceof LinkedList); 126 | assertTrue(list instanceof Deque); // additional interface 127 | list.add("hi"); // O(1) for ArrayList and LinkedList (add at end) 128 | String s = list.get(0); // O(1) for ArrayList, O(n) for LinkedList 129 | list.remove(0); // O(1) for LinkedList (front), O(n) for ArrayList 130 | assertEquals("hi", s); 131 | } 132 | 133 | @Test 134 | public void maps() { 135 | Map map = new HashMap(); // hash table 136 | map = new TreeMap(); // change impl: sorted keys, tree 137 | assertTrue(map instanceof Map); 138 | assertTrue(map instanceof TreeMap); 139 | assertTrue(map instanceof SortedMap); // additional interface 140 | map.put("hi", 5); // O(1) for HashMap, O(log n) for TreeMap 141 | int i = map.get("hi"); // O(1) for HashMap, O(log n) for TreeMap 142 | assertEquals(5, i); 143 | } 144 | 145 | @Test 146 | public void algorithms() { 147 | /* Generic methods for working with collections, e.g. 
sorting and searching: */ 148 | List vals = Arrays.asList(91, 23, 88, 93, 20, 37); 149 | Collections.sort(vals); // merge sort, O(n log n) 150 | assertEquals(Arrays.asList(20, 23, 37, 88, 91, 93), vals); 151 | assertEquals(2, Collections.binarySearch(vals, 37)); // binsearch, O(log n) 152 | assertEquals(5, Collections.binarySearch(vals, 93)); 153 | } 154 | 155 | @Test 156 | public void wrappers() { 157 | /* We can convert collections by passing them to the constructor: */ 158 | List list = Arrays.asList("one", "one", "two", "two"); 159 | assertEquals(4, list.size()); 160 | /* Remove duplicates by wrapping the list in a set: */ 161 | Set set = new HashSet(list); 162 | assertEquals(2, set.size()); 163 | } 164 | } -------------------------------------------------------------------------------- /src/spinfo/Crawling.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertTrue; 6 | 7 | import java.io.IOException; 8 | import java.net.MalformedURLException; 9 | import java.net.URL; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import java.util.HashSet; 14 | import java.util.List; 15 | import java.util.Set; 16 | import java.util.concurrent.ExecutorService; 17 | import java.util.concurrent.Executors; 18 | import java.util.concurrent.TimeUnit; 19 | 20 | import org.cyberneko.html.parsers.DOMParser; 21 | import org.junit.Test; 22 | import org.w3c.dom.Node; 23 | import org.xml.sax.SAXException; 24 | 25 | import com.google.common.base.Charsets; 26 | import com.google.common.io.Resources; 27 | 28 | /** Basic web crawling, HTML processing, and concurrency. */ 29 | public class Crawling { 30 | 31 | /** Test simple content loading via URL. 
*/ 32 | @Test 33 | public void load() throws IOException { 34 | /* 35 | * Simple loading of content from a URL, using Guava (passing the charset 36 | * specified by the site), but the result is not very useful as it is (e.g. 37 | * for indexing): 38 | */ 39 | System.out.println(Resources.toString(new URL("http://www.zeit.de/"), 40 | Charsets.UTF_8)); 41 | } 42 | 43 | /** Test web site parsing. */ 44 | @Test 45 | public void parse() throws SAXException, IOException { 46 | /* What we need is a structured processing of content and links: */ 47 | WebDocument doc = Parser.parse("http://www.zeit.de/"); 48 | String text = doc.text; 49 | Set links = doc.links; 50 | assertTrue("Document content should exist", text.length() > 0); 51 | assertTrue("Outgoing links should exist", links.size() > 0); 52 | System.out.println("Text: " + text); 53 | System.out.println("Links: " + links); 54 | } 55 | 56 | /** A web document representation consisting of text and links. */ 57 | static class WebDocument { 58 | String text; 59 | Set links; 60 | URL url; 61 | 62 | WebDocument(String url, String text, Set links) 63 | throws MalformedURLException { 64 | this.text = text; 65 | this.links = links; 66 | this.url = new URL(url); 67 | } 68 | } 69 | 70 | /** A parser that transforms a URL into a web document representation. 
*/ 71 | static class Parser { 72 | private static Set links; 73 | private static StringBuilder builder; 74 | 75 | static WebDocument parse(String url) throws SAXException, IOException { 76 | /* We parse with NekoHTML, an error-correcting parser based on Xerces: */ 77 | DOMParser parser = new DOMParser(); 78 | parser.parse(url); 79 | builder = new StringBuilder(); 80 | links = new HashSet(); 81 | /* We start at the first element: */ 82 | process(parser.getDocument().getFirstChild()); 83 | /* At the end we create our resulting document object: */ 84 | return new WebDocument(url, builder.toString().trim(), links); 85 | } 86 | 87 | private static void process(Node node) throws MalformedURLException { 88 | /* 89 | * We get elements by their names. We could use instanceof, and e.g. test 90 | * if something is a HTMLParagraphElement, but this is less robust, since 91 | * e.g. XHTML documents are made of elements in a different namespace. 92 | */ 93 | String elementName = node.getNodeName().toLowerCase().trim(); 94 | /* We treat as content here only text within a p-tag: */ 95 | if (elementName.equals("p")) { 96 | String text = node.getTextContent().trim(); 97 | if (text.length() > 0) { 98 | builder.append(text).append("\n\n"); // make it a paragraph 99 | } 100 | } else if (elementName.equals("a")) { 101 | if (node.hasAttributes()) { 102 | /* If the a-tag has a href attribute with http, add it to the links: */ 103 | Node href = node.getAttributes().getNamedItem("href"); 104 | if (href != null && href.getNodeValue().trim().startsWith("http://")) { 105 | links.add(href.getNodeValue().trim()); 106 | } 107 | } 108 | } 109 | /* Done with current node, recurse on same level (if there is more): */ 110 | Node sibling = node.getNextSibling(); 111 | if (sibling != null) { 112 | process(sibling); 113 | } 114 | /* Done with current level, recurse to next level (if there is more): */ 115 | Node child = node.getFirstChild(); 116 | if (child != null) { 117 | process(child); 118 | } 119 | 
} 120 | } 121 | 122 | /** Test the actual crawling, quick sample. */ 123 | @Test 124 | public void crawl() throws InterruptedException { 125 | /* Now that we have a way to process a single web site, we can crawl: */ 126 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/", 127 | "http://www.zeit.de"); 128 | /* Process the seed only: */ 129 | assertTrue(Crawler.crawl(seed, 0).size() == seed.size()); 130 | } 131 | 132 | /** Test the actual crawling, long-running sample. */ 133 | // @Test // (long-running task, uncomment to run) 134 | public void crawlMore() throws InterruptedException { 135 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/", 136 | "http://www.zeit.de"); 137 | /* Process seed and one level down: */ 138 | int linksPerSite = 5; // estimate: > 5 links / site 139 | assertTrue(Crawler.crawl(seed, 1).size() > seed.size() * linksPerSite); 140 | } 141 | 142 | /** A simple crawler that processes the seed concurrently. */ 143 | static class Crawler { 144 | public static List crawl(List seed, int depth) 145 | throws InterruptedException { 146 | /* 147 | * The result of crawling will be a list of web documents. To avoid 148 | * concurrent modification of the list, we use a synchronized wrapper: 149 | */ 150 | List result = Collections 151 | .synchronizedList(new ArrayList()); 152 | /* 153 | * We separate the unit of work (a Runnable) and the concurrent execution 154 | * (ExecutorService), cf. 
Effective Java, Second Edition, Chapter 10: 155 | */ 156 | ExecutorService exec = Executors.newCachedThreadPool(); // newFixedThreadPool(1); 157 | for (String url : seed) { 158 | /* For every seed URL we create and execute a runnable: */ 159 | exec.execute(new CrawlerRunnable(result, url, depth)); 160 | } 161 | /* We passed all work to be done: */ 162 | exec.shutdown(); 163 | /* Now running in the background - we don't want to go on, but wait: */ 164 | boolean done = exec.awaitTermination(5, TimeUnit.HOURS); 165 | /* Print some info on the result: */ 166 | System.out.printf("Crawled %s docs, in time: %s\n", result.size(), done); 167 | return result; 168 | } 169 | } 170 | 171 | /** A crawler runnable that crawls from a given starting point. */ 172 | static class CrawlerRunnable implements Runnable { 173 | private int depth; 174 | private String url; 175 | private List result; 176 | 177 | public CrawlerRunnable(List result, String url, int depth) { 178 | this.result = result; 179 | this.url = url; 180 | this.depth = depth; 181 | } 182 | 183 | @Override 184 | /* Top-level entry point (called by the executor service): */ 185 | public void run() { 186 | try { 187 | crawl(url, 0); // start crawling, and catch all that can go wrong here 188 | } catch (InterruptedException e) { 189 | e.printStackTrace(); 190 | } catch (SAXException e) { 191 | e.printStackTrace(); 192 | } 193 | } 194 | 195 | /* 196 | * The recursive crawling method: parse current page, add result, and if 197 | * below the depth limit, call itself with the outgoing links of the page. 
198 | */ 199 | private void crawl(final String url, final int current) 200 | throws InterruptedException, SAXException { 201 | WebDocument doc = null; 202 | try { 203 | doc = Parser.parse(url); 204 | } catch (IOException e) { 205 | System.out.println("Crawl error: " + e.getMessage()); 206 | } 207 | if (doc != null) { 208 | result.add(doc); 209 | System.out.println("Crawled: " + url); 210 | Thread.sleep(300); // delay for politeness (no server request flood) 211 | if (current < depth) { 212 | for (String link : doc.links) { 213 | crawl(link, current + 1); 214 | } 215 | } 216 | } 217 | } 218 | 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/spinfo/EditDistance.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | import org.junit.Test; 11 | 12 | /** Edit distance with recursion, memoization, and dynamic programming. */ 13 | public class EditDistance { 14 | 15 | /** Test the correctness of the different implementations. */ 16 | 17 | @Test 18 | public void correctness() { 19 | runResultTest(new RecursiveEditDistance()); 20 | runResultTest(new MemoizedEditDistance()); 21 | runResultTest(new DynamicProgrammingEditDistance()); 22 | } 23 | 24 | /** Test the performance of the different implementations. */ 25 | 26 | @Test 27 | public void performance() { 28 | runPerformanceTest(new DynamicProgrammingEditDistance()); 29 | runPerformanceTest(new MemoizedEditDistance()); 30 | // runPerformanceTest(new RecursiveEditDistance()); /* long-running */ 31 | } 32 | 33 | /** Edit distance interface: number of operations to change s1 into s2. 
*/ 34 | 35 | interface Edit { 36 | int distance(String s1, String s2); 37 | } 38 | 39 | /** Implementation based on simple recursion. */ 40 | 41 | static class RecursiveEditDistance implements Edit { 42 | private String s1; 43 | private String s2; 44 | 45 | @Override 46 | public int distance(final String s1, final String s2) { 47 | this.s1 = s1; 48 | this.s2 = s2; 49 | /* Overall problem: D(i,j) for i = |S1| and j = |S2|, i.e: */ 50 | return distance(s1.length(), s2.length()); 51 | } 52 | 53 | /* Distance of the first i chars in s1 to the first j chars in s2 */ 54 | protected int distance(final int i, final int j) { 55 | /* Uncomment to see redundant sub-solution computation: */ 56 | // System.out.println(String.format("Checking pair: %s, %s", i, j)); 57 | /* "Base Condition": d(0,j) is j and d(i,0) is i */ 58 | if (i == 0) { 59 | return j; 60 | } 61 | if (j == 0) { 62 | return i; 63 | } 64 | /* "Recurrence Relation" */ 65 | if (s1.charAt(i - 1) == s2.charAt(j - 1)) { 66 | return distance(i - 1, j - 1); 67 | } 68 | /* 69 | * For each edit x: three recursive descents, i.e. exp. runtime: O(3^x) 70 | */ 71 | int del = distance(i - 1, j) + 1; 72 | int ins = distance(i, j - 1) + 1; 73 | int rep = distance(i - 1, j - 1) + 1; 74 | return Math.min(del, Math.min(ins, rep)); 75 | } 76 | } 77 | 78 | /** Implementation based on memoized recursion. 
*/ 79 | 80 | static class MemoizedEditDistance extends RecursiveEditDistance { 81 | private Map map = new HashMap(); 82 | 83 | @Override 84 | public int distance(final String s1, final String s2) { 85 | map.clear(); // forget memoized solution for new pair of strings 86 | return super.distance(s1, s2); 87 | } 88 | 89 | @Override 90 | protected int distance(final int i, final int j) { 91 | String pair = i + ", " + j; 92 | /* 93 | * Only if we have not seen the pair before, we delegate to superclass: 94 | */ 95 | if (!map.containsKey(pair)) { 96 | map.put(pair, super.distance(i, j)); 97 | } 98 | return map.get(pair); // return the memoized sub-solution 99 | } 100 | } 101 | 102 | /** Implementation based on dynamic programming. */ 103 | 104 | static class DynamicProgrammingEditDistance implements Edit { 105 | @Override 106 | public int distance(final String s1, final String s2) { 107 | /* We fill each table cell once, i.e. quadratic runtime: O((i + 1) * (j + 1)) for i = |s1|, j = |s2| */ 108 | int[][] table = new int[s1.length() + 1][s2.length() + 1]; 109 | for (int i = 0; i < table.length; i++) { 110 | for (int j = 0; j < table[i].length; j++) { 111 | /* "Base Condition": d(0,j) is j and d(i,0) is i */ 112 | if (i == 0) { 113 | table[i][j] = j; 114 | } else if (j == 0) { 115 | table[i][j] = i; 116 | } else { 117 | int del = table[i - 1][j] + 1; 118 | int ins = table[i][j - 1] + 1; 119 | int rep = table[i - 1][j - 1] 120 | + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 
0 : 1); 121 | table[i][j] = Math.min(del, Math.min(ins, rep)); 122 | } 123 | } 124 | } 125 | /* 126 | * After having started "bottom" at 0,0, at the end we are "up" (at the 127 | * position indicating the distance of the full strings, at the lower 128 | * right corner of the table) and have our result: D(i, j): 129 | */ 130 | return table[s1.length()][s2.length()]; 131 | } 132 | } 133 | 134 | private void runResultTest(final Edit distance) { 135 | assertEquals(2, distance.distance("ehe", "reh")); 136 | assertEquals(2, distance.distance("eber", "leder")); 137 | assertEquals(0, distance.distance("ehe", "ehe")); 138 | assertEquals(0, distance.distance("", "")); 139 | assertEquals(1, distance.distance("ehe", "eher")); 140 | assertEquals(2, distance.distance("he", "")); 141 | assertEquals(2, distance.distance("", "he")); 142 | assertEquals(0, distance.distance("rechtschaffen", "rechtschaffen")); 143 | } 144 | 145 | private void runPerformanceTest(final Edit distance) { 146 | System.out.print("Running performance test for: " 147 | + distance.getClass().getSimpleName() + "..."); 148 | long start = System.currentTimeMillis(); 149 | for (int i = 0; i < 50; i++) { 150 | distance.distance("nacktschnecke", "rechtschaffen"); 151 | } 152 | System.out.println(String.format(" %s ms.", System.currentTimeMillis() 153 | - start)); // typical result: 3, 200, 60000 ms. for dp, memo., 154 | // rec. 155 | } 156 | 157 | } -------------------------------------------------------------------------------- /src/spinfo/HashTables.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | /** Hash Tables: a useful, efficient data structure. 
*/ 10 | public class HashTables { 11 | 12 | /* The basic idea: a direct-address table */ 13 | 14 | @Test 15 | public void direct() { 16 | DirectAddressTable t = new DirectAddressTable(); 17 | Person tom = new Person("tom"); 18 | Person jim = new Person("jim"); 19 | t.put(50, tom); // e.g. student ID = 50 20 | t.put(75, jim); // e.g. student ID = 75 21 | Assert.assertEquals(tom, t.get(50)); 22 | Assert.assertEquals(jim, t.get(75)); 23 | } 24 | 25 | static class DirectAddressTable { 26 | Object[] table = new Object[100]; // lots of space wasted 27 | 28 | public void put(int key, Object person) { // only numeric key supported 29 | table[key] = person; 30 | } 31 | 32 | public Object get(int key) { 33 | return table[key]; 34 | } 35 | } 36 | 37 | static class Person { 38 | 39 | private String name; 40 | 41 | public Person(String name) { 42 | this.name = name; 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return name; 48 | } 49 | } 50 | 51 | /* A simple hash table using chaining for collisions: */ 52 | 53 | @Test 54 | public void hashed() { 55 | HashTable t = new HashTable(); 56 | Person tom = new Person("tom"); 57 | Person jim = new Person("jim"); 58 | Person joe = new Person("joe"); 59 | t.put(50, tom); // e.g. student ID = 50 60 | t.put(75, jim); // e.g. student ID = 75 61 | t.put(85, joe); // e.g. 
student ID = 85, hashes to same as 75 here 62 | Assert.assertEquals(tom, t.get(50)); 63 | Assert.assertEquals(jim, t.get(75)); 64 | Assert.assertEquals(joe, t.get(85)); 65 | } 66 | 67 | static class HashTable { 68 | 69 | Element[] table = new Element[10]; // scale down 70 | 71 | static class Element { 72 | Element next; 73 | Object key; // key can be of any type 74 | Object value; 75 | 76 | public Element(Object key, Object value) { 77 | this.key = key; 78 | this.value = value; 79 | } 80 | } 81 | 82 | public void put(Object key, Object value) { 83 | Element newElement = new Element(key, value); 84 | int slot = hash(key); 85 | Element e = table[slot]; 86 | table[slot] = newElement; // place new element in table 87 | /* Handle previous element in the slot with a different key: */ 88 | if (e != null && !e.key.equals(newElement.key)) { 89 | newElement.next = e; // add new in front 90 | } 91 | } 92 | 93 | public Object get(Object key) { 94 | Element e = table[hash(key)]; 95 | if (e == null) // no value in slot 96 | return null; 97 | /* Find element with correct key in list: */ 98 | while (!(e.key.equals(key)) && e.next != null) { 99 | e = e.next; 100 | } 101 | return e.key.equals(key) ? 
e.value : null; 102 | } 103 | 104 | private int hash(Object key) { 105 | // simple demo hash: map key to table length 106 | if (key instanceof Integer) { 107 | return ((Integer) key) % table.length; 108 | } 109 | if (key instanceof String) { 110 | return ((String) key).length() % table.length; 111 | } 112 | return key.hashCode() % table.length; 113 | } 114 | } 115 | 116 | /* Hashing in practice: equality for custom objects */ 117 | 118 | @Test 119 | public void equality() { 120 | Student s1 = new Student(5, "John", "Doe"); 121 | Student s2 = new Student(8, "Jim", "Jones"); 122 | Student s3 = new Student(8, "Jim", "Jones"); 123 | Student s4 = new Student(5, "John", "Doe"); 124 | /* hashCode has to be implemented consistent with equals: */ 125 | Assert.assertEquals(s1, s4); 126 | Assert.assertEquals(s2, s3); 127 | Assert.assertEquals(s1.hashCode(), s4.hashCode()); 128 | Assert.assertEquals(s2.hashCode(), s3.hashCode()); 129 | Assert.assertFalse(s1.equals(s2)); 130 | Assert.assertFalse(s1.hashCode() == s2.hashCode()); 131 | } 132 | 133 | static class Student { 134 | int id; 135 | String first; 136 | String last; 137 | 138 | public Student(int id, String first, String last) { 139 | this.id = id; 140 | this.first = first; 141 | this.last = last; 142 | } 143 | 144 | @Override 145 | public String toString() { 146 | return String.format("%s %s (%s)", first, last, id); 147 | } 148 | 149 | @Override 150 | public int hashCode() { // use same values as in equals 151 | int result = 17; 152 | result = 31 * result + id; 153 | result = 31 * result + first.hashCode(); 154 | result = 31 * result + last.hashCode(); 155 | return result; 156 | } 157 | 158 | @Override 159 | public boolean equals(Object that) { // use same values as in hashCode 160 | return (that instanceof Student) && ((Student) that).id == this.id 161 | && ((Student) that).first.equals(this.first) 162 | && ((Student) that).last.equals(this.last); 163 | } 164 | } 165 | 166 | /* Hash table sample usage: counting words */ 
167 | 168 | @Test 169 | public void usage() { 170 | String text = "hi there hi everybody hi there again"; 171 | HashTable t = count(text); 172 | Assert.assertEquals(3, t.get("hi")); 173 | Assert.assertEquals(2, t.get("there")); 174 | Assert.assertEquals(1, t.get("everybody")); 175 | } 176 | 177 | private HashTable count(String text) { 178 | HashTable t = new HashTable(); 179 | String[] words = text.split(" "); 180 | for (String w : words) { 181 | Integer v = (Integer) t.get(w); 182 | if (v == null) // first occurrence 183 | v = 0; 184 | t.put(w, v + 1); // count up 185 | } 186 | return t; 187 | } 188 | } -------------------------------------------------------------------------------- /src/spinfo/Index.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static java.util.Arrays.asList; 6 | import static org.junit.Assert.assertEquals; 7 | import static org.junit.Assert.assertTrue; 8 | 9 | import java.io.File; 10 | import java.io.FileNotFoundException; 11 | import java.io.IOException; 12 | import java.net.MalformedURLException; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.Collections; 16 | import java.util.Comparator; 17 | import java.util.HashMap; 18 | import java.util.Iterator; 19 | import java.util.List; 20 | import java.util.Map; 21 | import java.util.Scanner; 22 | import java.util.Set; 23 | import java.util.SortedSet; 24 | import java.util.TreeSet; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | import org.junit.Assert; 29 | import org.junit.Test; 30 | 31 | /** 32 | * Basic indexing and preprocessing with regular expressions. 
Requires file 33 | * "pg100.txt", The Complete Works of William Shakespeare 34 | * (http://www.gutenberg.org/ebooks/100.txt.utf8) 35 | */ 36 | public class Index { 37 | 38 | /* Before indexing, we need to determine what elements to index. */ 39 | 40 | private static final Preprocessor PREPROCESSOR = new Preprocessor(); 41 | 42 | @Test 43 | public void tokenization() { 44 | assertEquals(asList("hello", "world"), process("hello, world!")); 45 | assertEquals(asList("123", "test"), process("test 123, 123 test, test")); 46 | assertEquals(asList("0221-123123", "test"), process("0221-123123, test")); 47 | assertEquals(asList("123", "köln", "test"), process("test - köln - 123")); 48 | } 49 | 50 | private List process(String string) { 51 | // some wrapping for the tests (compare with sorted list) 52 | return new ArrayList(new TreeSet( 53 | PREPROCESSOR.tokenize(string))); 54 | } 55 | 56 | @Test 57 | public void patterns() { 58 | assertTrue("0221-470".matches(SpecialCase.COMPOUND.regex)); 59 | assertTrue(!"Meine Nummer: 0221-470.".matches(SpecialCase.COMPOUND.regex)); 60 | assertTrue(!"4711".matches(SpecialCase.COMPOUND.regex)); 61 | assertTrue(!"Daimler-Benz".matches(SpecialCase.COMPOUND.regex)); 62 | assertTrue("8.04".matches(SpecialCase.COMPOUND.regex)); 63 | assertTrue("15:10".matches(SpecialCase.COMPOUND.regex)); 64 | assertTrue("3,50".matches(SpecialCase.COMPOUND.regex)); 65 | assertTrue("fabian.steeg@uni-koeln.de".matches(SpecialCase.EMAIL.regex)); 66 | assertTrue("fsteeg@spinfo.uni-koeln.de".matches(SpecialCase.EMAIL.regex)); 67 | assertTrue(!"fabian@home".matches(SpecialCase.EMAIL.regex)); 68 | } 69 | 70 | /* 71 | * Available patterns for extraction. Uses enum instead of constants to 72 | * iterate over all patterns in constructor of Preprocessor. 
73 | */ 74 | enum SpecialCase { 75 | /* Phone (0221-4701751), versions (8.04), money (3,50) and time (15:15) */ 76 | COMPOUND("\\d+[-.,:]\\d+"), 77 | /* Simple numbers */ 78 | NUMBER("\\d+"), 79 | /* Some simple email adresses */ 80 | EMAIL("[^@\\s]+@.+?\\.(de|com|eu|org|net)"); 81 | 82 | String regex; 83 | 84 | SpecialCase(final String regularExpression) { 85 | this.regex = regularExpression; 86 | } 87 | } 88 | 89 | /** 90 | * A preprocessor based on regular expressions: first extracts custom 91 | * patterns, then splits on a given delimiter. 92 | */ 93 | static class Preprocessor { 94 | /* Unicode-aware "non-letter" delimiter, ASCII version is \\W */ 95 | private static final String UNICODE_AWARE_DELIMITER = "[^\\p{L}]"; 96 | private List specialCases = new ArrayList(); 97 | private String delimiter; 98 | 99 | public Preprocessor() { 100 | delimiter = UNICODE_AWARE_DELIMITER; 101 | for (SpecialCase p : SpecialCase.values()) { 102 | specialCases.add(p); 103 | } 104 | } 105 | 106 | public List tokenize(final String input) { 107 | String text = input.toLowerCase(); 108 | List result = new ArrayList(); 109 | text = extractSpecialCases(text, result); 110 | tokenizeStandard(text, result); 111 | return result; 112 | } 113 | 114 | private String extractSpecialCases(String text, List result) { 115 | for (SpecialCase p : specialCases) { 116 | Pattern pattern = Pattern.compile(p.regex); 117 | Matcher matcher = pattern.matcher(text); 118 | while (matcher.find()) { 119 | String group = matcher.group(); 120 | result.add(group); // add special case 121 | text = text.replace(group, ""); // don't treat group as regex 122 | } 123 | } 124 | return text; 125 | } 126 | 127 | private void tokenizeStandard(String text, List result) { 128 | List list = Arrays.asList(text.split(delimiter)); 129 | for (String s : list) 130 | if (s.trim().length() > 0) // filter empty strings 131 | result.add(s.trim()); 132 | } 133 | 134 | } 135 | 136 | /* Once we can preprocess our corpus, we can build an 
index and search it: */ 137 | 138 | private static final InvertedIndex INDEX = buildIndex(); 139 | 140 | /** Test searching the corpus for a single term. */ 141 | @Test 142 | public final void testSearch() throws MalformedURLException, IOException { 143 | long start = System.currentTimeMillis(); 144 | String query = "Brutus"; 145 | Set list = INDEX.search(query); 146 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list, 147 | (System.currentTimeMillis() - start)); 148 | Assert.assertTrue("Search should find a single term", list.size() > 0); 149 | } 150 | 151 | /** Test searching the corpus for multiple search terms. */ 152 | @Test 153 | public final void testMulti() throws MalformedURLException, IOException { 154 | long start = System.currentTimeMillis(); 155 | String query = "Brutus Caesar"; // = Brutus AND Caesar 156 | Set list = INDEX.search(query); 157 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list, 158 | (System.currentTimeMillis() - start)); 159 | Assert.assertTrue("Search should find multiple terms", list.size() > 0); 160 | } 161 | 162 | static class InvertedIndex { 163 | 164 | private Map> index = new HashMap>(); 165 | 166 | public InvertedIndex(final List corpus) { 167 | index = index(corpus); 168 | } 169 | 170 | private Map> index(final List works) { 171 | Map> index = new HashMap>(); 172 | // for each document, and each of its token, add it to the index 173 | for (int i = 0; i < works.size(); i++) { 174 | List tokens = PREPROCESSOR.tokenize(works.get(i)); 175 | for (String token : tokens) { 176 | SortedSet postings = index.get(token); 177 | if (postings == null) { // first time 178 | postings = new TreeSet(); 179 | index.put(token, postings); 180 | } 181 | postings.add(i); // document i contains token 182 | } 183 | } 184 | return index; 185 | } 186 | 187 | public Set search(final String query) { 188 | /* We treat all entries as AND-linked... 
*/ 189 | List queries = PREPROCESSOR.tokenize(query); 190 | /* We get the results for each query term: */ 191 | List> allPostings = new ArrayList>(); 192 | for (String q : queries) { 193 | SortedSet postings = index.get(q); 194 | if (postings != null) 195 | allPostings.add(postings); 196 | } 197 | /* For efficient intersection computation: sort lists by length */ 198 | sortByLength(allPostings); 199 | /* Intersection of postings for all query terms is our result: */ 200 | return intersectionOf(allPostings); 201 | } 202 | 203 | private void sortByLength(List> all) { 204 | Collections.sort(all, new Comparator>() { 205 | public int compare(final SortedSet o1, 206 | final SortedSet o2) { 207 | return Integer.valueOf(o1.size()).compareTo(o2.size()); 208 | } 209 | }); 210 | } 211 | 212 | private Set intersectionOf(List> all) { 213 | /* The result set is the intersection of the first list with all others: */ 214 | SortedSet result = all.get(0); 215 | for (SortedSet set : all.subList(1, all.size())) { 216 | result = intersection(result.iterator(), set.iterator()); 217 | } 218 | return result; 219 | } 220 | 221 | } 222 | 223 | /* Implementation and tests for the intersection algorithm: */ 224 | 225 | @Test 226 | public void intersection() { 227 | /* Test intersection computation for AND-queries: */ 228 | TreeSet PL1 = new TreeSet(Arrays.asList(4, 3, 2, 1)); 229 | TreeSet PL2 = new TreeSet(Arrays.asList(2, 4, 6, 8)); 230 | Assert.assertEquals(Arrays.asList(2, 4), new ArrayList( 231 | intersection(PL1.iterator(), PL2.iterator()))); 232 | } 233 | 234 | public static SortedSet intersection(final Iterator i1, 235 | final Iterator i2) { 236 | SortedSet result = new TreeSet(); 237 | Integer p1 = next(i1); 238 | Integer p2 = next(i2); 239 | while (p1 != null && p2 != null) { 240 | if (p1.equals(p2)) { 241 | result.add(p1); 242 | p1 = next(i1); 243 | p2 = next(i2); 244 | } else if (p1 < p2) 245 | p1 = next(i1); 246 | else 247 | p2 = next(i2); 248 | } 249 | return result; 250 | } 251 
| 252 | /* A little oddity to stay close to Manning et al. 2008, p. 11: */ 253 | private static Integer next(final Iterator i1) { 254 | return i1.hasNext() ? i1.next() : null; 255 | } 256 | 257 | /* Utilities: load data, build index: */ 258 | 259 | private static InvertedIndex buildIndex() { 260 | List corpus = corpus(); 261 | long start = System.currentTimeMillis(); 262 | System.out.printf("Building index for %s texts... ", corpus.size()); 263 | InvertedIndex invertedIndex = new InvertedIndex(corpus); 264 | System.out 265 | .printf("done, took %s ms.\n", System.currentTimeMillis() - start); 266 | return invertedIndex; 267 | } 268 | 269 | private static List corpus() { 270 | try { 271 | Scanner s = new Scanner(new File("pg100.txt"), "UTF-8"); 272 | StringBuilder builder = new StringBuilder(); 273 | while (s.hasNextLine()) { 274 | builder.append(s.nextLine()).append("\n"); 275 | } 276 | /* Each work is delimited by a line ending with a year: */ 277 | return Arrays.asList(builder.toString().split("1[56][0-9]{2}\n")); 278 | } catch (FileNotFoundException e) { 279 | e.printStackTrace(); 280 | } 281 | return Collections.emptyList(); 282 | } 283 | 284 | } -------------------------------------------------------------------------------- /src/spinfo/Lists.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.Iterator; 8 | import java.util.NoSuchElementException; 9 | 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | /** Lists: elementary data structures. */ 14 | public class Lists { 15 | 16 | /** Low-level, non-OOP list implementation simulating a tuple/record/struct. 
*/ 17 | @Test 18 | public void tuple() { 19 | /* Independent nodes: */ 20 | Object[] first = new Object[2]; 21 | Object[] second = new Object[2]; 22 | Object[] third = new Object[2]; 23 | /* Containing values: */ 24 | first[0] = "first"; 25 | second[0] = "second"; 26 | third[0] = "third"; 27 | /* Linked with pointers: */ 28 | first[1] = second; 29 | second[1] = third; 30 | /* Can be traversed: */ 31 | System.out.println("List traversal: "); 32 | Object[] current = first; 33 | while (current != null) { 34 | System.out.println(current[0]); 35 | current = (Object[]) current[1]; 36 | } 37 | } 38 | 39 | /** OOP implementation of a queue, a FIFO list (first in, first out). */ 40 | @Test 41 | public void queue() { 42 | Queue queue = new Queue(); 43 | /* Enqueue at end: */ 44 | queue.enqueue("first"); 45 | queue.enqueue("second"); 46 | queue.enqueue("third"); 47 | /* Iterate: */ 48 | System.out.println("Queue traversal: "); 49 | Node current = queue.first; 50 | while (current != null) { 51 | System.out.println(current.value); 52 | current = current.next; 53 | } 54 | /* Dequeue from front: */ 55 | assertEquals("first", queue.dequeue()); 56 | assertEquals("second", queue.dequeue()); 57 | assertEquals("third", queue.dequeue()); 58 | assertEquals(null, queue.dequeue()); 59 | } 60 | 61 | /** A list element: wraps a value and a reference to the next element. */ 62 | static class Node { 63 | Object value; 64 | Node next; 65 | 66 | Node(Object value) { 67 | this.value = value; 68 | } 69 | } 70 | 71 | /** The queue class enforces the restricted FIFO access. */ 72 | static class Queue /**/implements Iterable /* like implements List *//**/{ 73 | 74 | private Node first; 75 | private Node last; 76 | 77 | /** Add an object in constant time. */ 78 | public void enqueue(Object value) { 79 | Node n = new Node(value); 80 | if (first == null) { 81 | first = n; 82 | last = first; 83 | } else { 84 | last.next = n; 85 | last = n; 86 | } 87 | } 88 | 89 | /** Get an object in constant time. 
*/ 90 | public Object dequeue() { 91 | if (first == null) 92 | return null; 93 | Object result = first.value; 94 | first = first.next; 95 | return result; 96 | } 97 | 98 | /**/ 99 | @Override 100 | public Iterator iterator() { 101 | return new NodeIterator(first); 102 | } 103 | /**/ 104 | } 105 | 106 | /** OOP implementation of a stack, a LIFO list (last in, first out). */ 107 | @Test 108 | public void stack() { 109 | Stack stack = new Stack(); 110 | /* Push on top, i.e. from front: */ 111 | stack.push("first"); 112 | stack.push("second"); 113 | stack.push("third"); 114 | /* Iterate: */ 115 | System.out.println("Stack traversal: "); 116 | Node current = stack.first; 117 | while (current != null) { 118 | System.out.println(current.value); 119 | current = current.next; 120 | } 121 | /* Pop from top, i.e. from front: */ 122 | assertEquals("third", stack.pop()); 123 | assertEquals("second", stack.pop()); 124 | assertEquals("first", stack.pop()); 125 | assertEquals(null, stack.pop()); 126 | } 127 | 128 | /** The stack class enforces the restricted LIFO access. */ 129 | static class Stack /**/implements Iterable /* like implements List *//**/{ 130 | 131 | private Node first; 132 | 133 | /** Add an object in constant time. */ 134 | public void push(Object value) { 135 | Node n = new Node(value); 136 | if (first == null) { 137 | first = n; 138 | } else { 139 | n.next = first; 140 | first = n; 141 | } 142 | } 143 | 144 | /** Get an object in constant time. */ 145 | public Object pop() { 146 | if (first == null) 147 | return null; 148 | Object result = first.value; 149 | first = first.next; 150 | return result; 151 | } 152 | 153 | /**/ 154 | @Override 155 | public Iterator iterator() { 156 | return new NodeIterator(first); 157 | } 158 | /**/ 159 | } 160 | 161 | /** Common to both: linear order, both are iterable. 
*/ 162 | @Test 163 | public void list() { 164 | Queue list = new Queue(); 165 | list.enqueue("first"); 166 | list.enqueue("second"); 167 | list.enqueue("third"); 168 | System.out.println("Iterate using Iterator: "); 169 | Iterator iterator = list.iterator(); 170 | while (iterator.hasNext()) { 171 | System.out.println(iterator.next()); 172 | } 173 | System.out.println("Iterate using Iterable: "); 174 | for (Object o : list) { 175 | System.out.println(o); 176 | } 177 | } 178 | 179 | /** A sequence can be traversed. */ 180 | interface List { 181 | Iterator iterator(); 182 | } 183 | 184 | /** Representation of the stateful traversal. */ 185 | interface SimpleIterator { 186 | Object next(); 187 | 188 | boolean hasNext(); 189 | } 190 | 191 | /** Iterator implementation based on linked nodes. */ 192 | static class NodeIterator implements Iterator /* like SimpleIterator */{ 193 | private Node current; 194 | 195 | public NodeIterator(Node first) { 196 | if (first != null) 197 | current = first; 198 | else 199 | throw new IllegalArgumentException(); 200 | } 201 | 202 | @Override 203 | public Object next() { 204 | if (current.value == null) { 205 | throw new NoSuchElementException(); 206 | } 207 | Object next = current.value; 208 | current = current.next; 209 | return next; 210 | } 211 | 212 | @Override 213 | public boolean hasNext() { 214 | return current != null; 215 | } 216 | 217 | @Override 218 | public void remove() { 219 | throw new IllegalStateException("Not implemented"); 220 | } 221 | 222 | } 223 | 224 | @Before 225 | public void line() { 226 | System.out.println(); 227 | } 228 | 229 | } 230 | -------------------------------------------------------------------------------- /src/spinfo/Quicksort.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import 
java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.List; 10 | import java.util.Random; 11 | 12 | import org.junit.Test; 13 | 14 | /** Quicksort: a state of the art comparison-based sorting algorithm. */ 15 | public class Quicksort { 16 | 17 | /* For complexity discussion, see also insertion sort in SortSearch.java */ 18 | /* For divide-and-conquer approach, see binary search in SortSearch.java */ 19 | 20 | @Test 21 | public void sort() { 22 | List vals = new ArrayList(Arrays.asList(91, 23, 88, 93, 23 | 20, 37)); 24 | List sort = sort(vals); 25 | assertEquals(Arrays.asList(20, 23, 37, 88, 91, 93), sort); 26 | System.out.println("Sorted: " + sort); 27 | } 28 | 29 | private List sort(List vals) { 30 | sort(vals, 0, vals.size() - 1); // start with full list 31 | return vals; // in-place algorithm, modifies vals 32 | } 33 | 34 | private void sort(List vals, int left, int right) { 35 | if (left < right) { // if a section exists 36 | int p = /**/randomPartition/**/(vals, left, right); // divide at p 37 | sort(vals, left, p - 1); // conquer left of p 38 | sort(vals, p + 1, right); // conquer right of p 39 | } 40 | } 41 | 42 | private int partition(List vals, int left, int right) { 43 | int x = vals.get(right); // pivot element (here: last element) 44 | int i = left - 1; // left .. i is <= x 45 | /* loop invariant: vals left of i are <= x, vals right of i are > x */ 46 | for (int j = left; j <= right - 1; j++) { // i + 1 .. 
j - 1 is > x 47 | if (vals.get(j) <= x) { // current value belongs to first partition 48 | i++; // grow first partition, <= x 49 | swap(vals, i, j); // swap smaller value to first partition 50 | } 51 | } 52 | swap(vals, i + 1, right); // swap pivot to end of smaller partition 53 | return i + 1; // return pivot position 54 | } 55 | 56 | private void swap(List vals, int a, int b) { 57 | int buf = vals.get(a); // buffer for the first val 58 | vals.set(a, vals.get(b)); // overwrite the first val with the second val 59 | vals.set(b, buf); // restore the first val at the position of the second val 60 | } 61 | 62 | /* Randomize pivot for improved average runtime for all inputs */ 63 | 64 | Random r = new Random(); 65 | 66 | private int randomPartition(List vals, int left, int right) { 67 | int i = left + r.nextInt(right - left + 1); // map 0..x to left..right 68 | swap(vals, right, i); // place random val at end, will be pivot 69 | return partition(vals, left, right); 70 | } 71 | 72 | } -------------------------------------------------------------------------------- /src/spinfo/SortSearch.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | 12 | import org.junit.Test; 13 | 14 | /** Sorting and searching: elementary algorithms. 
*/ 15 | public class SortSearch { 16 | 17 | /* First elementary algorithmic problem: sorting */ 18 | 19 | @Test 20 | public void insertionSort() { 21 | List vals = new ArrayList(Arrays.asList(91, 23, 88, 93, 22 | 20, 37)); 23 | List sort = sort(vals); 24 | assertEquals(Arrays.asList(20, 23, 37, 88, 91, 93), sort); 25 | System.out.println("Sorted: " + sort); 26 | } 27 | 28 | private List sort(List vals) { 29 | /* loop invariant: now sorted elements, originally in vals[0..i-1] */ 30 | List sorted = new LinkedList(vals.subList(0, 1)); 31 | /* loop invariant init: contains first element of vals */ 32 | for (int i = 1; i < vals.size(); i++) { 33 | int next = vals.get(i); 34 | int index = index(sorted, next); 35 | System.out.println(sorted + " <- " + next + " @ " + index); 36 | /* loop invariant maintenance: add one, sorted */ 37 | sorted.add(index, next); 38 | } 39 | /* loop invariant termination: terminates when i==vals.size, full list */ 40 | return sorted; 41 | /* 42 | * run time depends on 1) input size, 2) how sorted input is -- best case: 43 | * no inner loop, O(n) (here: reverse sorted), worst case: full inner loop, 44 | * O(n^2) (here: already sorted). 
45 | */ 46 | } 47 | 48 | private int index(List res, int next) { 49 | int i = 0; 50 | while (i < res.size() && next > res.get(i)) { 51 | i++; 52 | } 53 | return i; 54 | } 55 | 56 | /* Second elementary algorithmic problem: searching */ 57 | 58 | @Test 59 | public void linearSearch() { 60 | List vals = Arrays.asList(20, 23, 37, 88, 91, 93); 61 | assertEquals(0, linearSearch(20, vals)); 62 | assertEquals(1, linearSearch(23, vals)); 63 | assertEquals(2, linearSearch(37, vals)); 64 | assertEquals(3, linearSearch(88, vals)); 65 | assertEquals(4, linearSearch(91, vals)); 66 | assertEquals(5, linearSearch(93, vals)); 67 | assertEquals(-1, linearSearch(100, vals)); 68 | } 69 | 70 | private int linearSearch(int i, List vals) { 71 | for (int j = 0; j < vals.size(); j++) { 72 | if (vals.get(j) == i) 73 | return j; 74 | } 75 | return -1; 76 | } 77 | 78 | /* A general algorithmic strategy: divide and conquer */ 79 | 80 | private int factorial(int i) { 81 | /* factorial(0) == 1; factorial(5) == 1 * 2 * 3 * 4 * 5 */ 82 | return i == 0 ? 
1 : i * factorial(i - 1); 83 | /* 84 | * We don't iterate but devide the problem into subproblems, and solve the 85 | * overall problem by combining subsolutions: i * factorial(i-1) 86 | */ 87 | } 88 | 89 | @Test 90 | public void factorial() { 91 | assertEquals(1, factorial(0)); 92 | assertEquals(1, factorial(1)); 93 | assertEquals(2, factorial(2)); 94 | assertEquals(120, factorial(5)); 95 | assertEquals(3628800, factorial(10)); 96 | } 97 | 98 | @Test(expected = StackOverflowError.class) 99 | public void factorialOverflow() { 100 | factorial(Integer.MAX_VALUE); 101 | } 102 | 103 | /* Recursive, divide-and-conquer binary search implementation */ 104 | 105 | @Test 106 | public void binarySearchFound() { 107 | List vals = Arrays.asList(20, 23, 37, 88, 91, 93); 108 | assertEquals(0, binarySearch(20, vals)); 109 | assertEquals(1, binarySearch(23, vals)); 110 | assertEquals(2, binarySearch(37, vals)); 111 | assertEquals(3, binarySearch(88, vals)); 112 | assertEquals(4, binarySearch(91, vals)); 113 | assertEquals(5, binarySearch(93, vals)); 114 | assertEquals(6, binarySearch(95, Arrays.asList(20, 23, 37, 88, 91, 93, 95))); 115 | } 116 | 117 | @Test 118 | public void binarySearchNotFound() { 119 | assertEquals(-1, binarySearch(100, Arrays.asList(20, 23, 37, 88, 91, 93))); 120 | assertEquals(-1, 121 | binarySearch(100, Arrays.asList(20, 23, 37, 88, 91, 93, 95))); 122 | } 123 | 124 | private int binarySearch(int i, List vals) { 125 | return binarySearch(i, vals, 0, vals.size() - 1); 126 | } 127 | 128 | private int binarySearch(int i, List vals, int left, int right) { 129 | if (left > right) { 130 | return -1; 131 | } 132 | int mid = left + (right - left) / 2; 133 | Integer val = vals.get(mid); 134 | if (val == i) 135 | return mid; 136 | if (val > i) { /* i in [left..m] */ 137 | return binarySearch(i, vals, left, mid - 1); 138 | } else { /* i in [m..right] */ 139 | return binarySearch(i, vals, mid + 1, right); 140 | } 141 | } 142 | 143 | /* Sort (and search) any type of 
objects with same principle: Comparable */ 144 | 145 | @Test 146 | public void comparable() { 147 | List books = Arrays.asList( 148 | /**/ 149 | new Book("Buddenbrooks", "Mann"), 150 | /**/ 151 | new Book("Werther", "Goethe"), 152 | /**/ 153 | new Book("Faust", "Goethe")); 154 | /* Sort first by author, second by title */ 155 | List sort = sortBook(books); 156 | assertEquals(Arrays.asList( 157 | /**/ 158 | new Book("Faust", "Goethe"), 159 | /**/ 160 | new Book("Werther", "Goethe"), 161 | /**/ 162 | new Book("Buddenbrooks", "Mann")), sort); 163 | System.out.println("Sorted: " + sort); 164 | } 165 | 166 | static class Book implements Comparable { 167 | 168 | private String title; 169 | private String author; 170 | 171 | public Book(String title, String author) { 172 | this.title = title; 173 | this.author = author; 174 | } 175 | 176 | @Override 177 | public int compareTo(Book that) { /* Define how to compare books */ 178 | if (this.author.compareTo(that.author) == 0) { 179 | return this.title.compareTo(that.title); // second by title 180 | } 181 | return this.author.compareTo(that.author); // first by author 182 | } 183 | 184 | @Override 185 | public String toString() { 186 | return author + ": " + title; /* for useful output */ 187 | } 188 | 189 | @Override 190 | public boolean equals(Object that) { /* for comparison in unit test */ 191 | return that instanceof Book && ((Book) that).author.equals(this.author) 192 | && ((Book) that).title.equals(this.title); 193 | } 194 | 195 | @Override 196 | public int hashCode() { // use same values as in equals 197 | int result = 17; 198 | result = 31 * result + author.hashCode(); 199 | result = 31 * result + title.hashCode(); 200 | return result; 201 | } 202 | 203 | } 204 | 205 | /* changed here: Book instead of Integer (else as above) */ 206 | private List sortBook(List vals) { 207 | List sorted = new LinkedList(vals.subList(0, 1)); 208 | for (int i = 1; i < vals.size(); i++) { 209 | Book next = vals.get(i); 210 | int index = 
indexBook(sorted, next); 211 | System.out.println(sorted + " <- " + next + " @ " + index); 212 | sorted.add(index, next); 213 | } 214 | return sorted; 215 | } 216 | 217 | private int indexBook(List res, Book next) { 218 | int i = 0; 219 | /* changed here: use compareTo instead of == (else as above) */ 220 | while (i < res.size() && next.compareTo(res.get(i)) > 0) { 221 | i++; 222 | } 223 | return i; 224 | } 225 | } 226 | -------------------------------------------------------------------------------- /src/spinfo/TestSuite.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import org.junit.runner.RunWith; 6 | import org.junit.runners.Suite; 7 | import org.junit.runners.Suite.SuiteClasses; 8 | 9 | /** Main suite for running all tests. */ 10 | @RunWith(Suite.class) 11 | @SuiteClasses({ Collation.class, CollectionsGenerics.class, Crawling.class, 12 | EditDistance.class, HashTables.class, Index.class, Lists.class, 13 | Quicksort.class, SortSearch.class, Trees.class }) 14 | public class TestSuite { 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/spinfo/Trees.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.List; 10 | 11 | import org.junit.Test; 12 | 13 | /** Trees: a versatile data structure. */ 14 | public class Trees { 15 | 16 | /* Basic implementation strategy: nodes and links/pointers/references */ 17 | 18 | /* Simple binary tree, two pointers in each node. 
*/ 19 | static class SimpleTree { 20 | SimpleNode root; 21 | 22 | static class SimpleNode { 23 | Object value; 24 | SimpleNode left; 25 | SimpleNode right; 26 | } 27 | } 28 | 29 | /* Simple multi-children tree, with a list of children in each node. */ 30 | static class MultiTree { 31 | MultiNode root; 32 | 33 | static class MultiNode { 34 | Object value; 35 | List children; // n references per node for n children 36 | } 37 | } 38 | 39 | /* Multi-children tree implemented like a linked list: less references. */ 40 | static class LinkedTree { 41 | LinkedNode root; 42 | 43 | static class LinkedNode { 44 | Object value; 45 | LinkedNode head; 46 | LinkedNode tail; // 2 references per node for n children 47 | } 48 | } 49 | 50 | @Test 51 | public void binarySearchTree() { 52 | BinaryTree tree = new BinaryTree(); 53 | tree.addRecursive(5); 54 | tree.addRecursive(3); 55 | tree.addRecursive(8); 56 | tree.addRecursive(2); 57 | tree.addRecursive(4); 58 | tree.addRecursive(7); 59 | tree.addRecursive(6); 60 | assertEquals(Arrays.asList(5, 3, 2, 4, 8, 7, 6), tree.preorder()); 61 | } 62 | 63 | @Test 64 | public void dotVisualization() { 65 | BinaryTree tree = new BinaryTree(); 66 | tree.addIterative(5); 67 | tree.addIterative(3); 68 | tree.addIterative(8); 69 | tree.addIterative(2); 70 | tree.addIterative(4); 71 | tree.addIterative(7); 72 | tree.addIterative(6); 73 | assertEquals("digraph{5->3;3->2;3->4;5->8;8->7;7->6;}", tree.visualize()); 74 | } 75 | 76 | static class BinaryTree { 77 | BinaryNode root; 78 | 79 | static class BinaryNode { 80 | public BinaryNode(int value) { 81 | this.value = value; 82 | } 83 | 84 | int value; 85 | BinaryNode left; 86 | BinaryNode right; 87 | 88 | public String toString() { 89 | return String.valueOf(value); 90 | } 91 | } 92 | 93 | /* Corresponding to the rekursive structure, we add recursively */ 94 | public void addRecursive(int value) { 95 | if (root == null) 96 | root = new BinaryNode(value); // done 97 | else 98 | addRecursive(root, value); 
// go on 99 | } 100 | 101 | private void addRecursive(BinaryNode node, int value) { 102 | if (value < node.value) { // new val should go left 103 | if (node.left == null) 104 | node.left = new BinaryNode(value); // done 105 | else 106 | addRecursive(node.left, value); // go on left 107 | } else { // new val should go right 108 | if (node.right == null) 109 | node.right = new BinaryNode(value); // done 110 | else 111 | addRecursive(node.right, value); // go on right 112 | } 113 | } 114 | 115 | /* But a recursive concept can also be implemented iteratively */ 116 | public void addIterative(int value) { 117 | if (root == null) 118 | root = new BinaryNode(value); // done 119 | else 120 | addIterative(root, value); // go on 121 | } 122 | 123 | private void addIterative(BinaryNode root, int value) { 124 | BinaryNode current = root; 125 | while (current != null) { 126 | if (value < current.value) { 127 | if (current.left == null) { 128 | current.left = new BinaryNode(value); 129 | return; // done 130 | } 131 | current = current.left; // go on 132 | } else { 133 | if (current.right == null) { 134 | current.right = new BinaryNode(value); 135 | return; // done 136 | } 137 | current = current.right; // go on 138 | } 139 | } 140 | } 141 | 142 | /* Two preorder traversals: collect values, visualize with Graphviz DOT */ 143 | 144 | public List preorder() { 145 | return preorder(root, new ArrayList()); 146 | } 147 | 148 | public List preorder(BinaryNode node, List result) { 149 | if (node == null) 150 | return result; 151 | result.add(node.value); // Pre-order: 1. root 152 | preorder(node.left, result); // 2. left 153 | preorder(node.right, result); // 3. 
right 154 | return result; 155 | } 156 | 157 | public String visualize() { // another Pre-order traversal 158 | StringBuilder builder = new StringBuilder(); 159 | return String.format("digraph{%s}", visualize(root, builder)); 160 | } 161 | 162 | private String visualize(BinaryNode node, StringBuilder builder) { 163 | if (node != null) { // Pre-order: 1. root 164 | if (node.left != null) 165 | builder.append(String.format("%s->%s;", node, node.left)); 166 | visualize(node.left, builder); // 2. left 167 | if (node.right != null) 168 | builder.append(String.format("%s->%s;", node, node.right)); 169 | visualize(node.right, builder); // 3. right 170 | } 171 | return builder.toString(); 172 | } 173 | 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/spinfo/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Code for the Java course at the University of Cologne, Department of Linguistics 3 | * (Sprachliche Informationsverarbeitung, http://github.com/spinfo). 4 | **/ 5 | package spinfo; --------------------------------------------------------------------------------