├── .classpath ├── .gitignore ├── .project ├── .settings └── org.eclipse.jdt.ui.prefs ├── README.textile ├── build.xml ├── lib ├── guava-r09.jar ├── guava-src-r09.zip ├── junit-4.8.2-src.jar ├── junit-4.8.2.jar ├── nekohtml.jar └── xercesImpl.jar ├── pg100.txt └── src └── spinfo ├── Collation.java ├── CollectionsGenerics.java ├── Crawling.java ├── EditDistance.java ├── HashTables.java ├── Index.java ├── Lists.java ├── Quicksort.java ├── SortSearch.java ├── TestSuite.java ├── Trees.java └── package-info.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | build -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | java 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | #Mon May 23 17:35:24 CEST 2011 2 | eclipse.preferences.version=1 3 | editor_save_participant_org.eclipse.jdt.ui.postsavelistener.cleanup=true 4 | formatter_settings_version=12 5 | sp_cleanup.add_default_serial_version_id=true 6 | sp_cleanup.add_generated_serial_version_id=false 7 | sp_cleanup.add_missing_annotations=true 8 | sp_cleanup.add_missing_deprecated_annotations=true 9 | sp_cleanup.add_missing_methods=false 10 | sp_cleanup.add_missing_nls_tags=false 11 | sp_cleanup.add_missing_override_annotations=true 12 | sp_cleanup.add_missing_override_annotations_interface_methods=true 13 | 
sp_cleanup.add_serial_version_id=false 14 | sp_cleanup.always_use_blocks=true 15 | sp_cleanup.always_use_parentheses_in_expressions=false 16 | sp_cleanup.always_use_this_for_non_static_field_access=false 17 | sp_cleanup.always_use_this_for_non_static_method_access=false 18 | sp_cleanup.convert_to_enhanced_for_loop=false 19 | sp_cleanup.correct_indentation=false 20 | sp_cleanup.format_source_code=true 21 | sp_cleanup.format_source_code_changes_only=false 22 | sp_cleanup.make_local_variable_final=false 23 | sp_cleanup.make_parameters_final=false 24 | sp_cleanup.make_private_fields_final=true 25 | sp_cleanup.make_type_abstract_if_missing_method=false 26 | sp_cleanup.make_variable_declarations_final=true 27 | sp_cleanup.never_use_blocks=false 28 | sp_cleanup.never_use_parentheses_in_expressions=true 29 | sp_cleanup.on_save_use_additional_actions=false 30 | sp_cleanup.organize_imports=true 31 | sp_cleanup.qualify_static_field_accesses_with_declaring_class=false 32 | sp_cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true 33 | sp_cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true 34 | sp_cleanup.qualify_static_member_accesses_with_declaring_class=false 35 | sp_cleanup.qualify_static_method_accesses_with_declaring_class=false 36 | sp_cleanup.remove_private_constructors=true 37 | sp_cleanup.remove_trailing_whitespaces=false 38 | sp_cleanup.remove_trailing_whitespaces_all=true 39 | sp_cleanup.remove_trailing_whitespaces_ignore_empty=false 40 | sp_cleanup.remove_unnecessary_casts=true 41 | sp_cleanup.remove_unnecessary_nls_tags=false 42 | sp_cleanup.remove_unused_imports=false 43 | sp_cleanup.remove_unused_local_variables=false 44 | sp_cleanup.remove_unused_private_fields=true 45 | sp_cleanup.remove_unused_private_members=false 46 | sp_cleanup.remove_unused_private_methods=true 47 | sp_cleanup.remove_unused_private_types=true 48 | sp_cleanup.sort_members=false 49 | sp_cleanup.sort_members_all=false 50 | 
sp_cleanup.use_blocks=false 51 | sp_cleanup.use_blocks_only_for_return_and_throw=false 52 | sp_cleanup.use_parentheses_in_expressions=false 53 | sp_cleanup.use_this_for_non_static_field_access=false 54 | sp_cleanup.use_this_for_non_static_field_access_only_if_necessary=true 55 | sp_cleanup.use_this_for_non_static_method_access=false 56 | sp_cleanup.use_this_for_non_static_method_access_only_if_necessary=true 57 | -------------------------------------------------------------------------------- /README.textile: -------------------------------------------------------------------------------- 1 | "Java at the University of Cologne, Department of Linguistics (Sprachliche Informationsverarbeitung)":http://spinfo.uni-koeln.de/spinfo-java.html -------------------------------------------------------------------------------- /build.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /lib/guava-r09.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-r09.jar -------------------------------------------------------------------------------- /lib/guava-src-r09.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/guava-src-r09.zip 
-------------------------------------------------------------------------------- /lib/junit-4.8.2-src.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2-src.jar -------------------------------------------------------------------------------- /lib/junit-4.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/junit-4.8.2.jar -------------------------------------------------------------------------------- /lib/nekohtml.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/nekohtml.jar -------------------------------------------------------------------------------- /lib/xercesImpl.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spinfo/java/05e7f7d367d0210d4c019adfe5e480eca67502f9/lib/xercesImpl.jar -------------------------------------------------------------------------------- /src/spinfo/Collation.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | import static org.junit.Assert.assertFalse; 7 | 8 | import java.text.CollationKey; 9 | import java.text.Collator; 10 | import java.text.ParseException; 11 | import java.text.RuleBasedCollator; 12 | import java.util.Arrays; 13 | import java.util.Collections; 14 | import java.util.Comparator; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Locale; 18 | import java.util.Map; 19 | 20 | import org.junit.Test; 21 | 22 | /** Lexicographic sorting 
and collation in Java. */ 23 | public class Collation { 24 | 25 | List words = Arrays.asList("Very", "Über", "very", "ultra", "über"); 26 | 27 | @Test 28 | public void basicProblem() { 29 | /* 30 | * The basic problem: Java's default sorting for strings (based on the 31 | * character's Unicode position) is insufficient for lexicographic ordering: 32 | */ 33 | Collections.sort(words); 34 | /* 35 | * Because it sorts all upper case letters before all lowercase letters and 36 | * sorts all letters with diacritics behind all standard letters: 37 | */ 38 | assertEquals(Arrays.asList("Very", "ultra", "very", "Über", "über"), words); 39 | /* Which is not what we would expect (upper, lower, diacritics together): */ 40 | assertFalse(Arrays.asList("ultra", "Über", "über", "Very", "very").equals( 41 | words)); 42 | } 43 | 44 | @Test 45 | public void basicSolution() { 46 | /* Idea: map the chars to their correct position, and sort by that mapping: */ 47 | final Map collationKeys = new HashMap(); 48 | collationKeys.put('U', 1); // or lower-level, with array: char['U'] = 1; 49 | collationKeys.put('u', 2); 50 | collationKeys.put('\u00dc', 3); // Ü 51 | collationKeys.put('\u00fc', 4); // ü 52 | collationKeys.put('V', 5); 53 | collationKeys.put('v', 6); 54 | /* We pass a custom sorting strategy to the sort method: */ 55 | Collections.sort(words, new Comparator() { 56 | @Override 57 | public int compare(String s1, String s2) { 58 | /* For this sample, we only look at the first letter: */ 59 | Character c1 = s1.charAt(0); 60 | Character c2 = s2.charAt(0); 61 | /* We don't compare the chars, but their collation keys: */ 62 | return collationKeys.get(c1).compareTo(collationKeys.get(c2)); 63 | } 64 | }); 65 | /* For our specific case, this results in a somewhat correct order: */ 66 | assertEquals(Arrays.asList("ultra", "Über", "über", "Very", "very"), words); 67 | } 68 | 69 | @Test 70 | public void collator() { 71 | /* Java contains region-specific collation rules, via Collator: */ 72 | 
final Collator collator = Collator.getInstance(Locale.GERMAN); 73 | Collections.sort(words, new Comparator() { 74 | @Override 75 | public int compare(String s1, String s2) { 76 | return collator.compare(s1, s2); 77 | } 78 | }); 79 | /* Which gets the details right, e.g. sort umlauts like their standards: */ 80 | assertEquals(Arrays.asList("über", "Über", "ultra", "very", "Very"), words); 81 | } 82 | 83 | @Test 84 | public void comparable() { 85 | /* 86 | * If we control the objects sorted (unlike strings), and the sorting does 87 | * not depend on something external to the objects (unlike above, where we 88 | * sort chars by their keys), we can define the order inside our objects: 89 | */ 90 | List words = Arrays.asList(new Word("Very"), new Word("ultra"), 91 | new Word("über"), new Word("Super")); 92 | /* From the usage side, it now looks like the default sorting just works: */ 93 | Collections.sort(words); 94 | assertEquals(Arrays.asList(new Word("Super"), new Word("über"), new Word( 95 | "ultra"), new Word("Very")), words); 96 | } 97 | 98 | static class Word implements Comparable { 99 | 100 | private String val; 101 | private Collator collator = Collator.getInstance(); // uses system locale 102 | private CollationKey key; 103 | 104 | public Word(String val) { 105 | this.val = val; 106 | this.key = collator.getCollationKey(val); // precompute the key 107 | } 108 | 109 | @Override 110 | public int compareTo(Word that) { 111 | // return this.val.compareTo(that.val); // naive, not sufficient 112 | /* Instead of comparing the vals, we can pass them to the collator: */ 113 | // return collator.compare(this.val, that.val); // always computes keys 114 | /* To improve performance, we precompute the keys, and compare these: */ 115 | return this.key.compareTo(that.key); 116 | } 117 | 118 | /* Java standard method implementations below, needed for testing here: */ 119 | 120 | @Override 121 | public String toString() { 122 | return val; 123 | } 124 | 125 | @Override 126 | 
public boolean equals(Object that) { 127 | return that instanceof Word && ((Word) that).val.equals(this.val); 128 | } 129 | 130 | @Override 131 | public int hashCode() { 132 | return val.hashCode(); // mandatory if equals, consistent with equals 133 | } 134 | } 135 | 136 | @Test 137 | public void customRules() throws ParseException { 138 | List w = Arrays.asList("Löss", "Lee", "Luv", "Löß"); 139 | /* Default collator: ß after ss */ 140 | sortWithCollator(w, Collator.getInstance(Locale.GERMAN)); // default german 141 | assertEquals(Arrays.asList("Lee", "Löss", "Löß", "Luv"), w); 142 | /* Custom requirement: sort ß before ss (old German spelling rules) */ 143 | String defaultRules = ((RuleBasedCollator) RuleBasedCollator 144 | .getInstance(Locale.GERMAN)).getRules(); 145 | String customRules = "ß < ss"; // additional custom rule, replaces default 146 | final Collator collator = new RuleBasedCollator(defaultRules + customRules); 147 | sortWithCollator(w, collator); 148 | assertEquals(Arrays.asList("Lee", "Löß", "Löss", "Luv"), w); 149 | } 150 | 151 | private void sortWithCollator(List words, final Collator collator) { 152 | Collections.sort(words, new Comparator() { 153 | @Override 154 | public int compare(String s1, String s2) { 155 | return collator.compare(s1, s2); 156 | } 157 | }); 158 | } 159 | 160 | } -------------------------------------------------------------------------------- /src/spinfo/CollectionsGenerics.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | import static org.junit.Assert.assertTrue; 7 | 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.Collection; 11 | import java.util.Collections; 12 | import java.util.Deque; 13 | import java.util.HashMap; 14 | import java.util.HashSet; 15 | import java.util.LinkedList; 16 
| import java.util.List; 17 | import java.util.Map; 18 | import java.util.Set; 19 | import java.util.SortedMap; 20 | import java.util.SortedSet; 21 | import java.util.TreeMap; 22 | import java.util.TreeSet; 23 | 24 | import org.junit.Test; 25 | 26 | /** Generic data structures and algorithms: Java generics and collections. */ 27 | public class CollectionsGenerics { 28 | 29 | /* Generics */ 30 | 31 | @Test 32 | public void basicGenerics() { 33 | /* 34 | * A List can contain different types of elements. We specify the type as a 35 | * parameter for the List class, e.g. Integer: 36 | */ 37 | List ints = Arrays.asList(1, 2, 3, 4, 5); // auto-boxed 38 | assertTrue(ints.get(0) instanceof Integer); 39 | /* Or String: */ 40 | List strings = Arrays.asList("one", "two"); 41 | assertTrue(strings.get(0) instanceof String); 42 | /* Since List can be used with different types, it's called a generic class. */ 43 | } 44 | 45 | @Test 46 | public void genericMethods() { 47 | /* As method used above, but implemented below: */ 48 | List s = asList(new String[] { "one", "two" }); 49 | /* Optional here: explicit type parameter: */ 50 | s = CollectionsGenerics. asList(new String[] { "one", "two" }); 51 | assertEquals("one", s.get(0)); 52 | assertEquals("two", s.get(1)); 53 | } 54 | 55 | /* A generic method: has a type parameter T (inferred or explicit) */ 56 | private static List asList(T[] ts) { 57 | List result = new ArrayList(); // choose List impl. on creation 58 | for (T t : ts) 59 | result.add(t); 60 | return result; 61 | } 62 | 63 | @Test 64 | public void genericClasses() { 65 | /* Like for List above, we can also use generics on our classes: */ 66 | Tree tree = new Tree(); 67 | tree.root = new Node("value"); 68 | assertEquals("value", tree.root.value); 69 | assertTrue(tree.root.value instanceof String); 70 | } 71 | 72 | static class Tree { // T becomes concrete on creation, e.g. String, 73 | // Integer, etc. 
74 | Node root; 75 | } 76 | 77 | static class Node { 78 | T value; 79 | Node left; 80 | Node right; 81 | 82 | public Node(T value) { 83 | this.value = value; 84 | } 85 | } 86 | 87 | /* Collections */ 88 | 89 | @Test 90 | public void collections() { 91 | List list = new ArrayList(); // refer by interface 92 | Collection coll = list; // List is a Collection 93 | Iterable iter = coll; // Collection is Iterable 94 | assertTrue(iter instanceof Iterable); 95 | assertTrue(iter instanceof Collection); 96 | assertTrue(iter instanceof List); 97 | assertTrue(iter instanceof ArrayList); 98 | /* The Collection Interface defines 4 kinds of methods: */ 99 | coll.add("hi"); // 1. methods for adding elements (also addAll, ...) 100 | coll.remove("hi"); // 2. methods for removing elements (also removeAll, ...) 101 | coll.contains("hi"); // 3. methods for querying (also containsAll, ...) 102 | coll.toArray(new String[0]); // 4. methods for conversion (iterator, ...) 103 | } 104 | 105 | @Test 106 | public void sets() { 107 | Set set = new HashSet(); // no duplicates, no order 108 | set = new TreeSet(); // change impl: no duplicates, sorted, tree 109 | assertTrue(set instanceof Collection); 110 | assertTrue(set instanceof Set); 111 | assertTrue(set instanceof TreeSet); 112 | assertTrue(set instanceof SortedSet); // additional interface 113 | set.add("hi"); // O(1) for HashSet, O(log n) for TreeSet 114 | assertTrue(set.contains("hi")); // O(1) for HashSet, O(log n) for TreeSet 115 | set.add("hi"); // add existing value, should not be added 116 | assertEquals(1, set.size()); // no duplicates 117 | } 118 | 119 | @Test 120 | public void lists() { 121 | List list = new ArrayList(); // array-based impl. 122 | list = new LinkedList(); // linked list impl. 
123 | assertTrue(list instanceof Collection); 124 | assertTrue(list instanceof List); 125 | assertTrue(list instanceof LinkedList); 126 | assertTrue(list instanceof Deque); // additional interface 127 | list.add("hi"); // O(1) for ArrayList and LinkedList (add at end) 128 | String s = list.get(0); // O(1) for ArrayList, O(n) for LinkedList 129 | list.remove(0); // O(1) for LinkedList (front), O(n) for ArrayList 130 | assertEquals("hi", s); 131 | } 132 | 133 | @Test 134 | public void maps() { 135 | Map map = new HashMap(); // hash table 136 | map = new TreeMap(); // change impl: sorted keys, tree 137 | assertTrue(map instanceof Map); 138 | assertTrue(map instanceof TreeMap); 139 | assertTrue(map instanceof SortedMap); // additional interface 140 | map.put("hi", 5); // O(1) for HashMap, O(log n) for TreeMap 141 | int i = map.get("hi"); // O(1) for HashMap, O(log n) for TreeMap 142 | assertEquals(5, i); 143 | } 144 | 145 | @Test 146 | public void algorithms() { 147 | /* Generic methods for working with collections, e.g. 
sorting and searching: */ 148 | List vals = Arrays.asList(91, 23, 88, 93, 20, 37); 149 | Collections.sort(vals); // merge sort, O(n log n) 150 | assertEquals(Arrays.asList(20, 23, 37, 88, 91, 93), vals); 151 | assertEquals(2, Collections.binarySearch(vals, 37)); // binsearch, O(log n) 152 | assertEquals(5, Collections.binarySearch(vals, 93)); 153 | } 154 | 155 | @Test 156 | public void wrappers() { 157 | /* We can convert collections by passing them to the constructor: */ 158 | List list = Arrays.asList("one", "one", "two", "two"); 159 | assertEquals(4, list.size()); 160 | /* Remove duplicates by wrapping the list in a set: */ 161 | Set set = new HashSet(list); 162 | assertEquals(2, set.size()); 163 | } 164 | } -------------------------------------------------------------------------------- /src/spinfo/Crawling.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertTrue; 6 | 7 | import java.io.IOException; 8 | import java.net.MalformedURLException; 9 | import java.net.URL; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.Collections; 13 | import java.util.HashSet; 14 | import java.util.List; 15 | import java.util.Set; 16 | import java.util.concurrent.ExecutorService; 17 | import java.util.concurrent.Executors; 18 | import java.util.concurrent.TimeUnit; 19 | 20 | import org.cyberneko.html.parsers.DOMParser; 21 | import org.junit.Test; 22 | import org.w3c.dom.Node; 23 | import org.xml.sax.SAXException; 24 | 25 | import com.google.common.base.Charsets; 26 | import com.google.common.io.Resources; 27 | 28 | /** Basic web crawling, HTML processing, and concurrency. */ 29 | public class Crawling { 30 | 31 | /** Test simple content loading via URL. 
*/ 32 | @Test 33 | public void load() throws IOException { 34 | /* 35 | * Simple loading of content from a URL, using Guava (passing the charset 36 | * specified by the site), but the result is not very useful as it is (e.g. 37 | * for indexing): 38 | */ 39 | System.out.println(Resources.toString(new URL("http://www.zeit.de/"), 40 | Charsets.UTF_8)); 41 | } 42 | 43 | /** Test web site parsing. */ 44 | @Test 45 | public void parse() throws SAXException, IOException { 46 | /* What we need is a structured processing of content and links: */ 47 | WebDocument doc = Parser.parse("http://www.zeit.de/"); 48 | String text = doc.text; 49 | Set links = doc.links; 50 | assertTrue("Document content should exist", text.length() > 0); 51 | assertTrue("Outgoing links should exist", links.size() > 0); 52 | System.out.println("Text: " + text); 53 | System.out.println("Links: " + links); 54 | } 55 | 56 | /** A web document representation consisting of text and links. */ 57 | static class WebDocument { 58 | String text; 59 | Set links; 60 | URL url; 61 | 62 | WebDocument(String url, String text, Set links) 63 | throws MalformedURLException { 64 | this.text = text; 65 | this.links = links; 66 | this.url = new URL(url); 67 | } 68 | } 69 | 70 | /** A parser that transforms a URL into a web document representation. 
*/ 71 | static class Parser { 72 | private static Set links; 73 | private static StringBuilder builder; 74 | 75 | static WebDocument parse(String url) throws SAXException, IOException { 76 | /* We parse with NekoHTML, an error-correcting parser based on Xerces: */ 77 | DOMParser parser = new DOMParser(); 78 | parser.parse(url); 79 | builder = new StringBuilder(); 80 | links = new HashSet(); 81 | /* We start at the first element: */ 82 | process(parser.getDocument().getFirstChild()); 83 | /* At the end we create our resulting document object: */ 84 | return new WebDocument(url, builder.toString().trim(), links); 85 | } 86 | 87 | private static void process(Node node) throws MalformedURLException { 88 | /* 89 | * We get elements by their names. We could use instanceof, and e.g. test 90 | * if something is a HTMLParagraphElement, but this is less robust, since 91 | * e.g. XHTML documents are made of elements in a different namespace. 92 | */ 93 | String elementName = node.getNodeName().toLowerCase().trim(); 94 | /* We treat as content here only text within a p-tag: */ 95 | if (elementName.equals("p")) { 96 | String text = node.getTextContent().trim(); 97 | if (text.length() > 0) { 98 | builder.append(text).append("\n\n"); // make it a paragraph 99 | } 100 | } else if (elementName.equals("a")) { 101 | if (node.hasAttributes()) { 102 | /* If the a-tag has a href attribute with http, add it to the links: */ 103 | Node href = node.getAttributes().getNamedItem("href"); 104 | if (href != null && href.getNodeValue().trim().startsWith("http://")) { 105 | links.add(href.getNodeValue().trim()); 106 | } 107 | } 108 | } 109 | /* Done with current node, recurse on same level (if there is more): */ 110 | Node sibling = node.getNextSibling(); 111 | if (sibling != null) { 112 | process(sibling); 113 | } 114 | /* Done with current level, recurse to next level (if there is more): */ 115 | Node child = node.getFirstChild(); 116 | if (child != null) { 117 | process(child); 118 | } 119 | 
} 120 | } 121 | 122 | /** Test the actual crawling, quick sample. */ 123 | @Test 124 | public void crawl() throws InterruptedException { 125 | /* Now that we have a way to process a single web site, we can crawl: */ 126 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/", 127 | "http://www.zeit.de"); 128 | /* Process the seed only: */ 129 | assertTrue(Crawler.crawl(seed, 0).size() == seed.size()); 130 | } 131 | 132 | /** Test the actual crawling, long-running sample. */ 133 | // @Test // (long-running task, comment in to run) 134 | public void crawlMore() throws InterruptedException { 135 | List seed = Arrays.asList("http://www.ub.uni-koeln.de/", 136 | "http://www.zeit.de"); 137 | /* Process seed and one level down: */ 138 | int linksPerSite = 5; // estimation: > 5 links / site 139 | assertTrue(Crawler.crawl(seed, 1).size() > seed.size() * linksPerSite); 140 | } 141 | 142 | /** A simple crawler that processes the seed concurrently. */ 143 | static class Crawler { 144 | public static List crawl(List seed, int depth) 145 | throws InterruptedException { 146 | /* 147 | * The result of crawling will be a list of web documents. To avoid 148 | * concurrent modification of the list, we use a synchronized wrapper: 149 | */ 150 | List result = Collections 151 | .synchronizedList(new ArrayList()); 152 | /* 153 | * We separate the unit of work (a Runnable) and the concurrent execution 154 | * (ExecutorService), cf. 
Effective Java, Second Edition, Chapter 10: 155 | */ 156 | ExecutorService exec = Executors.newCachedThreadPool(); // newFixedThreadPool(1); 157 | for (String url : seed) { 158 | /* For every seed URL we create and execute a runnable: */ 159 | exec.execute(new CrawlerRunnable(result, url, depth)); 160 | } 161 | /* We passed all work to be done: */ 162 | exec.shutdown(); 163 | /* Now running in the background - we don't want to go on, but wait: */ 164 | boolean done = exec.awaitTermination(5, TimeUnit.HOURS); 165 | /* Print some info on the result: */ 166 | System.out.printf("Crawled %s docs, in time: %s\n", result.size(), done); 167 | return result; 168 | } 169 | } 170 | 171 | /** A crawler runnable that crawls from a given starting point. */ 172 | static class CrawlerRunnable implements Runnable { 173 | private int depth; 174 | private String url; 175 | private List result; 176 | 177 | public CrawlerRunnable(List result, String url, int depth) { 178 | this.result = result; 179 | this.url = url; 180 | this.depth = depth; 181 | } 182 | 183 | @Override 184 | /* Top-level entry point (called by the executor service): */ 185 | public void run() { 186 | try { 187 | crawl(url, 0); // start crawling, and catch all that can go wrong here 188 | } catch (InterruptedException e) { 189 | e.printStackTrace(); 190 | } catch (SAXException e) { 191 | e.printStackTrace(); 192 | } 193 | } 194 | 195 | /* 196 | * The recursive crawling method: parse current page, add result, and if 197 | * below the depth limit, call itself with the outgoing links of the page. 
198 | */ 199 | private void crawl(final String url, final int current) 200 | throws InterruptedException, SAXException { 201 | WebDocument doc = null; 202 | try { 203 | doc = Parser.parse(url); 204 | } catch (IOException e) { 205 | System.out.println("Crawl error: " + e.getMessage()); 206 | } 207 | if (doc != null) { 208 | result.add(doc); 209 | System.out.println("Crawled: " + url); 210 | Thread.sleep(300); // delay for politeness (no server request flood) 211 | if (current < depth) { 212 | for (String link : doc.links) { 213 | crawl(link, current + 1); 214 | } 215 | } 216 | } 217 | } 218 | 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/spinfo/EditDistance.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | import org.junit.Test; 11 | 12 | /** Edit distance with recursion, memoization, and dynamic programming. */ 13 | public class EditDistance { 14 | 15 | /** Test the correctness of the different implementations. */ 16 | 17 | @Test 18 | public void correctness() { 19 | runResultTest(new RecursiveEditDistance()); 20 | runResultTest(new MemoizedEditDistance()); 21 | runResultTest(new DynamicProgrammingEditDistance()); 22 | } 23 | 24 | /** Test the performance of the different implementations. */ 25 | 26 | @Test 27 | public void performance() { 28 | runPerformanceTest(new DynamicProgrammingEditDistance()); 29 | runPerformanceTest(new MemoizedEditDistance()); 30 | // runPerformanceTest(new RecursiveEditDistance()); /* long-running */ 31 | } 32 | 33 | /** Edit distance interface: number of operations to change s1 into s2. 
*/ 34 | 35 | interface Edit { 36 | int distance(String s1, String s2); 37 | } 38 | 39 | /** Implementation based on simple recursion. */ 40 | 41 | static class RecursiveEditDistance implements Edit { 42 | private String s1; 43 | private String s2; 44 | 45 | @Override 46 | public int distance(final String s1, final String s2) { 47 | this.s1 = s1; 48 | this.s2 = s2; 49 | /* Overall problem: D(i,j) for i = |S1| and j = |S2|, i.e: */ 50 | return distance(s1.length(), s2.length()); 51 | } 52 | 53 | /* Distance of the first i chars in s1 to the first j chars in s2 */ 54 | protected int distance(final int i, final int j) { 55 | /* Uncomment to see redundant sub-solution computation: */ 56 | // System.out.println(String.format("Checking pair: %s, %s", i, j)); 57 | /* "Base Condition": d(0,j) is j and d(i,0) is i */ 58 | if (i == 0) { 59 | return j; 60 | } 61 | if (j == 0) { 62 | return i; 63 | } 64 | /* "Recurrence Relation" */ 65 | if (s1.charAt(i - 1) == s2.charAt(j - 1)) { 66 | return distance(i - 1, j - 1); 67 | } 68 | /* 69 | * For each edit x: three recursive descents, i.e. exp. runtime: O(3^x) 70 | */ 71 | int del = distance(i - 1, j) + 1; 72 | int ins = distance(i, j - 1) + 1; 73 | int rep = distance(i - 1, j - 1) + 1; 74 | return Math.min(del, Math.min(ins, rep)); 75 | } 76 | } 77 | 78 | /** Implementation based on memoized recursion. 
*/ 79 | 80 | static class MemoizedEditDistance extends RecursiveEditDistance { 81 | private Map map = new HashMap(); 82 | 83 | @Override 84 | public int distance(final String s1, final String s2) { 85 | map.clear(); // forget memoized solution for new pair of strings 86 | return super.distance(s1, s2); 87 | } 88 | 89 | @Override 90 | protected int distance(final int i, final int j) { 91 | String pair = i + ", " + j; 92 | /* 93 | * Only if we have not seen the pair before, we delegate to superclass: 94 | */ 95 | if (!map.containsKey(pair)) { 96 | map.put(pair, super.distance(i, j)); 97 | } 98 | return map.get(pair); // return the memoized sub-solution 99 | } 100 | } 101 | 102 | /** Implementation based on dynamic programming. */ 103 | 104 | static class DynamicProgrammingEditDistance implements Edit { 105 | @Override 106 | public int distance(final String s1, final String s2) { 107 | /* We fill each table cell once, i.e. quadratic runtime: O((i + 1) * (j + 1)) for i = |s1| and j = |s2| */ 108 | int[][] table = new int[s1.length() + 1][s2.length() + 1]; 109 | for (int i = 0; i < table.length; i++) { 110 | for (int j = 0; j < table[i].length; j++) { 111 | /* "Base Condition": d(0,j) is j and d(i,0) is i */ 112 | if (i == 0) { 113 | table[i][j] = j; 114 | } else if (j == 0) { 115 | table[i][j] = i; 116 | } else { 117 | int del = table[i - 1][j] + 1; 118 | int ins = table[i][j - 1] + 1; 119 | int rep = table[i - 1][j - 1] 120 | + (s1.charAt(i - 1) == s2.charAt(j - 1) ? 
0 : 1); 121 | table[i][j] = Math.min(del, Math.min(ins, rep)); 122 | } 123 | } 124 | } 125 | /* 126 | * After having started "bottom" at 0,0, at the end we are "up" (at the 127 | * position indicating the distance of the full strings, at the lower 128 | * right corner of the table) and have our result: D(i, j): 129 | */ 130 | return table[s1.length()][s2.length()]; 131 | } 132 | } 133 | 134 | private void runResultTest(final Edit distance) { 135 | assertEquals(2, distance.distance("ehe", "reh")); 136 | assertEquals(2, distance.distance("eber", "leder")); 137 | assertEquals(0, distance.distance("ehe", "ehe")); 138 | assertEquals(0, distance.distance("", "")); 139 | assertEquals(1, distance.distance("ehe", "eher")); 140 | assertEquals(2, distance.distance("he", "")); 141 | assertEquals(2, distance.distance("", "he")); 142 | assertEquals(0, distance.distance("rechtschaffen", "rechtschaffen")); 143 | } 144 | 145 | private void runPerformanceTest(final Edit distance) { 146 | System.out.print("Running performance test for: " 147 | + distance.getClass().getSimpleName() + "..."); 148 | long start = System.currentTimeMillis(); 149 | for (int i = 0; i < 50; i++) { 150 | distance.distance("nacktschnecke", "rechtschaffen"); 151 | } 152 | System.out.println(String.format(" %s ms.", System.currentTimeMillis() 153 | - start)); // typical result: 3, 200, 60000 ms. for dp, memo., 154 | // rec. 155 | } 156 | 157 | } -------------------------------------------------------------------------------- /src/spinfo/HashTables.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | /** Hash Tables: a useful, efficient data structure.
*/ 10 | public class HashTables { 11 | 12 | /* The basic idea: a direct-address table */ 13 | 14 | @Test 15 | public void direct() { 16 | DirectAddressTable t = new DirectAddressTable(); 17 | Person tom = new Person("tom"); 18 | Person jim = new Person("jim"); 19 | t.put(50, tom); // e.g. student ID = 50 20 | t.put(75, jim); // e.g. student ID = 75 21 | Assert.assertEquals(tom, t.get(50)); 22 | Assert.assertEquals(jim, t.get(75)); 23 | } 24 | 25 | static class DirectAddressTable { 26 | Object[] table = new Object[100]; // lots of space wasted 27 | 28 | public void put(int key, Object person) { // only numeric key supported 29 | table[key] = person; 30 | } 31 | 32 | public Object get(int key) { 33 | return table[key]; 34 | } 35 | } 36 | 37 | static class Person { 38 | 39 | private String name; 40 | 41 | public Person(String name) { 42 | this.name = name; 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return name; 48 | } 49 | } 50 | 51 | /* A simple hash table using chaining for collisions: */ 52 | 53 | @Test 54 | public void hashed() { 55 | HashTable t = new HashTable(); 56 | Person tom = new Person("tom"); 57 | Person jim = new Person("jim"); 58 | Person joe = new Person("joe"); 59 | t.put(50, tom); // e.g. student ID = 50 60 | t.put(75, jim); // e.g. student ID = 75 61 | t.put(85, joe); // e.g. 
student ID = 85, hashes to same as 75 here 62 | Assert.assertEquals(tom, t.get(50)); 63 | Assert.assertEquals(jim, t.get(75)); 64 | Assert.assertEquals(joe, t.get(85)); 65 | } 66 | 67 | static class HashTable { 68 | 69 | Element[] table = new Element[10]; // scale down 70 | 71 | static class Element { 72 | Element next; 73 | Object key; // key can be of any type 74 | Object value; 75 | 76 | public Element(Object key, Object value) { 77 | this.key = key; 78 | this.value = value; 79 | } 80 | } 81 | 82 | public void put(Object key, Object value) { 83 | Element newElement = new Element(key, value); 84 | int slot = hash(key); 85 | Element e = table[slot]; 86 | table[slot] = newElement; // place new element in table 87 | /* Handle previous element in the slot with a different key: */ 88 | if (e != null && !e.key.equals(newElement.key)) { 89 | newElement.next = e; // add new in front 90 | } 91 | } 92 | 93 | public Object get(Object key) { 94 | Element e = table[hash(key)]; 95 | if (e == null) // no value in slot 96 | return null; 97 | /* Find element with correct key in list: */ 98 | while (!(e.key.equals(key)) && e.next != null) { 99 | e = e.next; 100 | } 101 | return e.key.equals(key) ? 
e.value : null; 102 | } 103 | 104 | private int hash(Object key) { 105 | // simple demo hash: map key to table length 106 | if (key instanceof Integer) { 107 | return ((Integer) key) % table.length; 108 | } 109 | if (key instanceof String) { 110 | return ((String) key).length() % table.length; 111 | } 112 | return key.hashCode() % table.length; 113 | } 114 | } 115 | 116 | /* Hashing in practice: equality for custom objects */ 117 | 118 | @Test 119 | public void equality() { 120 | Student s1 = new Student(5, "John", "Doe"); 121 | Student s2 = new Student(8, "Jim", "Jones"); 122 | Student s3 = new Student(8, "Jim", "Jones"); 123 | Student s4 = new Student(5, "John", "Doe"); 124 | /* hashCode has to be implemented consistent with equals: */ 125 | Assert.assertEquals(s1, s4); 126 | Assert.assertEquals(s2, s3); 127 | Assert.assertEquals(s1.hashCode(), s4.hashCode()); 128 | Assert.assertEquals(s2.hashCode(), s3.hashCode()); 129 | Assert.assertFalse(s1.equals(s2)); 130 | Assert.assertFalse(s1.hashCode() == s2.hashCode()); 131 | } 132 | 133 | static class Student { 134 | int id; 135 | String first; 136 | String last; 137 | 138 | public Student(int id, String first, String last) { 139 | this.id = id; 140 | this.first = first; 141 | this.last = last; 142 | } 143 | 144 | @Override 145 | public String toString() { 146 | return String.format("%s %s (%s)", first, last, id); 147 | } 148 | 149 | @Override 150 | public int hashCode() { // use same values as in equals 151 | int result = 17; 152 | result = 31 * result + id; 153 | result = 31 * result + first.hashCode(); 154 | result = 31 * result + last.hashCode(); 155 | return result; 156 | } 157 | 158 | @Override 159 | public boolean equals(Object that) { // use same values as in hashCode 160 | return (that instanceof Student) && ((Student) that).id == this.id 161 | && ((Student) that).first.equals(this.first) 162 | && ((Student) that).last.equals(this.last); 163 | } 164 | } 165 | 166 | /* Hash table sample usage: counting words */ 
167 | 168 | @Test 169 | public void usage() { 170 | String text = "hi there hi everybody hi there again"; 171 | HashTable t = count(text); 172 | Assert.assertEquals(3, t.get("hi")); 173 | Assert.assertEquals(2, t.get("there")); 174 | Assert.assertEquals(1, t.get("everybody")); 175 | } 176 | 177 | private HashTable count(String text) { 178 | HashTable t = new HashTable(); 179 | String[] words = text.split(" "); 180 | for (String w : words) { 181 | Integer v = (Integer) t.get(w); 182 | if (v == null) // first occurrence 183 | v = 0; 184 | t.put(w, v + 1); // count up 185 | } 186 | return t; 187 | } 188 | } -------------------------------------------------------------------------------- /src/spinfo/Index.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static java.util.Arrays.asList; 6 | import static org.junit.Assert.assertEquals; 7 | import static org.junit.Assert.assertTrue; 8 | 9 | import java.io.File; 10 | import java.io.FileNotFoundException; 11 | import java.io.IOException; 12 | import java.net.MalformedURLException; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.Collections; 16 | import java.util.Comparator; 17 | import java.util.HashMap; 18 | import java.util.Iterator; 19 | import java.util.List; 20 | import java.util.Map; 21 | import java.util.Scanner; 22 | import java.util.Set; 23 | import java.util.SortedSet; 24 | import java.util.TreeSet; 25 | import java.util.regex.Matcher; 26 | import java.util.regex.Pattern; 27 | 28 | import org.junit.Assert; 29 | import org.junit.Test; 30 | 31 | /** 32 | * Basic indexing and preprocessing with regular expressions. 
Requires file 33 | * "pg100.txt", The Complete Works of William Shakespeare 34 | * (http://www.gutenberg.org/ebooks/100.txt.utf8) 35 | */ 36 | public class Index { 37 | 38 | /* Before indexing, we need to determine what elements to index. */ 39 | 40 | private static final Preprocessor PREPROCESSOR = new Preprocessor(); 41 | 42 | @Test 43 | public void tokenization() { 44 | assertEquals(asList("hello", "world"), process("hello, world!")); 45 | assertEquals(asList("123", "test"), process("test 123, 123 test, test")); 46 | assertEquals(asList("0221-123123", "test"), process("0221-123123, test")); 47 | assertEquals(asList("123", "köln", "test"), process("test - köln - 123")); 48 | } 49 | 50 | private List process(String string) { 51 | // some wrapping for the tests (compare with sorted list) 52 | return new ArrayList(new TreeSet( 53 | PREPROCESSOR.tokenize(string))); 54 | } 55 | 56 | @Test 57 | public void patterns() { 58 | assertTrue("0221-470".matches(SpecialCase.COMPOUND.regex)); 59 | assertTrue(!"Meine Nummer: 0221-470.".matches(SpecialCase.COMPOUND.regex)); 60 | assertTrue(!"4711".matches(SpecialCase.COMPOUND.regex)); 61 | assertTrue(!"Daimler-Benz".matches(SpecialCase.COMPOUND.regex)); 62 | assertTrue("8.04".matches(SpecialCase.COMPOUND.regex)); 63 | assertTrue("15:10".matches(SpecialCase.COMPOUND.regex)); 64 | assertTrue("3,50".matches(SpecialCase.COMPOUND.regex)); 65 | assertTrue("fabian.steeg@uni-koeln.de".matches(SpecialCase.EMAIL.regex)); 66 | assertTrue("fsteeg@spinfo.uni-koeln.de".matches(SpecialCase.EMAIL.regex)); 67 | assertTrue(!"fabian@home".matches(SpecialCase.EMAIL.regex)); 68 | } 69 | 70 | /* 71 | * Available patterns for extraction. Uses enum instead of constants to 72 | * iterate over all patterns in constructor of Preprocessor. 
73 | */ 74 | enum SpecialCase { 75 | /* Phone (0221-4701751), versions (8.04), money (3,50) and time (15:15) */ 76 | COMPOUND("\\d+[-.,:]\\d+"), 77 | /* Simple numbers */ 78 | NUMBER("\\d+"), 79 | /* Some simple email adresses */ 80 | EMAIL("[^@\\s]+@.+?\\.(de|com|eu|org|net)"); 81 | 82 | String regex; 83 | 84 | SpecialCase(final String regularExpression) { 85 | this.regex = regularExpression; 86 | } 87 | } 88 | 89 | /** 90 | * A preprocessor based on regular expressions: first extracts custom 91 | * patterns, then splits on a given delimiter. 92 | */ 93 | static class Preprocessor { 94 | /* Unicode-aware "non-letter" delimiter, ASCII version is \\W */ 95 | private static final String UNICODE_AWARE_DELIMITER = "[^\\p{L}]"; 96 | private List specialCases = new ArrayList(); 97 | private String delimiter; 98 | 99 | public Preprocessor() { 100 | delimiter = UNICODE_AWARE_DELIMITER; 101 | for (SpecialCase p : SpecialCase.values()) { 102 | specialCases.add(p); 103 | } 104 | } 105 | 106 | public List tokenize(final String input) { 107 | String text = input.toLowerCase(); 108 | List result = new ArrayList(); 109 | text = extractSpecialCases(text, result); 110 | tokenizeStandard(text, result); 111 | return result; 112 | } 113 | 114 | private String extractSpecialCases(String text, List result) { 115 | for (SpecialCase p : specialCases) { 116 | Pattern pattern = Pattern.compile(p.regex); 117 | Matcher matcher = pattern.matcher(text); 118 | while (matcher.find()) { 119 | String group = matcher.group(); 120 | result.add(group); // add special case 121 | text = text.replace(group, ""); // don't treat group as regex 122 | } 123 | } 124 | return text; 125 | } 126 | 127 | private void tokenizeStandard(String text, List result) { 128 | List list = Arrays.asList(text.split(delimiter)); 129 | for (String s : list) 130 | if (s.trim().length() > 0) // filter empty strings 131 | result.add(s.trim()); 132 | } 133 | 134 | } 135 | 136 | /* Once we can preprocess our corpus, we can build an 
index and search it: */ 137 | 138 | private static final InvertedIndex INDEX = buildIndex(); 139 | 140 | /** Test searching the corpus for a single term. */ 141 | @Test 142 | public final void testSearch() throws MalformedURLException, IOException { 143 | long start = System.currentTimeMillis(); 144 | String query = "Brutus"; 145 | Set list = INDEX.search(query); 146 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list, 147 | (System.currentTimeMillis() - start)); 148 | Assert.assertTrue("Search should find a single term", list.size() > 0); 149 | } 150 | 151 | /** Test searching the corpus for multiple search terms. */ 152 | @Test 153 | public final void testMulti() throws MalformedURLException, IOException { 154 | long start = System.currentTimeMillis(); 155 | String query = "Brutus Caesar"; // = Brutus AND Caesar 156 | Set list = INDEX.search(query); 157 | System.out.printf("Result for '%s': %s, took %s ms.\n", query, list, 158 | (System.currentTimeMillis() - start)); 159 | Assert.assertTrue("Search should find multiple terms", list.size() > 0); 160 | } 161 | 162 | static class InvertedIndex { 163 | 164 | private Map> index = new HashMap>(); 165 | 166 | public InvertedIndex(final List corpus) { 167 | index = index(corpus); 168 | } 169 | 170 | private Map> index(final List works) { 171 | Map> index = new HashMap>(); 172 | // for each document, and each of its token, add it to the index 173 | for (int i = 0; i < works.size(); i++) { 174 | List tokens = PREPROCESSOR.tokenize(works.get(i)); 175 | for (String token : tokens) { 176 | SortedSet postings = index.get(token); 177 | if (postings == null) { // first time 178 | postings = new TreeSet(); 179 | index.put(token, postings); 180 | } 181 | postings.add(i); // document i contains token 182 | } 183 | } 184 | return index; 185 | } 186 | 187 | public Set search(final String query) { 188 | /* We treat all entries as AND-linked... 
*/ 189 | List queries = PREPROCESSOR.tokenize(query); 190 | /* We get the results for each query term: */ 191 | List> allPostings = new ArrayList>(); 192 | for (String q : queries) { 193 | SortedSet postings = index.get(q); 194 | if (postings != null) 195 | allPostings.add(postings); 196 | } 197 | /* For efficient intersection computation: sort lists by length */ 198 | sortByLength(allPostings); 199 | /* Intersection of postings for all query terms is our result: */ 200 | return intersectionOf(allPostings); 201 | } 202 | 203 | private void sortByLength(List> all) { 204 | Collections.sort(all, new Comparator>() { 205 | public int compare(final SortedSet o1, 206 | final SortedSet o2) { 207 | return Integer.valueOf(o1.size()).compareTo(o2.size()); 208 | } 209 | }); 210 | } 211 | 212 | private Set intersectionOf(List> all) { 213 | /* The result set is the intersection of the first list with all others: */ 214 | SortedSet result = all.get(0); 215 | for (SortedSet set : all.subList(1, all.size())) { 216 | result = intersection(result.iterator(), set.iterator()); 217 | } 218 | return result; 219 | } 220 | 221 | } 222 | 223 | /* Implementation and tests for the intersection algorithm: */ 224 | 225 | @Test 226 | public void intersection() { 227 | /* Test intersection computation for AND-queries: */ 228 | TreeSet PL1 = new TreeSet(Arrays.asList(4, 3, 2, 1)); 229 | TreeSet PL2 = new TreeSet(Arrays.asList(2, 4, 6, 8)); 230 | Assert.assertEquals(Arrays.asList(2, 4), new ArrayList( 231 | intersection(PL1.iterator(), PL2.iterator()))); 232 | } 233 | 234 | public static SortedSet intersection(final Iterator i1, 235 | final Iterator i2) { 236 | SortedSet result = new TreeSet(); 237 | Integer p1 = next(i1); 238 | Integer p2 = next(i2); 239 | while (p1 != null && p2 != null) { 240 | if (p1.equals(p2)) { 241 | result.add(p1); 242 | p1 = next(i1); 243 | p2 = next(i2); 244 | } else if (p1 < p2) 245 | p1 = next(i1); 246 | else 247 | p2 = next(i2); 248 | } 249 | return result; 250 | } 251 
| 252 | /* A little oddity to stay close to Manning et al. 2008, p. 11: */ 253 | private static Integer next(final Iterator i1) { 254 | return i1.hasNext() ? i1.next() : null; 255 | } 256 | 257 | /* Utilities: load data, build index: */ 258 | 259 | private static InvertedIndex buildIndex() { 260 | List corpus = corpus(); 261 | long start = System.currentTimeMillis(); 262 | System.out.printf("Building index for %s texts... ", corpus.size()); 263 | InvertedIndex invertedIndex = new InvertedIndex(corpus); 264 | System.out 265 | .printf("done, took %s ms.\n", System.currentTimeMillis() - start); 266 | return invertedIndex; 267 | } 268 | 269 | private static List corpus() { 270 | try { 271 | Scanner s = new Scanner(new File("pg100.txt"), "UTF-8"); 272 | StringBuilder builder = new StringBuilder(); 273 | while (s.hasNextLine()) { 274 | builder.append(s.nextLine()).append("\n"); 275 | } 276 | /* Each work is delimited by a line ending with a year: */ 277 | return Arrays.asList(builder.toString().split("1[56][0-9]{2}\n")); 278 | } catch (FileNotFoundException e) { 279 | e.printStackTrace(); 280 | } 281 | return Collections.emptyList(); 282 | } 283 | 284 | } -------------------------------------------------------------------------------- /src/spinfo/Lists.java: -------------------------------------------------------------------------------- 1 | /** Copyright 2011 Fabian Steeg, University of Cologne, http://github.com/spinfo */ 2 | 3 | package spinfo; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | import java.util.Iterator; 8 | import java.util.NoSuchElementException; 9 | 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | /** Lists: elementary data structures. */ 14 | public class Lists { 15 | 16 | /** Low-level, non-OOP list implementation simulating a tuple/record/struct. 
*/ 17 | @Test 18 | public void tuple() { 19 | /* Independent nodes: */ 20 | Object[] first = new Object[2]; 21 | Object[] second = new Object[2]; 22 | Object[] third = new Object[2]; 23 | /* Containing values: */ 24 | first[0] = "first"; 25 | second[0] = "second"; 26 | third[0] = "third"; 27 | /* Linked with pointers: */ 28 | first[1] = second; 29 | second[1] = third; 30 | /* Can be traversed: */ 31 | System.out.println("List traversal: "); 32 | Object[] current = first; 33 | while (current != null) { 34 | System.out.println(current[0]); 35 | current = (Object[]) current[1]; 36 | } 37 | } 38 | 39 | /** OOP implementation of a queue, a FIFO list (first in, first out). */ 40 | @Test 41 | public void queue() { 42 | Queue queue = new Queue(); 43 | /* Enqueue at end: */ 44 | queue.enqueue("first"); 45 | queue.enqueue("second"); 46 | queue.enqueue("third"); 47 | /* Iterate: */ 48 | System.out.println("Queue traversal: "); 49 | Node current = queue.first; 50 | while (current != null) { 51 | System.out.println(current.value); 52 | current = current.next; 53 | } 54 | /* Dequeue from front: */ 55 | assertEquals("first", queue.dequeue()); 56 | assertEquals("second", queue.dequeue()); 57 | assertEquals("third", queue.dequeue()); 58 | assertEquals(null, queue.dequeue()); 59 | } 60 | 61 | /** A list element: wraps a value and a reference to the next element. */ 62 | static class Node { 63 | Object value; 64 | Node next; 65 | 66 | Node(Object value) { 67 | this.value = value; 68 | } 69 | } 70 | 71 | /** The queue class enforces the restricted FIFO access. */ 72 | static class Queue /**/implements Iterable