├── .gitignore ├── Procfile ├── README.md ├── pom.xml ├── src └── main │ ├── java │ ├── com │ │ └── mohaps │ │ │ └── tldr │ │ │ ├── Main.java │ │ │ ├── SummarizedFeed.java │ │ │ ├── SummarizedFeedEntry.java │ │ │ ├── Summary.java │ │ │ ├── TLDRServlet.java │ │ │ ├── summarize │ │ │ ├── Defaults.java │ │ │ ├── Factory.java │ │ │ ├── IStopWords.java │ │ │ ├── ISummarizer.java │ │ │ ├── ITokenizer.java │ │ │ ├── OpenNLPTokenizer.java │ │ │ ├── RegExTokenizer.java │ │ │ ├── StopWords.java │ │ │ ├── Summarizer.java │ │ │ └── SummaryCache.java │ │ │ └── utils │ │ │ ├── Feeds.java │ │ │ ├── Pages.java │ │ │ ├── PorterStemmer.java │ │ │ └── Words.java │ └── org │ │ └── tartarus │ │ └── snowball │ │ ├── Among.java │ │ ├── SnowballProgram.java │ │ ├── SnowballStemmer.java │ │ └── ext │ │ ├── danishStemmer.java │ │ ├── dutchStemmer.java │ │ ├── englishStemmer.java │ │ ├── finnishStemmer.java │ │ ├── frenchStemmer.java │ │ ├── germanStemmer.java │ │ ├── hungarianStemmer.java │ │ ├── italianStemmer.java │ │ ├── norwegianStemmer.java │ │ ├── porterStemmer.java │ │ ├── portugueseStemmer.java │ │ ├── romanianStemmer.java │ │ ├── russianStemmer.java │ │ ├── spanishStemmer.java │ │ ├── swedishStemmer.java │ │ └── turkishStemmer.java │ └── webapp │ ├── WEB-INF │ ├── en-sent.bin │ ├── en-token.bin │ └── web.xml │ ├── apple-touch-icon.png │ ├── css │ ├── main.css │ └── normalize.css │ ├── favicon.ico │ ├── feed_summary.jsp │ ├── images │ └── tldrzr_logo_header.png │ ├── index.html │ ├── js │ ├── main.js │ ├── plugins.js │ └── vendor │ │ └── modernizr-2.6.2.min.js │ └── text_summary.jsp └── system.properties /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | .springBeans 5 | target 6 | .DS_Store 7 | 8 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: java $JAVA_OPTS 
-cp target/classes:target/dependency/* com.mohaps.tldr.Main 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TL;DRzr - A simple algorithmic summarizer 2 | Website: https://tldrzr.herokuapp.com 3 | Author: Saurav Mohapatra (mohaps@gmail.com) 4 | 5 | Copyright (c) 2013, Saurav Mohapatra 6 | All rights reserved. 7 | 8 | ## License 9 | 10 | Redistribution and use in source and binary forms, with or without modification, are permitted 11 | provided that the following conditions are met: 12 | 13 | a) Redistributions of source code must retain the above copyright notice, 14 | this list of conditions and the following disclaimer. 15 | 16 | b) Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimer in the documentation 18 | and/or other materials provided with the distribution. 19 | 20 | c) Neither the name of TL;DRzr nor the names of its contributors may be used 21 | to endorse or promote products derived from this software without specific 22 | prior written permission. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 25 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 26 | SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | ## Introduction 32 | 33 | TL;DRzr (Pronounced as _tee-el-dee-rai-zer_) is a simple algorithmic summary generator written in Java. It's deployed as a Heroku app accessible at https://tldrzr.herokuapp.com 34 | 35 | I wrote this as a weekend hack / fun project. Feel free to use the code as you please. Would love a mention or a line back at mohaps AT gmail DOT com if you find this useful. My twitter is [@mohaps](https://twitter.com/mohaps) and my blog is at http://mohaps.com 36 | 37 | I'm currently working on the v2 of this project with better extraction, modularization and updates to use Java 8 features. 38 | 39 | Join the Hacker News Discussion About TL;DRzr : https://news.ycombinator.com/item?id=5523538 40 | 41 | Hacker News Post about the source code release : https://news.ycombinator.com/item?id=5535827 42 | 43 | 44 | ## Prerequisites (for Build) 45 | 46 | JDK 1.6+ 47 | Apache Maven 2.x+ 48 | 49 | ## Running the application locally 50 | 51 | First build with: 52 | 53 | $mvn clean install 54 | 55 | Then run it with: 56 | 57 | $java -cp target/classes:target/dependency/* com.mohaps.tldr.Main 58 | 59 | 60 | ## How does it work? 61 | 62 | TL;DRzr uses an algorithm derived from Classifier4J. I used the basic algorithm from Classifier4J, optimized it and added some refinements. 63 | 64 | The basic algorithm for summarization is like this. It first tokenizes the text into words and then calculates the top N most frequent words (discarding stop words and single-occurrence words). It then scans the sentences and gets the first N sentences which feature any or all of the most frequent words. The sentences are sorted based on first occurrence in the original text and concatenated to create the summary. The user has control over how long the generated summary should be in terms of sentence count. 
65 | 66 | For implementation details a good starting point is the [Summarizer](https://github.com/mohaps/tldrzr/blob/master/src/main/java/com/mohaps/tldr/summarize/Summarizer.java) class. 67 | 68 | TL;DRzr is written in Java and uses Jsoup for HTML text scraping and ROME for RSS feed parsing (which depends on JDOM). The parsing of sentences and word tokenization uses OpenNLP. It uses the Porter2 stemmer algorithm (the Snowball stemmer sources bundled under `org.tartarus.snowball`) to process the tokens emitted by the tokenizer. The new "summarize any URL" feature uses BoilerPipe. 69 | 70 | ## Credits 71 | 72 | TL;DRzr is a weekend project/quick hack demo created by Saurav Mohapatra. I wrote this as a fun weekend hack after reading about the Summly acquisition by Yahoo!. I had drunk too many Red Bulls and sleep was not too forthcoming. :) I always wished to try out Heroku and after a couple of hours of googling + coding, I put this together. 73 | 74 | The algorithm is a keyword density based one. As this is my current hobby project, I shall work on improving the algorithm. I plan on open-sourcing this codebase on GitHub. 
75 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 36 | 38 | 4.0.0 39 | 1.0-SNAPSHOT 40 | TLDRizer 41 | jar 42 | 43 | 44 | 1.6 45 | UTF-8 46 | 47 | 48 | 49 | 50 | 51 | javax.servlet 52 | servlet-api 53 | 2.5 54 | 55 | 56 | jstl 57 | jstl 58 | 1.2 59 | 60 | 61 | 62 | 63 | org.eclipse.jetty 64 | jetty-servlet 65 | 7.6.0.v20120127 66 | 67 | 68 | org.eclipse.jetty 69 | jetty-webapp 70 | 7.6.0.v20120127 71 | 72 | 73 | org.mortbay.jetty 74 | jsp-2.1-glassfish 75 | 2.1.v20100127 76 | 77 | 78 | 79 | org.jsoup 80 | jsoup 81 | 1.7.2 82 | 83 | 84 | 85 | rome 86 | rome 87 | 1.0 88 | 89 | 90 | 91 | com.google.code.gson 92 | gson 93 | 2.2.2 94 | 95 | 96 | 97 | org.apache.opennlp 98 | opennlp-tools 99 | 1.5.2-incubating 100 | 101 | 102 | 103 | de.l3s.boilerpipe 104 | boilerpipe 105 | 1.1.0 106 | 107 | 108 | 109 | xerces 110 | xerces 111 | 2.4.0 112 | 113 | 114 | 115 | net.sourceforge.nekohtml 116 | nekohtml 117 | 1.9.18 118 | 119 | 125 | 126 | 127 | 128 | maven2-repository.dev.java.net 129 | Java.net Repository for Maven 130 | http://download.java.net/maven/2/ 131 | default 132 | 133 | 134 | repo1.maven.org 135 | OpenNLP repository 136 | http://repo1.maven.org/maven2/org/apache/opennlp/ 137 | default 138 | 139 | 140 | 141 | 142 | 143 | org.apache.maven.plugins 144 | maven-dependency-plugin 145 | 2.4 146 | 147 | 148 | copy-dependencies 149 | package 150 | 151 | copy-dependencies 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | org.eclipse.m2e 161 | lifecycle-mapping 162 | 1.0.0 163 | 164 | 165 | 166 | 167 | 168 | org.apache.maven.plugins 169 | maven-dependency-plugin 170 | [2.0,) 171 | 172 | copy-dependencies 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | com.mohaps 187 | tldrzr 188 | A web application that generates algorithmic summaries for english text. 
189 | 190 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/Main.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr; 34 | 35 | import org.eclipse.jetty.server.Server; 36 | import org.eclipse.jetty.webapp.WebAppContext; 37 | 38 | /** 39 | * 40 | * This class launches the web application in an embedded Jetty container. 41 | * This is the entry point to your application. The Java command that is used for 42 | * launching should fire this main method. 43 | * 44 | */ 45 | public class Main { 46 | 47 | /** 48 | * @param args 49 | */ 50 | public static void main(String[] args) throws Exception{ 51 | String webappDirLocation = "src/main/webapp/"; 52 | 53 | //The port that we should run on can be set into an environment variable 54 | //Look for that variable and default to 8080 if it isn't there. 55 | String webPort = System.getenv("PORT"); 56 | if(webPort == null || webPort.isEmpty()) { 57 | webPort = "8080"; 58 | } 59 | 60 | Server server = new Server(Integer.valueOf(webPort)); 61 | WebAppContext root = new WebAppContext(); 62 | 63 | root.setContextPath("/"); 64 | root.setDescriptor(webappDirLocation+"/WEB-INF/web.xml"); 65 | root.setResourceBase(webappDirLocation); 66 | 67 | //Parent loader priority is a class loader setting that Jetty accepts. 68 | //By default Jetty will behave like most web containers in that it will 69 | //allow your application to replace non-server libraries that are part of the 70 | //container. Setting parent loader priority to true changes this behavior. 
71 | //Read more here: http://wiki.eclipse.org/Jetty/Reference/Jetty_Classloading 72 | root.setParentLoaderPriority(true); 73 | 74 | server.setHandler(root); 75 | 76 | server.start(); 77 | server.join(); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/SummarizedFeed.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr; 34 | 35 | import java.util.List; 36 | 37 | /** 38 | * A helper class representing a summarized for easy JSP rendering 39 | * @author mohaps 40 | * 41 | */ 42 | public class SummarizedFeed { 43 | public SummarizedFeed(String url, List entries, long millis) { 44 | this.url = url; 45 | this.entries = entries; 46 | this.millis = millis; 47 | } 48 | private String url; 49 | private List entries; 50 | private long millis; 51 | public String getUrl() { 52 | return url; 53 | } 54 | public void setUrl(String url) { 55 | this.url = url; 56 | } 57 | public List getEntries() { 58 | return entries; 59 | } 60 | public void setEntries(List entries) { 61 | this.entries = entries; 62 | } 63 | public long getMillis() { 64 | return millis; 65 | } 66 | public void setMillis(long millis) { 67 | this.millis = millis; 68 | } 69 | public int getItemCount() { 70 | return this.entries.size(); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/SummarizedFeedEntry.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr; 34 | 35 | import java.util.Set; 36 | 37 | /** 38 | * An entry in a summarized feed for easy JSP rendering 39 | * @author mohaps 40 | * 41 | */ 42 | public class SummarizedFeedEntry { 43 | 44 | public SummarizedFeedEntry(String title, String author, String link, 45 | String text, String summary, Set keywords) { 46 | this.title = title; 47 | this.author = author; 48 | this.link = link; 49 | this.text = text; 50 | this.summary = summary; 51 | this.keywords = keywords; 52 | } 53 | private String title; 54 | private String author; 55 | private String link; 56 | private String text; 57 | private String summary; 58 | private Set keywords; 59 | public String getTitle() { 60 | return title; 61 | } 62 | public void setTitle(String title) { 63 | this.title = title; 64 | } 65 | public String getAuthor() { 66 | return author; 67 | } 68 | public void setAuthor(String author) { 69 | this.author = author; 70 | } 71 | public String getLink() { 72 | return link; 73 | } 74 | public void setLink(String link) { 75 | this.link = link; 76 | } 77 | public String getText() { 78 | return text; 79 | } 80 | public void setText(String text) { 81 | this.text = text; 82 | } 83 | public String getSummary() { 84 | return summary; 85 | } 86 | public void setSummary(String summary) { 87 | this.summary = summary; 88 | } 89 | 90 | public Set getKeywords(){ return this.keywords; } 91 | public void setKeywords(){ this.keywords = keywords; } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/Summary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/**
 * Bean pairing a piece of input text with the summary generated for it, so a
 * JSP can render both.
 *
 * Also records the requested sentence count and an elapsed time in
 * milliseconds supplied by the caller.
 *
 * @author mohaps
 */
public class Summary {

    private String original;
    private String summary;
    private int sentence_count;
    private long millis;

    /** Creates an empty summary; populate it through the setters. */
    public Summary() {
    }

    /**
     * @param text           the original input text
     * @param summary        the generated summary
     * @param sentence_count the sentence count associated with the summary
     * @param millis         elapsed time in milliseconds
     */
    public Summary(String text, String summary, int sentence_count, long millis) {
        this.millis = millis;
        this.sentence_count = sentence_count;
        this.summary = summary;
        this.original = text;
    }

    /** @return the original input text */
    public String getOriginal() {
        return this.original;
    }

    public void setOriginal(String text) {
        this.original = text;
    }

    /** @return the generated summary */
    public String getSummary() {
        return this.summary;
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public int getSentence_count() {
        return this.sentence_count;
    }

    public void setSentence_count(int sentence_count) {
        this.sentence_count = sentence_count;
    }

    public long getMillis() {
        return this.millis;
    }

    public void setMillis(long millis) {
        this.millis = millis;
    }

    /** Renders only the original text and the summary. */
    @Override
    public String toString() {
        StringBuilder rendered = new StringBuilder("Summary [text=");
        rendered.append(this.original);
        rendered.append(", summary=");
        rendered.append(this.summary);
        rendered.append("]");
        return rendered.toString();
    }
}
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr; 34 | 35 | import java.io.IOException; 36 | import java.util.ArrayList; 37 | import java.util.List; 38 | import java.util.Set; 39 | import java.util.logging.Logger; 40 | import javax.servlet.ServletConfig; 41 | import javax.servlet.ServletException; 42 | import javax.servlet.ServletOutputStream; 43 | import javax.servlet.http.HttpServlet; 44 | import javax.servlet.http.HttpServletRequest; 45 | import javax.servlet.http.HttpServletResponse; 46 | 47 | import org.jsoup.Jsoup; 48 | 49 | import com.mohaps.tldr.summarize.Defaults; 50 | import com.mohaps.tldr.summarize.Factory; 51 | import com.mohaps.tldr.utils.Feeds; 52 | import com.mohaps.tldr.utils.Words; 53 | import com.mohaps.tldr.utils.Feeds.Item; 54 | 55 | import com.google.gson.stream.JsonWriter; 56 | 57 | /** 58 | * The web endpoint for TL;DRzr service/API 59 | * @author mohaps 60 | * 61 | */ 62 | public class TLDRServlet extends HttpServlet { 63 | 64 | 65 | /** 66 | * 67 | */ 68 | private static final long serialVersionUID = 1L; 69 | 70 | @Override 71 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 72 | throws ServletException, IOException { 73 | 74 | String pathInfo = req.getPathInfo(); 75 | if (pathInfo == null || pathInfo.length() == 0 || pathInfo.equals("/")) { 76 | String feedUrl = req.getParameter("feed_url"); 77 | if (feedUrl != null && feedUrl.length() > 0) { 78 | summarizeFeedUrl(feedUrl, req, resp); 79 | } else { 80 | showHomePage(req, resp); 81 | } 82 | } else { 83 | resp.sendError(404, "Path " + pathInfo + " not found"); 84 | } 85 | } 86 | 87 | @Override 88 | protected void doPost(HttpServletRequest req, HttpServletResponse resp) 89 | throws ServletException, IOException { 90 | String pathInfo = req.getPathInfo(); 91 | if (pathInfo != null && pathInfo.startsWith("/text")) { 92 | handleSummarizeTextCall(req, resp); 93 | } else if (pathInfo != null && pathInfo.startsWith("/feed")) { 94 | handleSummarizeFeedCall(req, resp); 95 | } else if 
(pathInfo != null && pathInfo.startsWith("/api")) { 96 | handleAPICall(req, resp); 97 | } else { 98 | resp.sendError(404, "could not locate POST endpoint " + pathInfo); 99 | } 100 | } 101 | 102 | // show the home page 103 | protected void showHomePage(HttpServletRequest req, HttpServletResponse resp) 104 | throws ServletException, IOException { 105 | resp.sendRedirect("/"); 106 | } 107 | 108 | // handle an API call (currently it's just a post to /tldr/api/summarize that takes input_text and sentence_count (default: 5) 109 | protected void handleAPICall(HttpServletRequest req, 110 | HttpServletResponse resp) throws ServletException, IOException { 111 | if(req.getContentLength() > Defaults.MAX_API_INPUT_LENGTH) { 112 | resp.sendError(400, "Request is over API limit of "+Defaults.MAX_API_INPUT_LENGTH+" bytes"); 113 | return; 114 | } 115 | String pathInfo = req.getPathInfo(); 116 | if (pathInfo.startsWith("/api/summarize")) { 117 | String inputText = req.getParameter("input_text"); 118 | int sentenceCount = Integer.parseInt(req 119 | .getParameter("sentence_count")); 120 | if (sentenceCount <= 0) { 121 | sentenceCount = Defaults.MAX_SENTENCES; 122 | } 123 | String summaryText = null; 124 | 125 | long start = System.currentTimeMillis(); 126 | long millis = 0; 127 | try { 128 | if(inputText == null || inputText.length() == 0){ summaryText = ""; } 129 | else { 130 | summaryText = Factory.getSummarizer().summarize(inputText, 131 | sentenceCount); 132 | } 133 | } catch (Exception ex) { 134 | throw new IOException("Failed to summarize", ex); 135 | } finally { 136 | millis = System.currentTimeMillis() - start; 137 | } 138 | resp.setContentType("application/json"); 139 | // serialize out as json 140 | JsonWriter writer = new JsonWriter(resp.getWriter()); 141 | writer.beginObject(); 142 | writer.name("summary_text").value(summaryText); 143 | writer.name("time_taken_millis").value(millis); 144 | writer.endObject(); 145 | writer.flush(); 146 | writer.close(); 147 | 
resp.getWriter().flush(); 148 | resp.getWriter().close(); 149 | 150 | } else { 151 | resp.sendError(404, "API endpoint " + pathInfo + " not found!"); 152 | } 153 | } 154 | 155 | // summarize a feed from a url (if the url is non feed, then try to extract article text) 156 | protected void handleSummarizeFeedCall(HttpServletRequest req, 157 | HttpServletResponse resp) throws ServletException, IOException { 158 | 159 | String feedUrl = req.getParameter("feed_url"); 160 | summarizeFeedUrl(feedUrl, req, resp); 161 | } 162 | 163 | 164 | // summarize a feed from a url (if the url is non feed, then try to extract article text) 165 | protected void summarizeFeedUrl(String feedUrl, HttpServletRequest req, 166 | HttpServletResponse resp) throws ServletException, IOException { 167 | 168 | String contentType = Feeds.getContentType(feedUrl); 169 | if (contentType != null 170 | && (contentType.startsWith("text/html") || contentType 171 | .startsWith("text/plain"))) { 172 | summarizePageText(feedUrl, req, resp); 173 | } else { 174 | String scStr = req.getParameter("sentence_count"); 175 | int sentenceCount = scStr == null?Defaults.MAX_SENTENCES:Integer.parseInt(scStr); 176 | if(sentenceCount <= 0){ sentenceCount = Defaults.MAX_SENTENCES; } 177 | List entries = new ArrayList(); 178 | long start = System.currentTimeMillis(); 179 | long millis = 0; 180 | try { 181 | List feedItems = Feeds.fetchFeedItems(feedUrl); 182 | for (Item item : feedItems) { 183 | String summary = Factory.getSummarizer().summarize( 184 | item.getText(), sentenceCount); 185 | // TODO: (mohaps) hook up keywords after stem word fix (currently working on this) 186 | // might have to wait till I get topic modelling integrated till I turn this back on 187 | 188 | //Set keywords = Factory.getSummarizer().keywords(item.getText(), 10); 189 | Set keywords = null; 190 | SummarizedFeedEntry entry = new SummarizedFeedEntry( 191 | item.getTitle(), item.getAuthor(), item.getLink(), 192 | item.getText(), summary, keywords); 
193 | entries.add(entry); 194 | } 195 | } catch (Exception ex) { 196 | throw new IOException("Failed to summarize feed url : " 197 | + feedUrl, ex); 198 | } finally { 199 | millis = System.currentTimeMillis() - start; 200 | } 201 | SummarizedFeed summarizedFeed = new SummarizedFeed(feedUrl, 202 | entries, millis); 203 | req.setAttribute("summarized_feed", summarizedFeed); 204 | req.getRequestDispatcher("/feed_summary.jsp").forward(req, resp); 205 | } 206 | } 207 | 208 | 209 | // summarize a page text from a url 210 | protected void summarizePageText(String pageUrl, HttpServletRequest req, 211 | HttpServletResponse resp) throws ServletException, IOException { 212 | try { 213 | String inputText = Feeds.extractPageBodyText(pageUrl); 214 | String scStr = req.getParameter("sentence_count"); 215 | int sentenceCount = scStr == null?5:Integer.parseInt(scStr); 216 | if(sentenceCount == 0){ sentenceCount = 5; } 217 | summarizeText(pageUrl, inputText, sentenceCount, req, resp); 218 | } catch (Exception ex) { 219 | resp.sendError(500, "Failed to get text from : " + pageUrl 220 | + " error=" + ex.getLocalizedMessage()); 221 | } 222 | } 223 | 224 | // summarize supplied text 225 | protected void summarizeText(String inputUrl, String inputText, int sentenceCount, 226 | HttpServletRequest req, HttpServletResponse resp) 227 | throws ServletException, IOException { 228 | 229 | String summaryText = null; 230 | long start = System.currentTimeMillis(); 231 | long millis = 0; 232 | try { 233 | summaryText = Factory.getSummarizer().summarize(inputText, 234 | sentenceCount); 235 | } catch (Exception ex) { 236 | throw new IOException("Failed to summarize", ex); 237 | } finally { 238 | millis = System.currentTimeMillis() - start; 239 | } 240 | Summary summary = new Summary(inputText, summaryText, 241 | sentenceCount, millis); 242 | req.setAttribute("summary", summary); 243 | if(inputUrl != null) { 244 | req.setAttribute("summary_url", inputUrl); 245 | } 246 | 
req.getRequestDispatcher("/text_summary.jsp").forward(req, resp); 247 | } 248 | 249 | // handle a summarize text call from the web 250 | protected void handleSummarizeTextCall(HttpServletRequest req, 251 | HttpServletResponse resp) throws ServletException, IOException { 252 | 253 | String inputText = req.getParameter("input_text"); 254 | int sentenceCount = Integer 255 | .parseInt(req.getParameter("sentence_count")); 256 | summarizeText(null, inputText, sentenceCount, req, resp); 257 | } 258 | 259 | } 260 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/Defaults.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
/**
 * Shared default values and constants for the summarizer package.
 * @author mohaps
 *
 */
public final class Defaults {
    // ---- reusable empty values ----
    public static final String BLANK_STRING = "";
    public static final String[] BLANK_STRING_ARRAY = new String[0];
    public static final byte[] BLANK_BYTES = new byte[0];

    // ---- regular expressions ----
    // matches a single non-word character
    public static final String REGEX_WHITESPACE = "\\W";
    // matches a single whitespace character; RegExTokenizer splits on this by default
    public static final String REGEX_WORDS = "\\s";
    // one or more of . ! ? followed by whitespace and/or end of input
    public static final String REGEX_SENTENCES = "(\\.|!|\\?)+(\\s|\\z)+";

    // ---- summarization tuning ----
    // default number of sentences in a summary
    public static final int MAX_SENTENCES = 4;
    public static final int SUMMARY_LENGTH = 4;
    // how many frequent words the two-argument summarize call asks for
    public static final int MAX_MOST_FREQUENT_WORDS = 20;
    // whether words that occur only once are ignored by default
    public static final boolean SHOULD_IGNORE_SINGLE_OCCURENCES = true;
    // minimum sentence size handed to the sentence parser
    public static final int MIN_WORDS_PER_SENTENCE = 5;
    // multiplied by the sentence count to decide when input is too short to summarize
    public static final int AVG_WORDS_PER_SENTENCE = 20;

    // ---- API limits ----
    public static final int MAX_API_INPUT_LENGTH = 4 * 1024;
}
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | /** 36 | * All the default implementations in one handy place (pre-initialized) 37 | * @author mohaps 38 | * 39 | */ 40 | public final class Factory { 41 | public static final IStopWords DEFAULT_STOPWORDS = new StopWords(); 42 | public static final ITokenizer DEFAULT_TOKENIZER = new OpenNLPTokenizer(); 43 | public static final ISummarizer DEFAULT_SUMMARIZER = new Summarizer(DEFAULT_STOPWORDS, DEFAULT_TOKENIZER); 44 | public static final IStopWords getStopWords() { 45 | return DEFAULT_STOPWORDS; 46 | } 47 | public static final ITokenizer getTokenizer() { 48 | return DEFAULT_TOKENIZER; 49 | } 50 | public static final ISummarizer getSummarizer() { 51 | return DEFAULT_SUMMARIZER; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/IStopWords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 
/**
 * Stopwords filter: decides which words are excluded from consideration
 * by code that is handed an implementation of this interface.
 * @author mohaps
 *
 */
public interface IStopWords {
    /**
     * @param word the word to test
     * @return true if the word is to be treated as a stop word
     */
    boolean isStopWord(String word);
}
17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/**
 * The summarizer interface
 * @author mohaps
 *
 */
public interface ISummarizer {
    /**
     * Summarize the given text (up to sentenceCount sentences).
     * @param input - the input text to summarize
     * @param sentenceCount - maximum number of sentences in the summary
     * @param maxFrequentWords - how many keywords to extract at the most
     * @param shouldIgnoreSingleOccurences - whether words that occur only once are left out of consideration
     * @return the summary text
     * @throws Exception if the implementation fails to produce a summary
     */
    String summarize(final String input, int sentenceCount, int maxFrequentWords, boolean shouldIgnoreSingleOccurences) throws Exception;

    /**
     * Summarize the given text using the implementation's default settings
     * for the remaining options.
     * @param input - the input text to summarize
     * @param sentenceCount - maximum number of sentences in the summary
     * @return the summary text
     * @throws Exception if the implementation fails to produce a summary
     */
    String summarize(String input, int sentenceCount) throws Exception;

    /**
     * WIP: extract keywords from a given input text
     * @param input - the text to extract keywords from
     * @param maxKeyWords - number of keywords to extract
     * @return the extracted keywords
     *         (NOTE(review): element type is not visible here; presumably String - confirm)
     * @throws Exception if keyword extraction fails
     */
    Set keywords(final String input, int maxKeyWords) throws Exception;
}
/**
 * tokenize a sentence into words
 * @author mohaps
 *
 */
public interface ITokenizer {
    /**
     * Split the input into tokens.
     * @param input the text to tokenize
     * @return the tokens found in the input
     * @throws Exception if the implementation fails to tokenize the input
     */
    String[] tokenize(String input) throws Exception;
}
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.io.IOException; 36 | import java.io.InputStream; 37 | 38 | import com.mohaps.tldr.utils.Words; 39 | 40 | import opennlp.tools.tokenize.Tokenizer; 41 | import opennlp.tools.tokenize.TokenizerME; 42 | import opennlp.tools.tokenize.TokenizerModel; 43 | 44 | /** 45 | * Uses the OpenNLP tokenizer (falls back to RegExTokenizer if it can't find the model files for OpenNLP) 46 | * @author mohaps 47 | * 48 | */ 49 | public class OpenNLPTokenizer implements ITokenizer { 50 | 51 | private static TokenizerModel TOKENIZER_MODEL; 52 | private static RegExTokenizer FALLBACK; 53 | static { 54 | try { 55 | InputStream inputFile = Words.class.getClassLoader().getResourceAsStream("en-token.bin"); 56 | if(inputFile != null) { 57 | try { 58 | TOKENIZER_MODEL = new TokenizerModel(inputFile); 59 | System.out.println(">> OpenNLP Tokenizer Model loaded!"); 60 | } finally { 61 | if(inputFile != null) { 62 | try { inputFile.close(); } catch (Throwable t){} 63 | } 64 | } 65 | } 66 | } catch (IOException ex) { 67 | System.err.println("Failed to load token model for OpenNLP. error = "+ex.getLocalizedMessage()+". 
Will fall back to regex based token parsing"); 68 | ex.printStackTrace(); 69 | } finally { 70 | if(TOKENIZER_MODEL == null) { 71 | FALLBACK = new RegExTokenizer(); 72 | } 73 | } 74 | } 75 | 76 | public String[] tokenize(String input) throws Exception { 77 | if(TOKENIZER_MODEL != null) { 78 | Tokenizer tokenizer = new TokenizerME(TOKENIZER_MODEL); 79 | return tokenizer.tokenize(input); 80 | } else { 81 | return FALLBACK.tokenize(input); 82 | } 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/RegExTokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | /** 35 | * Simple regex based tokenizer (used as a fallback) 36 | * @author mohaps 37 | * 38 | */ 39 | public class RegExTokenizer implements ITokenizer { 40 | private String tokenRegEx; 41 | public RegExTokenizer() { 42 | this(Defaults.REGEX_WORDS); 43 | } 44 | public RegExTokenizer(String tokenRegEx) { 45 | this.tokenRegEx = tokenRegEx; 46 | if(this.tokenRegEx == null) { 47 | this.tokenRegEx = Defaults.REGEX_WORDS; 48 | } 49 | } 50 | public String[] tokenize(String input) throws Exception{ 51 | if(input == null || input.length() == 0){ 52 | return Defaults.BLANK_STRING_ARRAY; 53 | } else { 54 | return input.split(tokenRegEx); 55 | } 56 | } 57 | public String toString() { 58 | return new StringBuilder("RegExTokenizer(regex=\"").append(tokenRegEx).append("\"").toString(); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/StopWords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.util.HashSet; 36 | import java.util.Set; 37 | /** 38 | * Default set of stop words 39 | * @author mohaps 40 | * 41 | */ 42 | public class StopWords implements IStopWords { 43 | public static final String[] STOPWORDS = new String[] { "a", "able", 44 | "about", "across", "after", "all", "almost", "also", "am", "among", 45 | "an", "and", "any", "are", "as", "at", "be", "because", "been", 46 | "but", "by", "can", "cannot", "can\'t", "could", "dear", "did", 47 | "do", "does", "either", "else", "ever", "every", "for", "from", 48 | "get", "got", "had", "has", "have", "he", "her", "hers", "him", 49 | "his", "how", "however", "i", "if", "in", "into", "is", "it", 50 | "its", "just", "least", "let", "like", "likely", "may", "me", 51 | "might", "most", "must", "my", "neither", "no", "nor", "not", "of", 52 | "off", "often", "on", "only", "or", "other", "our", "own", 53 | "rather", "said", "say", "says", "she", "should", "since", "so", 54 | "some", "than", "that", "the", "their", "them", "then", "there", 55 | "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", 56 | "was", "we", "were", "what", "when", "where", "which", "while", 57 | "who", "whom", "why", "will", "with", "would", "won\'t", "yet", 58 | "you", "your" }; 59 | 60 | private Set stopWords = new HashSet(); 61 | 62 | public StopWords() { 63 | for(int i = 0; i < STOPWORDS.length; i++) { 64 | stopWords.add(STOPWORDS[i]); 65 | } 66 | } 67 | 68 | public boolean isStopWord(String word) { 69 | return word == null || word.length() < 2 || stopWords.contains(word); 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/Summarizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 
6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.security.MessageDigest; 36 | import java.util.Comparator; 37 | import java.util.HashMap; 38 | import java.util.Iterator; 39 | import java.util.Set; 40 | import java.util.TreeSet; 41 | 42 | import com.mohaps.tldr.utils.Words; 43 | /** 44 | * The summarizer implementation (uses keyword density to create a summary) 45 | * @author mohaps 46 | * 47 | */ 48 | public class Summarizer implements ISummarizer { 49 | private IStopWords stopWords; 50 | private ITokenizer tokenizer; 51 | 52 | public Summarizer(IStopWords stopWords, ITokenizer tokenizer) { 53 | this.stopWords = stopWords; 54 | this.tokenizer = tokenizer; 55 | } 56 | 57 | public String summarize(final String inputRaw, int sentenceCount, 58 | int maxFrequentWords, boolean shouldIgnoreSingleOccurences) 59 | throws Exception { 60 | 61 | // for short bursts just return the input itself 62 | if (inputRaw.length() < sentenceCount * Defaults.AVG_WORDS_PER_SENTENCE) { 63 | return inputRaw; 64 | } else { 65 | 66 | // check summary cache for input hit (this optimizes repeated summarize calls) 67 | byte[] inputHash = sha1(inputRaw, ":sentences=", Integer.toString(sentenceCount)); 68 | String cached = SummaryCache.instance().get(inputHash); 69 | if (cached != null) { 70 | return cached; 71 | } else { 72 | // change U.S. to US etc. 
73 | final String input = Words.dotCorrection(inputRaw); 74 | // get top 100 most frequent words that are not stop words 75 | Set frequentWords = Words.getMostFrequent(input, 76 | tokenizer, stopWords, maxFrequentWords, 77 | shouldIgnoreSingleOccurences?2:1); 78 | // now let's get the unique sentences 79 | Set sentences = Words.parseSentences(input, tokenizer, 80 | Defaults.MIN_WORDS_PER_SENTENCE); 81 | 82 | // hashmap to cache sentence indices 83 | final HashMap sentenceIndex = new HashMap(); 84 | // we'll sort the sentences based on their appearance in the 85 | // input 86 | // text 87 | Set outputSentences = new TreeSet( 88 | new Comparator() { 89 | public int compare(String sentence1, 90 | String sentence2) { 91 | int index1 = -1; 92 | int index2 = -1; 93 | // check cache for sentence 1 94 | Integer index1Obj = sentenceIndex 95 | .get(sentence1); 96 | if (index1Obj == null) { 97 | index1 = input.indexOf(sentence1); 98 | sentenceIndex.put(sentence1, new Integer( 99 | index1)); 100 | } else { 101 | index1 = index1Obj.intValue(); 102 | } 103 | // check cache for sentence 2 104 | Integer index2Obj = sentenceIndex 105 | .get(sentence2); 106 | if (index2Obj == null) { 107 | index2 = input.indexOf(sentence2); 108 | sentenceIndex.put(sentence2, new Integer( 109 | index2)); 110 | } else { 111 | index2 = index2Obj.intValue(); 112 | } 113 | 114 | return index1 - index2; 115 | } 116 | }); 117 | 118 | // now look through the sentences and build summary ( not 119 | // exceeding 120 | // sentenceCount sentences ) 121 | Iterator iter = sentences.iterator(); 122 | while (iter.hasNext() 123 | && outputSentences.size() < sentenceCount) { 124 | String actualSentence = iter.next(); 125 | String workingSentence = actualSentence.toLowerCase(); 126 | Iterator words = frequentWords.iterator(); 127 | while (words.hasNext()) { 128 | String word = words.next(); 129 | if (workingSentence.indexOf(word) >= 0) { 130 | outputSentences.add(actualSentence); 131 | } 132 | if 
(outputSentences.size() >= sentenceCount) { 133 | break; 134 | } 135 | } 136 | 137 | } 138 | // clear the sentence index cache 139 | sentenceIndex.clear(); 140 | // build the paragraph 141 | StringBuilder sb = new StringBuilder(); 142 | Iterator summarySentences = outputSentences.iterator(); 143 | while (summarySentences.hasNext()) { 144 | sb.append(summarySentences.next()).append("."); 145 | if (summarySentences.hasNext()) { 146 | sb.append(" "); 147 | } 148 | } 149 | // bob's your uncle :) 150 | String summary = sb.toString(); 151 | // update summary cache 152 | SummaryCache.instance().put(inputHash, summary); 153 | return summary; 154 | } 155 | } 156 | } 157 | 158 | public static final byte[] sha1(String ...inputs) throws Exception { 159 | MessageDigest md = MessageDigest.getInstance("SHA1"); 160 | for(String input : inputs) { 161 | md.update(input.getBytes()); 162 | } 163 | return md.digest(); 164 | } 165 | 166 | public String summarize(final String input, int sentenceCount) 167 | throws Exception { 168 | return summarize(input, sentenceCount, 169 | Defaults.MAX_MOST_FREQUENT_WORDS, 170 | Defaults.SHOULD_IGNORE_SINGLE_OCCURENCES); 171 | 172 | } 173 | 174 | public Set keywords(String input, int maxKeyWords) throws Exception { 175 | 176 | return Words.getMostFrequent(input, 177 | tokenizer, stopWords, maxKeyWords, 178 | 4); 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/SummaryCache.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.util.*; 36 | /** 37 | * A summary cache (psuedo-LRU, doesn't refresh keys) 38 | * In a real world service, this would be in memcached/redis type store. 
/**
 * A small in-memory LRU cache that maps the hash of an input text to the
 * summary generated for it.
 *
 * The cache holds at most {@link #MAX_CACHE_SIZE} entries. Both {@link #put}
 * and {@link #get} count as a "use" of an entry; when a new entry would push
 * the cache over its limit, the least recently used entry is evicted.
 *
 * All access goes through synchronized methods because the single shared
 * instance returned by {@link #instance()} is mutated on every call.
 *
 * In a real world service, this would be in a memcached/redis type store.
 *
 * @author mohaps
 */
public final class SummaryCache {
    /** Maximum number of summaries retained at any time. */
    public static final int MAX_CACHE_SIZE = 40;

    private static final SummaryCache sInstance = new SummaryCache();

    /**
     * @return the process-wide shared cache instance
     */
    public static final SummaryCache instance() {
        return sInstance;
    }

    /**
     * Map key wrapping a hash so that lookups compare array contents rather
     * than array identity (byte[] itself uses identity equals/hashCode).
     */
    static final class Key {
        private final byte[] key;

        Key(byte[] key) {
            // defensive copy: a caller reusing or mutating its buffer must not
            // silently change the key of an entry already stored in the map
            this.key = (key == null) ? null : key.clone();
        }

        @Override
        public boolean equals(Object o) {
            // instanceof is false for null, so no separate null check is needed
            return (o instanceof Key) && Arrays.equals(((Key) o).key, key);
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(key);
        }
    }

    /**
     * Access-ordered map (third constructor argument = true): iteration order
     * runs from least to most recently used, and removeEldestEntry drops the
     * least recently used entry as soon as the size limit is exceeded.
     */
    private final Map<Key, String> cache = new LinkedHashMap<Key, String>(
            MAX_CACHE_SIZE + 1, 0.75f, true) {
        private static final long serialVersionUID = 1L;

        @Override
        protected boolean removeEldestEntry(Map.Entry<Key, String> eldest) {
            return size() > MAX_CACHE_SIZE;
        }
    };

    /**
     * Stores (or replaces) the summary for the given text hash and marks it as
     * the most recently used entry.
     *
     * @param textHash hash identifying the summarized input
     * @param summary  the summary to remember
     */
    public synchronized void put(byte[] textHash, String summary) {
        cache.put(new Key(textHash), summary);
    }

    /**
     * Looks up a previously stored summary and, on a hit, marks the entry as
     * the most recently used one.
     *
     * @param inputHash hash identifying the summarized input
     * @return the cached summary, or null if there is no entry for this hash
     */
    public synchronized String get(byte[] inputHash) {
        return cache.get(new Key(inputHash));
    }
}
17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.utils; 34 | 35 | import java.util.*; 36 | import java.util.logging.Level; 37 | import java.util.logging.Logger; 38 | import java.net.*; 39 | import java.io.*; 40 | 41 | import org.jsoup.Jsoup; 42 | 43 | import com.mohaps.tldr.summarize.Defaults; 44 | import com.mohaps.tldr.summarize.Factory; 45 | import com.sun.syndication.feed.synd.SyndContent; 46 | import com.sun.syndication.feed.synd.SyndEntry; 47 | import com.sun.syndication.feed.synd.SyndFeed; 48 | import com.sun.syndication.io.SyndFeedInput; 49 | import com.sun.syndication.io.XmlReader; 50 | 51 | import de.l3s.boilerpipe.extractors.ArticleExtractor; 52 | 53 | /** 54 | * Kitchen sink utility class for dealing with Feeds 55 | * 56 | * @author mohaps 57 | * 58 | */ 59 | public final class Feeds { 60 | static { 61 | CookieHandler.setDefault(new CookieManager()); 62 | } 63 | public static class Item { 64 | private String title; 65 | private String author; 66 | private String link; 67 | private String text; 68 | 69 | public Item() { 70 | 71 | } 72 | 73 | public Item(String title, String author, String link, String text) { 74 | super(); 75 | this.title = title; 76 | this.author = author; 77 | this.link = link; 78 | this.text = text; 79 | } 80 | 81 | public String getTitle() { 82 | return title; 83 | } 84 | 85 | public String getAuthor() { 86 | return author; 87 | } 88 | 89 | public String getLink() { 90 | return link; 91 | } 92 | 93 | public String getText() { 94 | return text; 95 | } 96 | 97 | @Override 98 | public String toString() { 99 | return "Feeds.Item [title=" + title + ", author=" + author 100 | + ", link=" + link + "]"; 101 | } 102 | 103 | public void setTitle(String title) { 104 | this.title = title; 105 | } 106 | 107 | public void setAuthor(String author) { 108 | this.author = author; 109 | } 110 | 111 | public void setLink(String link) { 112 | this.link = link; 113 | } 114 | 115 | public void setText(String text) { 116 | this.text = text; 117 | } 118 | 119 | } 120 | 
121 | public static final String extractPageBodyText(String pageUrl) 122 | throws Exception { 123 | URL url = new URL(pageUrl); 124 | URLConnection conn = url.openConnection(); 125 | 126 | if (url.getProtocol().startsWith("http")) { 127 | 128 | if (((HttpURLConnection) conn).getResponseCode() == 303) { 129 | String location = conn.getHeaderField("Location"); 130 | System.out.println(">> 303 Other : " + location); 131 | return Words.replaceSmartQuotes(ArticleExtractor.INSTANCE 132 | .getText(fetchPageText(location))); 133 | } 134 | } 135 | 136 | String text = Words.replaceSmartQuotes(ArticleExtractor.INSTANCE 137 | .getText(new URL(pageUrl))); 138 | if (text == null || text.length() == 0) { 139 | text = Words.replaceSmartQuotes(ArticleExtractor.INSTANCE 140 | .getText(Feeds.fetchPageText(pageUrl))); 141 | } 142 | return Jsoup.parse(text).body().text(); 143 | }/* 144 | * public static final String escapeHtml(String input) { if(input == null || 145 | * input.length() == 0){ return ""; } else { try { return 146 | * org.apache.commons.lang3.StringEscapeUtils.escapeHtml4(input); } 147 | * catch(Exception ex){ return input; } } } 148 | */ 149 | 150 | public static final String fetchPageText(String pageUrl) throws Exception { 151 | URL url = new URL(pageUrl); 152 | URLConnection conn = url.openConnection(); 153 | if (url.getProtocol().startsWith("http")) { 154 | if (((HttpURLConnection) conn).getResponseCode() == 303) { 155 | String location = conn.getHeaderField("Location"); 156 | System.out.println(">> 303 Other : " + location); 157 | return fetchPageText(location); 158 | } 159 | } 160 | InputStream in = conn.getInputStream(); 161 | int contentLength = conn.getContentLength(); 162 | String contentType = conn.getContentType(); 163 | if (contentType != null 164 | && !(contentType.toLowerCase().startsWith("text/") || contentType 165 | .toLowerCase().endsWith("xml"))) { 166 | throw new Exception("Content type " + contentType 167 | + " detected at page " + pageUrl + " is 
non-textual"); 168 | } 169 | String encoding = conn.getContentEncoding(); 170 | Logger.getLogger("Feeds").log(Level.INFO, 171 | "!---- Encoding Detected : " + encoding); 172 | if (encoding == null) { 173 | encoding = "ISO-8859-1"; 174 | } 175 | if (contentLength >= 0) { 176 | System.out.println(">> Reading " + contentLength + " bytes!!!"); 177 | byte[] buf = new byte[contentLength]; 178 | int bread = readUpto(in, contentLength, buf, 0, buf.length); 179 | return bread > 0 ? new String(buf, 0, bread, encoding) 180 | : Defaults.BLANK_STRING; 181 | } else { 182 | System.out.println(">> No content length specified for " + pageUrl 183 | + ". Falling back to reading line by line..."); 184 | StringBuilder sb = new StringBuilder(); 185 | BufferedReader reader = new BufferedReader( 186 | new InputStreamReader(in)); 187 | for (String line = reader.readLine(); line != null; line = reader 188 | .readLine()) { 189 | sb.append(line).append("\n"); 190 | } 191 | return Words.replaceSmartQuotes(sb.toString()); 192 | } 193 | } 194 | 195 | public static int readUpto(InputStream in, int contentLength, byte[] buf, 196 | int offset, int length) throws Exception { 197 | int totalRead = 0; 198 | if (buf == null || offset < 0 || length < 0 199 | || (offset + length) < contentLength) { 200 | return 0; 201 | } else { 202 | while (totalRead < contentLength) { 203 | int bread = in.read(buf, totalRead, contentLength - totalRead); 204 | if (bread > 0) { 205 | totalRead += bread; 206 | } else { 207 | break; 208 | } 209 | } 210 | return totalRead; 211 | } 212 | } 213 | 214 | public static final String getContentType(String feedUrl) { 215 | try { 216 | URL url = new URL(feedUrl); 217 | 218 | HttpURLConnection connection = (HttpURLConnection) url 219 | .openConnection(); 220 | connection.setRequestMethod("HEAD"); 221 | connection.connect(); 222 | try { 223 | return connection.getContentType(); 224 | } finally { 225 | try { 226 | connection.disconnect(); 227 | } catch (Exception ex) { 228 | } 229 | } 
230 | 231 | } catch (Exception ex) { 232 | return null; 233 | } 234 | } 235 | 236 | public static final List fetchFeedItems(String feedUrl) 237 | throws Exception { 238 | URL url = new URL(feedUrl); 239 | SyndFeedInput input = new SyndFeedInput(); 240 | // XmlReader.setDefaultEncoding("ISO-8859-1"); 241 | XmlReader xmlReader = new XmlReader(url); 242 | SyndFeed feed = input.build(xmlReader); 243 | ArrayList items = new ArrayList(); 244 | @SuppressWarnings("unchecked") 245 | Iterator entries = feed.getEntries().iterator(); 246 | while (entries.hasNext()) { 247 | SyndEntry entry = entries.next(); 248 | String title = entry.getTitle(); 249 | if (title == null) { 250 | title = "unknown"; 251 | } 252 | title = Words.replaceSmartQuotes(title); 253 | String link = entry.getLink(); 254 | if (link == null) { 255 | link = feedUrl; 256 | } 257 | String author = entry.getAuthor(); 258 | if (author == null || author.length() == 0) { 259 | author = "unknown"; 260 | } 261 | 262 | @SuppressWarnings("unchecked") 263 | List contents = entry.getContents(); 264 | if (contents.size() == 0) { 265 | String rawDesc; 266 | if (entry.getDescription() == null) { 267 | rawDesc = fetchPageText(entry.getLink()); 268 | } else { 269 | rawDesc = entry.getDescription().getValue(); 270 | } 271 | String desc = rawDesc != null ? 
Jsoup.parse( 272 | Words.replaceSmartQuotes(rawDesc)).text() : entry 273 | .getLink(); 274 | items.add(new Item(title, author, link, desc)); 275 | } else { 276 | // System.out.println(title); 277 | for (SyndContent content : contents) { 278 | if (content.getType().equalsIgnoreCase("html")) { 279 | String html = Jsoup.parse( 280 | Words.replaceSmartQuotes(content.getValue())) 281 | .text(); 282 | items.add(new Item(title, author, link, html)); 283 | } else { 284 | System.out.println(">> non html content type : " 285 | + content.getType()); 286 | } 287 | } 288 | } 289 | } 290 | return items; 291 | } 292 | 293 | public static final void main(String[] args) throws Exception { 294 | String feedUrl = args.length > 1 ? args[1] 295 | : "http://www.nytimes.com/2013/04/12/world/asia/north-korea-may-have-nuclear-missile-capability-us-agency-says.html"; 296 | String html = Feeds.fetchPageText(feedUrl); 297 | System.out.println(">> html : " + html); 298 | String pageText = Feeds.extractPageBodyText(feedUrl); 299 | System.out.println(pageText); 300 | } 301 | 302 | } 303 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/utils/Pages.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 
17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.utils; 34 | 35 | import org.jsoup.Jsoup; 36 | /** 37 | * Page text kitchen sink 38 | * @author mohaps 39 | * 40 | */ 41 | public final class Pages { 42 | 43 | public static final String getBodyTextFromHTML(String html) throws Exception { 44 | return Jsoup.parse(html).body().text(); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/utils/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
/**
 * Stemmer, implementing the Porter Stemming Algorithm (from
 * http://tartarus.org/martin/PorterStemmer/java.txt)
 *
 * The stemmer transforms a word into its root form. The input word can be
 * provided a character at a time (by calling add(char)), in bulk through the
 * other add(...) overloads, or added and stemmed in one go with
 * addAndStem(String). An instance can be reused: stem() resets the input
 * position, so the next add(...) starts a new word.
 *
 * Only add(String) lower-cases its input; the algorithm itself expects lower
 * case letters, so callers of add(char) and add(char[], int) must lower-case
 * beforehand.
 */
public class PorterStemmer {
    private char[] b;
    private int i, /* offset into b */
            i_end, /* offset to end of stemmed word */
            j, k;

    /* unit of size whereby b is increased */
    private static final int INC = 50;

    public PorterStemmer() {
        b = new char[INC];
        i = 0;
        i_end = 0;
    }

    /**
     * Add a character to the word being stemmed. When you are finished adding
     * characters, you can call stem() to stem the word.
     */
    public void add(char ch) {
        if (i == b.length) {
            char[] new_b = new char[i + INC];
            System.arraycopy(b, 0, new_b, 0, i);
            b = new_b;
        }
        b[i++] = ch;
    }

    /**
     * Adds the first wLen characters of w to the word being stemmed. This is
     * like repeated calls of add(char ch), but faster.
     */
    public void add(char[] w, int wLen) {
        if (i + wLen >= b.length) {
            char[] new_b = new char[i + wLen + INC];
            System.arraycopy(b, 0, new_b, 0, i);
            b = new_b;
        }
        System.arraycopy(w, 0, b, i, wLen);
        i += wLen;
    }

    /**
     * Adds the lower-cased form of s to the word being stemmed.
     */
    public void add(String s) {
        // Lower-case with a fixed locale so the result does not depend on the
        // machine's default (e.g. Turkish maps 'I' to a dotless i), and take
        // the length from the lower-cased array: case mapping can change the
        // number of chars, so s.length() may not match it.
        char[] lowered = s.toLowerCase(java.util.Locale.ENGLISH).toCharArray();
        add(lowered, lowered.length);
    }

    /**
     * Adds s, stems the accumulated word and returns the stem.
     */
    public String addAndStem(String s) {
        add(s);
        stem();
        return toString();
    }

    /**
     * After a word has been stemmed, it can be retrieved by toString(), or a
     * reference to the internal buffer can be retrieved by getResultBuffer and
     * getResultLength (which is generally more efficient.)
     */
    public String toString() {
        return new String(b, 0, i_end);
    }

    /**
     * Returns the length of the word resulting from the stemming process.
     */
    public int getResultLength() {
        return i_end;
    }

    /**
     * Returns a reference to a character buffer containing the results of the
     * stemming process. You also need to consult getResultLength() to determine
     * the length of the result.
     */
    public char[] getResultBuffer() {
        return b;
    }

    /* cons(i) is true <=> b[i] is a consonant. */
    private final boolean cons(int i) {
        switch (b[i]) {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
            return false;
        case 'y':
            /* y is a consonant at the start of a word or after a vowel */
            return i == 0 || !cons(i - 1);
        default:
            return true;
        }
    }

    /*
     * m() measures the number of consonant sequences between 0 and j. if c is a
     * consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     * presence,
     *
     * <c><v> gives 0, <c>vc<v> gives 1, <c>vcvc<v> gives 2, <c>vcvcvc<v> gives
     * 3, ....
     */
    private final int m() {
        int n = 0;
        int i = 0;
        while (true) {
            if (i > j)
                return n;
            if (!cons(i))
                break;
            i++;
        }
        i++;
        while (true) {
            while (true) {
                if (i > j)
                    return n;
                if (cons(i))
                    break;
                i++;
            }
            i++;
            n++;
            while (true) {
                if (i > j)
                    return n;
                if (!cons(i))
                    break;
                i++;
            }
            i++;
        }
    }

    /* vowelinstem() is true <=> 0,...j contains a vowel */
    private final boolean vowelinstem() {
        int i;
        for (i = 0; i <= j; i++)
            if (!cons(i))
                return true;
        return false;
    }

    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
    private final boolean doublec(int j) {
        if (j < 1)
            return false;
        if (b[j] != b[j - 1])
            return false;
        return cons(j);
    }

    /*
     * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
     * and also if the second c is not w,x or y. this is used when trying to
     * restore an e at the end of a short word. e.g.
     *
     * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
     */
    private final boolean cvc(int i) {
        if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
            return false;
        {
            int ch = b[i];
            if (ch == 'w' || ch == 'x' || ch == 'y')
                return false;
        }
        return true;
    }

    /*
     * ends(s) is true <=> 0,...k ends with the string s. on success (and only
     * then) j is set to the index just before the matched suffix.
     */
    private final boolean ends(String s) {
        int l = s.length();
        int o = k - l + 1;
        if (o < 0)
            return false;
        for (int i = 0; i < l; i++)
            if (b[o + i] != s.charAt(i))
                return false;
        j = k - l;
        return true;
    }

    /*
     * setto(s) sets (j+1),...k to the characters in the string s, readjusting
     * k.
     */
    private final void setto(String s) {
        int l = s.length();
        int o = j + 1;
        for (int i = 0; i < l; i++)
            b[o + i] = s.charAt(i);
        k = j + l;
    }

    /* r(s) replaces the suffix matched by ends() with s, provided m() > 0. */
    private final void r(String s) {
        if (m() > 0)
            setto(s);
    }

    /*
     * step1() gets rid of plurals and -ed or -ing. e.g.
     *
     * caresses -> caress, ponies -> poni, ties -> ti, caress -> caress,
     * cats -> cat
     *
     * feed -> feed, agreed -> agree, disabled -> disable
     *
     * matting -> mat, mating -> mate, meeting -> meet, milling -> mill,
     * messing -> mess
     *
     * meetings -> meet
     */
    private final void step1() {
        if (b[k] == 's') {
            if (ends("sses"))
                k -= 2;
            else if (ends("ies"))
                setto("i");
            else if (b[k - 1] != 's')
                k--;
        }
        if (ends("eed")) {
            if (m() > 0)
                k--;
        } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
            k = j;
            if (ends("at"))
                setto("ate");
            else if (ends("bl"))
                setto("ble");
            else if (ends("iz"))
                setto("ize");
            else if (doublec(k)) {
                k--;
                {
                    int ch = b[k];
                    if (ch == 'l' || ch == 's' || ch == 'z')
                        k++;
                }
            } else if (m() == 1 && cvc(k))
                setto("e");
        }
    }

    /* step2() turns terminal y to i when there is another vowel in the stem. */
    private final void step2() {
        if (ends("y") && vowelinstem())
            b[k] = 'i';
    }

    /*
     * step3() maps double suffices to single ones. so -ization ( = -ize plus
     * -ation) maps to -ize etc. note that the string before the suffix must
     * give m() > 0. within each case the first suffix that matches wins, even
     * if r() then leaves the word unchanged.
     */
    private final void step3() {
        if (k == 0)
            return; /* For Bug 1 */
        switch (b[k - 1]) {
        case 'a':
            if (ends("ational"))
                r("ate");
            else if (ends("tional"))
                r("tion");
            break;
        case 'c':
            if (ends("enci"))
                r("ence");
            else if (ends("anci"))
                r("ance");
            break;
        case 'e':
            if (ends("izer"))
                r("ize");
            break;
        case 'g':
            if (ends("logi"))
                r("log");
            break;
        case 'l':
            if (ends("bli"))
                r("ble");
            else if (ends("alli"))
                r("al");
            else if (ends("entli"))
                r("ent");
            else if (ends("eli"))
                r("e");
            else if (ends("ousli"))
                r("ous");
            break;
        case 'o':
            if (ends("ization"))
                r("ize");
            else if (ends("ation"))
                r("ate");
            else if (ends("ator"))
                r("ate");
            break;
        case 's':
            if (ends("alism"))
                r("al");
            else if (ends("iveness"))
                r("ive");
            else if (ends("fulness"))
                r("ful");
            else if (ends("ousness"))
                r("ous");
            break;
        case 't':
            if (ends("aliti"))
                r("al");
            else if (ends("iviti"))
                r("ive");
            else if (ends("biliti"))
                r("ble");
            break;
        }
    }

    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
    private final void step4() {
        switch (b[k]) {
        case 'e':
            if (ends("icate"))
                r("ic");
            else if (ends("ative"))
                r("");
            else if (ends("alize"))
                r("al");
            break;
        case 'i':
            if (ends("iciti"))
                r("ic");
            break;
        case 'l':
            if (ends("ical"))
                r("ic");
            else if (ends("ful"))
                r("");
            break;
        case 's':
            if (ends("ness"))
                r("");
            break;
        }
    }

    /*
     * step5() takes off -ant, -ence etc., in context <c>vcvc<v>. a break means
     * "suffix matched, go on to the m() > 1 test"; a return means "nothing to
     * strip".
     */
    private final void step5() {
        if (k == 0)
            return; /* for Bug 1 */
        switch (b[k - 1]) {
        case 'a':
            if (ends("al"))
                break;
            return;
        case 'c':
            if (ends("ance"))
                break;
            if (ends("ence"))
                break;
            return;
        case 'e':
            if (ends("er"))
                break;
            return;
        case 'i':
            if (ends("ic"))
                break;
            return;
        case 'l':
            if (ends("able"))
                break;
            if (ends("ible"))
                break;
            return;
        case 'n':
            if (ends("ant"))
                break;
            if (ends("ement"))
                break;
            if (ends("ment"))
                break;
            /* element etc. not stripped before the m */
            if (ends("ent"))
                break;
            return;
        case 'o':
            /* j >= 0 fixes Bug 2 */
            if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
                break;
            /* takes care of -ous */
            if (ends("ou"))
                break;
            return;
        case 's':
            if (ends("ism"))
                break;
            return;
        case 't':
            if (ends("ate"))
                break;
            if (ends("iti"))
                break;
            return;
        case 'u':
            if (ends("ous"))
                break;
            return;
        case 'v':
            if (ends("ive"))
                break;
            return;
        case 'z':
            if (ends("ize"))
                break;
            return;
        default:
            return;
        }
        if (m() > 1)
            k = j;
    }

    /* step6() removes a final -e if m() > 1. */
    private final void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1))
                k--;
        }
        if (b[k] == 'l' && doublec(k) && m() > 1)
            k--;
    }

    /**
     * Stem the word placed into the Stemmer buffer through calls to add().
     * Words of one or two characters are left unchanged. You can retrieve the
     * result with getResultLength()/getResultBuffer() or toString(). The input
     * position is reset afterwards, so the next add() starts a new word.
     */
    public void stem() {
        k = i - 1;
        if (k > 1) {
            step1();
            step2();
            step3();
            step4();
            step5();
            step6();
        }
        i_end = k + 1;
        i = 0;
    }

}
21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* (end of the TL;DRzr license header that precedes this file) */
package com.mohaps.tldr.utils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

import com.mohaps.tldr.summarize.Defaults;
import com.mohaps.tldr.summarize.IStopWords;
import com.mohaps.tldr.summarize.ITokenizer;

import java.io.*;

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;

/**
 * Utility methods to use for word operations.
 * Will try to use OpenNLP by default, failing that will fall back to regex
 * based manipulation/extraction.
 *
 * @author mohaps
 */
public final class Words {
	/**
	 * OpenNLP sentence model loaded from the classpath resource
	 * "en-sent.bin". Stays {@code null} when the resource is absent or fails
	 * to load; {@link #parseSentences} then uses the regex splitter.
	 */
	private static SentenceModel SENTENCE_MODEL;
	static {
		try {
			InputStream inputFile = Words.class.getClassLoader()
					.getResourceAsStream("en-sent.bin");
			if (inputFile != null) {
				try {
					SENTENCE_MODEL = new SentenceModel(inputFile);
					System.out.println(">> OpenNLP Sentence Model loaded!");
				} finally {
					// inputFile is known to be non-null here; closing is best effort.
					try {
						inputFile.close();
					} catch (Throwable t) {
						// deliberately ignored: nothing useful can be done
					}
				}
			}
		} catch (IOException ex) {
			System.err
					.println("Failed to load sentence model for OpenNLP. error = "
							+ ex.getLocalizedMessage()
							+ ". Will fall back to regex based sentence parsing");
			ex.printStackTrace();
		}
	}

	/** A lower-cased word together with the number of times it was counted. */
	private static class Word {
		private String word;
		private int frequency;

		public Word(String word) {
			this.word = word.toLowerCase();
			this.frequency = 1;
		}

		public String getWord() {
			return word;
		}

		public int getFrequency() {
			return frequency;
		}

		public int increment() {
			return ++frequency;
		}

		public int hashCode() {
			return word.hashCode();
		}

		// Kept consistent with hashCode(): identity is the (lower-cased) word.
		public boolean equals(Object o) {
			return o instanceof Word && word.equals(((Word) o).word);
		}

		public String toString() {
			return new StringBuilder(word).append("(").append(frequency)
					.append(")").toString();
		}
	}

	/**
	 * Returns the most frequent stemmed words of {@code input}.
	 *
	 * Tokens are produced by {@code tokenizer}, stemmed with the English
	 * Snowball stemmer, and counted when both the raw token and its stem are
	 * longer than four characters and the stem is not a stop word. Words are
	 * ranked by descending frequency; ties are broken by descending string
	 * order.
	 *
	 * @param input             text to analyse
	 * @param tokenizer         splits the text into word tokens
	 * @param stopWords         filter for words that must not be counted
	 * @param maxCount          maximum number of words to return
	 * @param minimumOccurences minimum frequency a word needs to be returned
	 * @return at most {@code maxCount} lower-cased words
	 * @throws Exception whatever the tokenizer or stop-word filter throws
	 */
	public static final Set<String> getMostFrequent(String input,
			ITokenizer tokenizer, IStopWords stopWords, int maxCount,
			int minimumOccurences) throws Exception {

		HashMap<String, Word> words = new HashMap<String, Word>();
		ArrayList<Word> wordList = new ArrayList<Word>();
		String[] wordTokens = tokenizer.tokenize(input);
		SnowballStemmer stemmer = new englishStemmer();
		for (int i = 0; i < wordTokens.length; i++) {
			if (isWord(wordTokens[i]) && wordTokens[i].length() > 4) {
				stemmer.setCurrent(wordTokens[i]);
				stemmer.stem();
				String wordToken = stemmer.getCurrent();
				if (isWord(wordToken) && !stopWords.isStopWord(wordToken)
						&& wordToken.length() > 4) {
					// NOTE(review): the map key keeps the stem's original case
					// while Word lower-cases it, so "Apple"/"apple" are counted
					// separately - confirm whether that is intended.
					Word w = words.get(wordToken);
					if (w != null) {
						w.increment();
					} else {
						w = new Word(wordToken);
						words.put(wordToken, w);
						wordList.add(w);
					}
				}
			}
		}
		Collections.sort(wordList, new Comparator<Word>() {
			public int compare(Word w1, Word w2) {
				// Higher frequency sorts first.
				if (w1.getFrequency() != w2.getFrequency()) {
					return w1.getFrequency() > w2.getFrequency() ? -1 : 1;
				}
				// Tie-break: descending lexicographic order, longer string
				// first on a common prefix - exactly reversed compareTo().
				int order = w2.getWord().compareTo(w1.getWord());
				return order < 0 ? -1 : (order > 0 ? 1 : 0);
			}
		});
		Set<String> ret = new HashSet<String>();
		for (Word w : wordList) {
			// Stop once maxCount words are collected (the previous "<=" test
			// allowed maxCount + 1 entries).
			if (ret.size() >= maxCount) {
				break;
			}
			if (w.getFrequency() >= minimumOccurences) {
				ret.add(w.getWord());
			}
		}
		return ret;
	}

	/**
	 * @return {@code true} when {@code word} is non-null and contains at
	 *         least one non-whitespace character
	 */
	public static final boolean isWord(String word) {
		return (word != null && word.trim().length() > 0);
	}

	/**
	 * Splits {@code input} into sentences, using the OpenNLP model when it was
	 * loaded and the regex splitter otherwise.
	 *
	 * @param input                   text to split
	 * @param tokenizer               used to count the words of each sentence
	 * @param minimumWordsInASentence sentences with fewer tokens are dropped
	 * @return the retained sentences (unordered, duplicates collapsed)
	 */
	public static Set<String> parseSentences(String input,
			ITokenizer tokenizer, int minimumWordsInASentence) throws Exception {
		if (SENTENCE_MODEL != null) {
			return parseSentencesNLP(input, tokenizer, minimumWordsInASentence);
		} else {
			return parseSentencesRegEx(input, tokenizer,
					minimumWordsInASentence);
		}
	}

	/** Sentence splitting backed by the OpenNLP sentence detector. */
	public static Set<String> parseSentencesNLP(String input,
			ITokenizer tokenizer, int minimumWordsInASentence) throws Exception {
		SentenceDetectorME sentenceDetector = new SentenceDetectorME(
				SENTENCE_MODEL);
		String[] rawSentences = sentenceDetector.sentDetect(input);
		return keepLongEnough(rawSentences, tokenizer, minimumWordsInASentence);
	}

	/** Sentence splitting backed by {@code Defaults.REGEX_SENTENCES}. */
	public static Set<String> parseSentencesRegEx(String input,
			ITokenizer tokenizer, int minimumWordsInASentence) throws Exception {
		String[] rawSentences = input.split(Defaults.REGEX_SENTENCES);
		return keepLongEnough(rawSentences, tokenizer, minimumWordsInASentence);
	}

	/** Keeps the sentences that tokenize into at least the minimum number of words. */
	private static Set<String> keepLongEnough(String[] rawSentences,
			ITokenizer tokenizer, int minimumWordsInASentence) throws Exception {
		HashSet<String> sentences = new HashSet<String>();
		for (int i = 0; i < rawSentences.length; i++) {
			String rawSentence = rawSentences[i];
			String[] words = tokenizer.tokenize(rawSentence);
			if (words.length >= minimumWordsInASentence) {
				sentences.add(rawSentence);
			}
		}
		return sentences;
	}

	/**
	 * Replaces typographic quotes, dashes and the ellipsis character with
	 * plain ASCII equivalents.
	 *
	 * NOTE(review): as written, the replaceAll(..) calls operate on characters
	 * the preceding replace(..) calls have already removed (and the "'" one is
	 * an identity replacement). Presumably these literals were HTML entities
	 * in the original source - confirm before simplifying. The literals are
	 * left untouched here.
	 */
	public static final String replaceSmartQuotes(String s) {
		return s.replace('\u2018', '\'')
				.replace('\u2019', '\'')
				.replace('\u201c', '\"')
				.replace('\u201b', '\'')
				.replace('\u201d', '\"')
				.replace('\u2026', '-')
				.replace('\u2013', '-')
				.replace('\u2014', '-')
				.replaceAll("–", "-")
				.replaceAll("“", "\"")
				.replaceAll("”", "\"")
				.replaceAll("‘", "\'")
				.replaceAll("’", "\'")
				.replaceAll("‛", "\'")
				.replaceAll("'", "\'")
				.replaceAll("…", "...")
				.replaceAll("—", "-");
	}

	/** Debug helper: prints the code point of every character of a sample string. */
	public static void main(String[] args) {
		String s = "–than estimated by Umeng–";
		for (int i = 0; i < s.length(); i++) {
			// int instead of short: a short cast turns chars above 0x7FFF negative.
			int code = s.charAt(i);
			System.out.println(">> Char Code " + code + " (0x"
					+ Integer.toHexString(code) + ") - {" + s.charAt(i) + "}");
		}
	}

	//TODO: ugly hack. find something more efficient and elegant to replace well-known contractions with longer synonyms
	/**
	 * Removes the dots from a few well-known abbreviations so that they are
	 * not mistaken for sentence ends.
	 */
	public static String dotCorrection(String inputRaw) {
		return inputRaw.replace("U.S.", "US").replace("U.K.", "UK").replace("Mass.", "Massachusetts").replace("Mr.", "Mr");
	}
}
/* File: src/main/java/org/tartarus/snowball/Among.java */
package org.tartarus.snowball;

import java.lang.reflect.Method;

/**
 * One entry of a Snowball "among" table: a search string, the index of its
 * longest matching sub-entry, the result code, and an optional condition
 * method that is looked up reflectively on the owning stemmer.
 */
public class Among {
	/** Length of the search string. */
	public final int s_size;
	/** Characters of the search string. */
	public final char[] s;
	/** Index to the longest matching substring entry. */
	public final int substring_i;
	/** Result of the lookup. */
	public final int result;
	/** Method to use if the substring matches; null when no name was given. */
	public final Method method;
	/** Object to invoke the method on. */
	public final SnowballProgram methodobject;

	/**
	 * @param s            search string
	 * @param substring_i  index to the longest matching substring entry
	 * @param result       result code of the lookup
	 * @param methodname   name of a no-argument method declared on the class
	 *                     of {@code methodobject}; empty for "no method"
	 * @param methodobject object the method is invoked on
	 * @throws RuntimeException wrapping NoSuchMethodException when the named
	 *                          method is not declared
	 */
	public Among(String s, int substring_i, int result, String methodname,
			SnowballProgram methodobject) {
		this.s_size = s.length();
		this.s = s.toCharArray();
		this.substring_i = substring_i;
		this.result = result;
		this.methodobject = methodobject;
		this.method = resolve(methodname, methodobject);
	}

	/** Looks up the declared no-arg method, or returns null for an empty name. */
	private static Method resolve(String methodname, SnowballProgram owner) {
		if (methodname.length() == 0) {
			return null;
		}
		try {
			return owner.getClass().getDeclaredMethod(methodname, new Class[0]);
		} catch (NoSuchMethodException e) {
			throw new RuntimeException(e);
		}
	}
}
/* File: src/main/java/org/tartarus/snowball/SnowballProgram.java */
package org.tartarus.snowball;

import java.lang.reflect.InvocationTargetException;

/**
 * Runtime support for stemmers generated by the Snowball to Java compiler:
 * holds the string being stemmed plus the cursor/limit/slice markers and the
 * primitive matching and slicing operations the generated code calls.
 */
public class SnowballProgram {
	protected SnowballProgram() {
		// fix: (mohaps) changed StringBuffer to StringBuilder
		current = new StringBuilder();
		setCurrent("");
	}

	/**
	 * Set the current string and reset cursor, limits and slice markers to
	 * span it.
	 */
	public void setCurrent(String value) {
		current.replace(0, current.length(), value);
		cursor = 0;
		limit = current.length();
		limit_backward = 0;
		bra = cursor;
		ket = limit;
	}

	/**
	 * Get the current string. The internal buffer is replaced by a fresh,
	 * empty one; cursor and limits are not touched here, so call
	 * {@link #setCurrent(String)} before stemming again.
	 */
	public String getCurrent() {
		String result = current.toString();
		// Make a new StringBuilder. If we reuse the old one, and a user of
		// the library keeps a reference to the buffer returned (for example,
		// by converting it to a String in a way which doesn't force a copy),
		// the buffer size will not decrease, and we will risk wasting a large
		// amount of memory.
		// Thanks to Wolfram Esser for spotting this problem.
		// fix: (mohaps) changed StringBuffer to StringBuilder
		current = new StringBuilder();
		return result;
	}

	// current string
	// fix: (mohaps) changed StringBuffer to StringBuilder
	protected StringBuilder current;

	protected int cursor;
	protected int limit;
	protected int limit_backward;
	protected int bra;
	protected int ket;

	/** Copies the state of {@code other}; the buffer is shared, not cloned. */
	protected void copy_from(SnowballProgram other) {
		current = other.current;
		cursor = other.cursor;
		limit = other.limit;
		limit_backward = other.limit_backward;
		bra = other.bra;
		ket = other.ket;
	}

	/**
	 * Forward test: if the character at the cursor lies in [min, max] and its
	 * bit is set in the bitmap {@code s}, advances the cursor and returns true.
	 */
	protected boolean in_grouping(char[] s, int min, int max) {
		if (cursor >= limit)
			return false;
		char ch = current.charAt(cursor);
		if (ch > max || ch < min)
			return false;
		ch -= min;
		if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
			return false;
		cursor++;
		return true;
	}

	/** Backward counterpart of {@link #in_grouping}: inspects the char before the cursor. */
	protected boolean in_grouping_b(char[] s, int min, int max) {
		if (cursor <= limit_backward)
			return false;
		char ch = current.charAt(cursor - 1);
		if (ch > max || ch < min)
			return false;
		ch -= min;
		if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
			return false;
		cursor--;
		return true;
	}

	/**
	 * Forward test: advances and returns true when the character at the
	 * cursor is outside [min, max] or its bit is clear in the bitmap.
	 */
	protected boolean out_grouping(char[] s, int min, int max) {
		if (cursor >= limit)
			return false;
		char ch = current.charAt(cursor);
		if (ch > max || ch < min) {
			cursor++;
			return true;
		}
		ch -= min;
		if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
			cursor++;
			return true;
		}
		return false;
	}

	/** Backward counterpart of {@link #out_grouping}. */
	protected boolean out_grouping_b(char[] s, int min, int max) {
		if (cursor <= limit_backward)
			return false;
		char ch = current.charAt(cursor - 1);
		if (ch > max || ch < min) {
			cursor--;
			return true;
		}
		ch -= min;
		if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
			cursor--;
			return true;
		}
		return false;
	}

	/** Advances past the character at the cursor when it lies in [min, max]. */
	protected boolean in_range(int min, int max) {
		if (cursor >= limit)
			return false;
		char ch = current.charAt(cursor);
		if (ch > max || ch < min)
			return false;
		cursor++;
		return true;
	}

	/** Backward counterpart of {@link #in_range}. */
	protected boolean in_range_b(int min, int max) {
		if (cursor <= limit_backward)
			return false;
		char ch = current.charAt(cursor - 1);
		if (ch > max || ch < min)
			return false;
		cursor--;
		return true;
	}

	/** Advances past the character at the cursor when it lies outside [min, max]. */
	protected boolean out_range(int min, int max) {
		if (cursor >= limit)
			return false;
		char ch = current.charAt(cursor);
		if (!(ch > max || ch < min))
			return false;
		cursor++;
		return true;
	}

	/** Backward counterpart of {@link #out_range}. */
	protected boolean out_range_b(int min, int max) {
		if (cursor <= limit_backward)
			return false;
		char ch = current.charAt(cursor - 1);
		if (!(ch > max || ch < min))
			return false;
		cursor--;
		return true;
	}

	/** Matches the first {@code s_size} chars of {@code s} at the cursor and advances past them. */
	protected boolean eq_s(int s_size, String s) {
		if (limit - cursor < s_size)
			return false;
		int i;
		for (i = 0; i != s_size; i++) {
			if (current.charAt(cursor + i) != s.charAt(i))
				return false;
		}
		cursor += s_size;
		return true;
	}

	/** Matches {@code s} immediately before the cursor and moves the cursor back over it. */
	protected boolean eq_s_b(int s_size, String s) {
		if (cursor - limit_backward < s_size)
			return false;
		int i;
		for (i = 0; i != s_size; i++) {
			if (current.charAt(cursor - s_size + i) != s.charAt(i))
				return false;
		}
		cursor -= s_size;
		return true;
	}

	protected boolean eq_v(CharSequence s) {
		return eq_s(s.length(), s.toString());
	}

	protected boolean eq_v_b(CharSequence s) {
		return eq_s_b(s.length(), s.toString());
	}

	/**
	 * Invokes the condition method attached to an among entry.
	 * Reflection failures are treated as "condition not met".
	 */
	private static boolean invokeAmongMethod(Among w) {
		try {
			Object resobj = w.method.invoke(w.methodobject, new Object[0]);
			return resobj.toString().equals("true");
		} catch (InvocationTargetException e) {
			return false;
			// FIXME - debug message
		} catch (IllegalAccessException e) {
			return false;
			// FIXME - debug message
		}
	}

	/**
	 * Forward lookup: binary-searches the table {@code v} for an entry that
	 * matches the text at the cursor, then walks the substring_i chain until
	 * an entry matches (and its optional method accepts).
	 *
	 * @return the matching entry's result, or 0 when nothing matches
	 */
	protected int find_among(Among v[], int v_size) {
		int i = 0;
		int j = v_size;

		int c = cursor;
		int l = limit;

		int common_i = 0;
		int common_j = 0;

		boolean first_key_inspected = false;

		while (true) {
			int k = i + ((j - i) >> 1);
			int diff = 0;
			int common = common_i < common_j ? common_i : common_j; // smaller
			Among w = v[k];
			int i2;
			for (i2 = common; i2 < w.s_size; i2++) {
				if (c + common == l) {
					diff = -1;
					break;
				}
				diff = current.charAt(c + common) - w.s[i2];
				if (diff != 0)
					break;
				common++;
			}
			if (diff < 0) {
				j = k;
				common_j = common;
			} else {
				i = k;
				common_i = common;
			}
			if (j - i <= 1) {
				if (i > 0)
					break; // v->s has been inspected
				if (j == i)
					break; // only one item in v

				// - but now we need to go round once more to get
				// v->s inspected. This looks messy, but is actually
				// the optimal approach.

				if (first_key_inspected)
					break;
				first_key_inspected = true;
			}
		}
		while (true) {
			Among w = v[i];
			if (common_i >= w.s_size) {
				cursor = c + w.s_size;
				if (w.method == null)
					return w.result;
				boolean res = invokeAmongMethod(w);
				// the invoked method may have moved the cursor
				cursor = c + w.s_size;
				if (res)
					return w.result;
			}
			i = w.substring_i;
			if (i < 0)
				return 0;
		}
	}

	// find_among_b is for backwards processing. Same comments apply
	protected int find_among_b(Among v[], int v_size) {
		int i = 0;
		int j = v_size;

		int c = cursor;
		int lb = limit_backward;

		int common_i = 0;
		int common_j = 0;

		boolean first_key_inspected = false;

		while (true) {
			int k = i + ((j - i) >> 1);
			int diff = 0;
			int common = common_i < common_j ? common_i : common_j;
			Among w = v[k];
			int i2;
			for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
				if (c - common == lb) {
					diff = -1;
					break;
				}
				diff = current.charAt(c - 1 - common) - w.s[i2];
				if (diff != 0)
					break;
				common++;
			}
			if (diff < 0) {
				j = k;
				common_j = common;
			} else {
				i = k;
				common_i = common;
			}
			if (j - i <= 1) {
				if (i > 0)
					break;
				if (j == i)
					break;
				if (first_key_inspected)
					break;
				first_key_inspected = true;
			}
		}
		while (true) {
			Among w = v[i];
			if (common_i >= w.s_size) {
				cursor = c - w.s_size;
				if (w.method == null)
					return w.result;

				boolean res = invokeAmongMethod(w);
				// the invoked method may have moved the cursor
				cursor = c - w.s_size;
				if (res)
					return w.result;
			}
			i = w.substring_i;
			if (i < 0)
				return 0;
		}
	}

	/*
	 * to replace chars between c_bra and c_ket in current by the chars in s.
	 * Returns the change in length; limit and cursor are shifted accordingly.
	 */
	protected int replace_s(int c_bra, int c_ket, String s) {
		int adjustment = s.length() - (c_ket - c_bra);
		current.replace(c_bra, c_ket, s);
		limit += adjustment;
		if (cursor >= c_ket)
			cursor += adjustment;
		else if (cursor > c_bra)
			cursor = c_bra;
		return adjustment;
	}

	/** Reports (to stderr only) a slice whose bra/ket markers are inconsistent. */
	protected void slice_check() {
		if (bra < 0 || bra > ket || ket > limit || limit > current.length()) // this line could be removed
		{
			System.err.println("faulty slice operation");
			// FIXME: report error somehow.
			/*
			 * fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0);
			 * exit(1);
			 */
		}
	}

	/** Replaces the current slice [bra, ket) with {@code s}. */
	protected void slice_from(String s) {
		slice_check();
		replace_s(bra, ket, s);
	}

	protected void slice_from(CharSequence s) {
		slice_from(s.toString());
	}

	/** Deletes the current slice [bra, ket). */
	protected void slice_del() {
		slice_from("");
	}

	/** Replaces [c_bra, c_ket) with {@code s} and shifts bra/ket that lie at or after c_bra. */
	protected void insert(int c_bra, int c_ket, String s) {
		int adjustment = replace_s(c_bra, c_ket, s);
		if (c_bra <= bra)
			bra += adjustment;
		if (c_bra <= ket)
			ket += adjustment;
	}

	protected void insert(int c_bra, int c_ket, CharSequence s) {
		insert(c_bra, c_ket, s.toString());
	}

	/* Copy the slice into the supplied StringBuffer */
	protected StringBuffer slice_to(StringBuffer s) {
		slice_check();
		s.replace(0, s.length(), current.substring(bra, ket));
		return s;
	}

	/* Copy the slice into the supplied StringBuilder */
	protected StringBuilder slice_to(StringBuilder s) {
		slice_check();
		s.replace(0, s.length(), current.substring(bra, ket));
		return s;
	}

	/* Copy the text from the start up to limit into the supplied StringBuffer */
	protected StringBuffer assign_to(StringBuffer s) {
		s.replace(0, s.length(), current.substring(0, limit));
		return s;
	}

	/* Copy the text from the start up to limit into the supplied StringBuilder */
	protected StringBuilder assign_to(StringBuilder s) {
		s.replace(0, s.length(), current.substring(0, limit));
		return s;
	}

	/*
	 * extern void debug(struct SN_env * z, int number, int line_count) { int i;
	 * int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '",
	 * number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'",
	 * number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i)
	 * printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if
	 * (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) {
	 * int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } }
	 * printf("'\n"); }
	 */

}
printf("]"); if (z->l == i) printf("}"); if (i < limit) { 430 | * int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } 431 | * printf("'\n"); } 432 | */ 433 | 434 | }; 435 | -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/SnowballStemmer.java: -------------------------------------------------------------------------------- 1 | package org.tartarus.snowball; 2 | 3 | 4 | 5 | public abstract class SnowballStemmer extends SnowballProgram { 6 | public abstract boolean stem(); 7 | }; 8 | -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/ext/danishStemmer.java: -------------------------------------------------------------------------------- 1 | // This file was generated automatically by the Snowball to Java compiler 2 | 3 | package org.tartarus.snowball.ext; 4 | 5 | import org.tartarus.snowball.Among; 6 | 7 | /** 8 | * This class was automatically generated by a Snowball to Java compiler It 9 | * implements the stemming algorithm defined by a snowball script. 
/* File: src/main/java/org/tartarus/snowball/ext/danishStemmer.java */
// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

/**
 * This class was automatically generated by a Snowball to Java compiler It
 * implements the stemming algorithm defined by a snowball script.
 *
 * The "line N" comments inside the methods refer to lines of that script.
 */

public class danishStemmer extends org.tartarus.snowball.SnowballStemmer {

	private static final long serialVersionUID = 1L;

	// Owner object handed to every Among entry; all entries below use an empty
	// method name, so no method is ever looked up on it.
	private final static danishStemmer methodObject = new danishStemmer();

	// Suffix table searched by r_main_suffix (result 1 or 2 selects the action).
	private final static Among a_0[] = {
			new Among("hed", -1, 1, "", methodObject),
			new Among("ethed", 0, 1, "", methodObject),
			new Among("ered", -1, 1, "", methodObject),
			new Among("e", -1, 1, "", methodObject),
			new Among("erede", 3, 1, "", methodObject),
			new Among("ende", 3, 1, "", methodObject),
			new Among("erende", 5, 1, "", methodObject),
			new Among("ene", 3, 1, "", methodObject),
			new Among("erne", 3, 1, "", methodObject),
			new Among("ere", 3, 1, "", methodObject),
			new Among("en", -1, 1, "", methodObject),
			new Among("heden", 10, 1, "", methodObject),
			new Among("eren", 10, 1, "", methodObject),
			new Among("er", -1, 1, "", methodObject),
			new Among("heder", 13, 1, "", methodObject),
			new Among("erer", 13, 1, "", methodObject),
			new Among("s", -1, 2, "", methodObject),
			new Among("heds", 16, 1, "", methodObject),
			new Among("es", 16, 1, "", methodObject),
			new Among("endes", 18, 1, "", methodObject),
			new Among("erendes", 19, 1, "", methodObject),
			new Among("enes", 18, 1, "", methodObject),
			new Among("ernes", 18, 1, "", methodObject),
			new Among("eres", 18, 1, "", methodObject),
			new Among("ens", 16, 1, "", methodObject),
			new Among("hedens", 24, 1, "", methodObject),
			new Among("erens", 24, 1, "", methodObject),
			new Among("ers", 16, 1, "", methodObject),
			new Among("ets", 16, 1, "", methodObject),
			new Among("erets", 28, 1, "", methodObject),
			new Among("et", -1, 1, "", methodObject),
			new Among("eret", 30, 1, "", methodObject) };

	// Endings tested by r_consonant_pair.
	private final static Among a_1[] = {
			new Among("gd", -1, -1, "", methodObject),
			new Among("dt", -1, -1, "", methodObject),
			new Among("gt", -1, -1, "", methodObject),
			new Among("kt", -1, -1, "", methodObject) };

	// Suffix table searched by r_other_suffix.
	private final static Among a_2[] = {
			new Among("ig", -1, 1, "", methodObject),
			new Among("lig", 0, 1, "", methodObject),
			new Among("elig", 1, 1, "", methodObject),
			new Among("els", -1, 1, "", methodObject),
			new Among("l\u00F8st", -1, 2, "", methodObject) };

	// Character-class bitmap passed to in_grouping/out_grouping with the range
	// 97..248 (presumably the vowel set of the script - confirm against it).
	private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 48, 0, 128 };

	// Character-class bitmap used with the range 97..229 in r_main_suffix.
	private static final char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 0, 0, 0, 16 };

	private int I_x;
	private int I_p1;
	private java.lang.StringBuilder S_ch = new java.lang.StringBuilder();

	/** Copies the stemmer-specific marks and the base-class state from {@code other}. */
	private void copy_from(danishStemmer other) {
		I_x = other.I_x;
		I_p1 = other.I_p1;
		S_ch = other.S_ch;
		super.copy_from(other);
	}

	/**
	 * Computes the region mark I_p1: I_x is the position three characters
	 * in, and I_p1 is the position just after the first non-g_v character
	 * that follows a g_v character, raised to I_x when smaller. Returns false
	 * (leaving I_p1 = limit) when the word is too short or no such position
	 * exists.
	 */
	private boolean r_mark_regions() {
		int v_1;
		int v_2;
		// (, line 29
		I_p1 = limit;
		// test, line 33
		v_1 = cursor;
		// (, line 33
		// hop, line 33
		{
			int c = cursor + 3;
			if (0 > c || c > limit) {
				return false;
			}
			cursor = c;
		}
		// setmark x, line 33
		I_x = cursor;
		cursor = v_1;
		// goto, line 34
		golab0: while (true) {
			v_2 = cursor;
			lab1: do {
				if (!(in_grouping(g_v, 97, 248))) {
					break lab1;
				}
				cursor = v_2;
				break golab0;
			} while (false);
			cursor = v_2;
			if (cursor >= limit) {
				return false;
			}
			cursor++;
		}
		// gopast, line 34
		golab2: while (true) {
			lab3: do {
				if (!(out_grouping(g_v, 97, 248))) {
					break lab3;
				}
				break golab2;
			} while (false);
			if (cursor >= limit) {
				return false;
			}
			cursor++;
		}
		// setmark p1, line 34
		I_p1 = cursor;
		// try, line 35
		lab4: do {
			// (, line 35
			if (!(I_p1 < I_x)) {
				break lab4;
			}
			I_p1 = I_x;
		} while (false);
		return true;
	}

	/**
	 * Searches a_0 backwards within the region starting at I_p1. Result 1
	 * deletes the matched suffix; result 2 deletes it only when the preceding
	 * character is in g_s_ending.
	 */
	private boolean r_main_suffix() {
		int among_var;
		int v_1;
		int v_2;
		// (, line 40
		// setlimit, line 41
		v_1 = limit - cursor;
		// tomark, line 41
		if (cursor < I_p1) {
			return false;
		}
		cursor = I_p1;
		v_2 = limit_backward;
		limit_backward = cursor;
		cursor = limit - v_1;
		// (, line 41
		// [, line 41
		ket = cursor;
		// substring, line 41
		among_var = find_among_b(a_0, 32);
		if (among_var == 0) {
			limit_backward = v_2;
			return false;
		}
		// ], line 41
		bra = cursor;
		limit_backward = v_2;
		switch (among_var) {
		case 0:
			return false;
		case 1:
			// (, line 48
			// delete, line 48
			slice_del();
			break;
		case 2:
			// (, line 50
			if (!(in_grouping_b(g_s_ending, 97, 229))) {
				return false;
			}
			// delete, line 50
			slice_del();
			break;
		}
		return true;
	}

	/**
	 * When the text before the cursor ends (within the I_p1 region) in one of
	 * the a_1 pairs, deletes the final character of that pair.
	 */
	private boolean r_consonant_pair() {
		int v_1;
		int v_2;
		int v_3;
		// (, line 54
		// test, line 55
		v_1 = limit - cursor;
		// (, line 55
		// setlimit, line 56
		v_2 = limit - cursor;
		// tomark, line 56
		if (cursor < I_p1) {
			return false;
		}
		cursor = I_p1;
		v_3 = limit_backward;
		limit_backward = cursor;
		cursor = limit - v_2;
		// (, line 56
		// [, line 56
		ket = cursor;
		// substring, line 56
		if (find_among_b(a_1, 4) == 0) {
			limit_backward = v_3;
			return false;
		}
		// ], line 56
		bra = cursor;
		limit_backward = v_3;
		cursor = limit - v_1;
		// next, line 62
		if (cursor <= limit_backward) {
			return false;
		}
		cursor--;
		// ], line 62
		bra = cursor;
		// delete, line 62
		slice_del();
		return true;
	}

	/**
	 * First removes a trailing "st" that is preceded by "ig"; then searches
	 * a_2 within the I_p1 region: result 1 deletes the suffix and attempts
	 * r_consonant_pair, result 2 replaces the suffix with "l\u00F8s".
	 */
	private boolean r_other_suffix() {
		int among_var;
		int v_1;
		int v_2;
		int v_3;
		int v_4;
		// (, line 65
		// do, line 66
		v_1 = limit - cursor;
		lab0: do {
			// (, line 66
			// [, line 66
			ket = cursor;
			// literal, line 66
			if (!(eq_s_b(2, "st"))) {
				break lab0;
			}
			// ], line 66
			bra = cursor;
			// literal, line 66
			if (!(eq_s_b(2, "ig"))) {
				break lab0;
			}
			// delete, line 66
			slice_del();
		} while (false);
		cursor = limit - v_1;
		// setlimit, line 67
		v_2 = limit - cursor;
		// tomark, line 67
		if (cursor < I_p1) {
			return false;
		}
		cursor = I_p1;
		v_3 = limit_backward;
		limit_backward = cursor;
		cursor = limit - v_2;
		// (, line 67
		// [, line 67
		ket = cursor;
		// substring, line 67
		among_var = find_among_b(a_2, 5);
		if (among_var == 0) {
			limit_backward = v_3;
			return false;
		}
		// ], line 67
		bra = cursor;
		limit_backward = v_3;
		switch (among_var) {
		case 0:
			return false;
		case 1:
			// (, line 70
			// delete, line 70
			slice_del();
			// do, line 70
			v_4 = limit - cursor;
			lab1: do {
				// call consonant_pair, line 70
				if (!r_consonant_pair()) {
					break lab1;
				}
			} while (false);
			cursor = limit - v_4;
			break;
		case 2:
			// (, line 72
			// <-, line 72
			slice_from("l\u00F8s");
			break;
		}
		return true;
	}

	/**
	 * Within the I_p1 region: captures the final character into S_ch when it
	 * is not in g_v, and deletes it if the same character also precedes it.
	 */
	private boolean r_undouble() {
		int v_1;
		int v_2;
		// (, line 75
		// setlimit, line 76
		v_1 = limit - cursor;
		// tomark, line 76
		if (cursor < I_p1) {
			return false;
		}
		cursor = I_p1;
		v_2 = limit_backward;
		limit_backward = cursor;
		cursor = limit - v_1;
		// (, line 76
		// [, line 76
		ket = cursor;
		if (!(out_grouping_b(g_v, 97, 248))) {
			limit_backward = v_2;
			return false;
		}
		// ], line 76
		bra = cursor;
		// -> ch, line 76
		S_ch = slice_to(S_ch);
		limit_backward = v_2;
		// name ch, line 77
		if (!(eq_v_b(S_ch))) {
			return false;
		}
		// delete, line 78
		slice_del();
		return true;
	}

	/**
	 * Runs mark_regions, then - processing backwards from the end - the
	 * main_suffix, consonant_pair, other_suffix and undouble steps. A failing
	 * step is simply skipped; the method always returns true.
	 */
	public boolean stem() {
		int v_1;
		int v_2;
		int v_3;
		int v_4;
		int v_5;
		// (, line 82
		// do, line 84
		v_1 = cursor;
		lab0: do {
			// call mark_regions, line 84
			if (!r_mark_regions()) {
				break lab0;
			}
		} while (false);
		cursor = v_1;
		// backwards, line 85
		limit_backward = cursor;
		cursor = limit;
		// (, line 85
		// do, line 86
		v_2 = limit - cursor;
		lab1: do {
			// call main_suffix, line 86
			if (!r_main_suffix()) {
				break lab1;
			}
		} while (false);
		cursor = limit - v_2;
		// do, line 87
		v_3 = limit - cursor;
		lab2: do {
			// call consonant_pair, line 87
			if (!r_consonant_pair()) {
				break lab2;
			}
		} while (false);
		cursor = limit - v_3;
		// do, line 88
		v_4 = limit - cursor;
		lab3: do {
			// call other_suffix, line 88
			if (!r_other_suffix()) {
				break lab3;
			}
		} while (false);
		cursor = limit - v_4;
		// do, line 89
		v_5 = limit - cursor;
		lab4: do {
			// call undouble, line 89
			if (!r_undouble()) {
				break lab4;
			}
		} while (false);
		cursor = limit - v_5;
		cursor = limit_backward;
		return true;
	}

	/** Any two danishStemmer instances compare equal, regardless of state. */
	public boolean equals(Object o) {
		return o instanceof danishStemmer;
	}

	/** Constant per class, consistent with {@link #equals(Object)}. */
	public int hashCode() {
		return danishStemmer.class.getName().hashCode();
	}

}
/src/main/java/org/tartarus/snowball/ext/germanStemmer.java:
--------------------------------------------------------------------------------
// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

/**
 * German stemmer generated by the Snowball-to-Java compiler.
 *
 * The class implements the stemming algorithm defined by a Snowball script;
 * the "// ..., line N" breadcrumbs throughout refer to lines of that script.
 * Do not edit the logic by hand: statement order and cursor save/restore
 * pairs are significant. Buffer state (cursor, limit, limit_backward, bra,
 * ket) and the helpers (eq_s, in_grouping, find_among, slice_del, ...) are
 * inherited and not defined in this file.
 */

public class germanStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // Shared instance handed to every Among entry below; how Among uses it is
    // defined in org.tartarus.snowball.Among (not visible here).
    private final static germanStemmer methodObject = new germanStemmer();

    // Table searched by r_postlude(). The third constructor argument appears
    // to be the value find_among() returns (it matches the switch cases in
    // r_postlude) -- confirm against Among/SnowballProgram.
    private final static Among a_0[] = {
            new Among("", -1, 6, "", methodObject),
            new Among("U", 0, 2, "", methodObject),
            new Among("Y", 0, 1, "", methodObject),
            new Among("\u00E4", 0, 3, "", methodObject),
            new Among("\u00F6", 0, 4, "", methodObject),
            new Among("\u00FC", 0, 5, "", methodObject) };

    // Suffix table for the first step of r_standard_suffix().
    private final static Among a_1[] = {
            new Among("e", -1, 2, "", methodObject),
            new Among("em", -1, 1, "", methodObject),
            new Among("en", -1, 2, "", methodObject),
            new Among("ern", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("s", -1, 3, "", methodObject),
            new Among("es", 5, 2, "", methodObject) };

    // Suffix table for the second step of r_standard_suffix().
    private final static Among a_2[] = {
            new Among("en", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("st", -1, 2, "", methodObject),
            new Among("est", 2, 1, "", methodObject) };

    // Secondary suffix table used only after a "keit" removal (case 4 of the
    // third step of r_standard_suffix()).
    private final static Among a_3[] = {
            new Among("ig", -1, 1, "", methodObject),
            new Among("lich", -1, 1, "", methodObject) };

    // Suffix table for the third step of r_standard_suffix().
    private final static Among a_4[] = {
            new Among("end", -1, 1, "", methodObject),
            new Among("ig", -1, 2, "", methodObject),
            new Among("ung", -1, 1, "", methodObject),
            new Among("lich", -1, 3, "", methodObject),
            new Among("isch", -1, 2, "", methodObject),
            new Among("ik", -1, 2, "", methodObject),
            new Among("heit", -1, 3, "", methodObject),
            new Among("keit", -1, 4, "", methodObject) };

    // Character-set table passed to in_grouping/out_grouping together with
    // the code-point bounds 97..252. Named "v" in the script; presumably the
    // vowel set -- confirm against the Snowball source.
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 8, 0, 32, 8 };

    // Character-set table (bounds 98..116) that must precede a removable "s".
    private static final char g_s_ending[] = { 117, 30, 5 };

    // Character-set table (bounds 98..116) that must precede a removable "st".
    private static final char g_st_ending[] = { 117, 30, 4 };

    // Marks computed by r_mark_regions(): I_x is the position three
    // characters after the start; I_p1 / I_p2 are the positions tested by
    // r_R1() / r_R2().
    private int I_x;
    private int I_p2;
    private int I_p1;

    /**
     * Copies the three region marks from another instance, then delegates to
     * the superclass copy_from for the remaining state.
     */
    private void copy_from(germanStemmer other) {
        I_x = other.I_x;
        I_p2 = other.I_p2;
        I_p1 = other.I_p1;
        super.copy_from(other);
    }

    /**
     * Forward pre-processing pass.
     *
     * First loop: scans from the saved cursor and replaces every U+00DF
     * (sharp s) with "ss". Second loop: wherever a "u" or "y" has a g_v
     * character on both sides it is replaced by "U" / "Y" (r_postlude()
     * maps these back to lower case).
     *
     * @return always true
     */
    private boolean r_prelude() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        int v_5;
        int v_6;
        // (, line 33
        // test, line 35
        v_1 = cursor;
        // repeat, line 35
        replab0: while (true) {
            v_2 = cursor;
            lab1: do {
                // (, line 35
                // or, line 38
                lab2: do {
                    v_3 = cursor;
                    lab3: do {
                        // (, line 36
                        // [, line 37
                        bra = cursor;
                        // literal, line 37
                        if (!(eq_s(1, "\u00DF"))) {
                            break lab3;
                        }
                        // ], line 37
                        ket = cursor;
                        // <-, line 37
                        slice_from("ss");
                        break lab2;
                    } while (false);
                    cursor = v_3;
                    // next, line 38
                    if (cursor >= limit) {
                        break lab1;
                    }
                    cursor++;
                } while (false);
                continue replab0;
            } while (false);
            cursor = v_2;
            break replab0;
        }
        // "test" semantics: rewind to where the first loop started.
        cursor = v_1;
        // repeat, line 41
        replab4: while (true) {
            v_4 = cursor;
            lab5: do {
                // goto, line 41
                golab6: while (true) {
                    v_5 = cursor;
                    lab7: do {
                        // (, line 41
                        if (!(in_grouping(g_v, 97, 252))) {
                            break lab7;
                        }
                        // [, line 42
                        bra = cursor;
                        // or, line 42
                        lab8: do {
                            v_6 = cursor;
                            lab9: do {
                                // (, line 42
                                // literal, line 42
                                if (!(eq_s(1, "u"))) {
                                    break lab9;
                                }
                                // ], line 42
                                ket = cursor;
                                if (!(in_grouping(g_v, 97, 252))) {
                                    break lab9;
                                }
                                // <-, line 42
                                slice_from("U");
                                break lab8;
                            } while (false);
                            cursor = v_6;
                            // (, line 43
                            // literal, line 43
                            if (!(eq_s(1, "y"))) {
                                break lab7;
                            }
                            // ], line 43
                            ket = cursor;
                            if (!(in_grouping(g_v, 97, 252))) {
                                break lab7;
                            }
                            // <-, line 43
                            slice_from("Y");
                        } while (false);
                        cursor = v_5;
                        break golab6;
                    } while (false);
                    cursor = v_5;
                    if (cursor >= limit) {
                        break lab5;
                    }
                    cursor++;
                }
                continue replab4;
            } while (false);
            cursor = v_4;
            break replab4;
        }
        return true;
    }

    /**
     * Computes the marks I_x, I_p1 and I_p2.
     *
     * Both I_p1 and I_p2 start at limit. I_x is set three characters after
     * the cursor. I_p1 is the position after the first g_v character that is
     * followed by a non-g_v character, raised to I_x if it falls before it.
     * I_p2 is found by repeating the same two scans from I_p1's scan position.
     *
     * @return false if fewer than three characters are available or a scan
     *         reaches limit (marks already assigned keep their values);
     *         true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        // (, line 47
        I_p1 = limit;
        I_p2 = limit;
        // test, line 52
        v_1 = cursor;
        // (, line 52
        // hop, line 52
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 52
        I_x = cursor;
        cursor = v_1;
        // gopast, line 54
        golab0: while (true) {
            lab1: do {
                if (!(in_grouping(g_v, 97, 252))) {
                    break lab1;
                }
                break golab0;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 54
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 252))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 54
        I_p1 = cursor;
        // try, line 55
        lab4: do {
            // (, line 55
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        // gopast, line 56
        golab5: while (true) {
            lab6: do {
                if (!(in_grouping(g_v, 97, 252))) {
                    break lab6;
                }
                break golab5;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 56
        golab7: while (true) {
            lab8: do {
                if (!(out_grouping(g_v, 97, 252))) {
                    break lab8;
                }
                break golab7;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p2, line 56
        I_p2 = cursor;
        return true;
    }

    /**
     * Forward post-processing pass: repeatedly matches a_0 at the cursor and
     * rewrites "Y" to "y", "U" to "u", and the three umlaut entries of a_0 to
     * "a", "o" and "u"; any other character is skipped. Stops when the end of
     * the buffer is reached.
     *
     * @return always true
     */
    private boolean r_postlude() {
        int among_var;
        int v_1;
        // repeat, line 60
        replab0: while (true) {
            v_1 = cursor;
            lab1: do {
                // (, line 60
                // [, line 62
                bra = cursor;
                // substring, line 62
                among_var = find_among(a_0, 6);
                if (among_var == 0) {
                    break lab1;
                }
                // ], line 62
                ket = cursor;
                switch (among_var) {
                case 0:
                    break lab1;
                case 1:
                    // (, line 63
                    // <-, line 63
                    slice_from("y");
                    break;
                case 2:
                    // (, line 64
                    // <-, line 64
                    slice_from("u");
                    break;
                case 3:
                    // (, line 65
                    // <-, line 65
                    slice_from("a");
                    break;
                case 4:
                    // (, line 66
                    // <-, line 66
                    slice_from("o");
                    break;
                case 5:
                    // (, line 67
                    // <-, line 67
                    slice_from("u");
                    break;
                case 6:
                    // (, line 68
                    // next, line 68
                    if (cursor >= limit) {
                        break lab1;
                    }
                    cursor++;
                    break;
                }
                continue replab0;
            } while (false);
            cursor = v_1;
            break replab0;
        }
        return true;
    }

    /** @return true when the cursor is at or after the I_p1 mark. */
    private boolean r_R1() {
        if (!(I_p1 <= cursor)) {
            return false;
        }
        return true;
    }

    /** @return true when the cursor is at or after the I_p2 mark. */
    private boolean r_R2() {
        if (!(I_p2 <= cursor)) {
            return false;
        }
        return true;
    }

    /**
     * Backward suffix-stripping pass, made of three independent steps. Each
     * step saves the cursor as an offset from limit and restores it
     * afterwards, so a step that does not apply has no effect on the next.
     *
     * Step 1 (a_1, requires r_R1): result 1 deletes the suffix; result 2
     * deletes it and then also deletes a preceding "s" when that "s" follows
     * "nis"; result 3 ("s") deletes only when preceded by a g_s_ending
     * character.
     *
     * Step 2 (a_2, requires r_R1): result 1 deletes the suffix; result 2
     * ("st") deletes only when preceded by a g_st_ending character and at
     * least three more characters lie before that.
     *
     * Step 3 (a_4, requires r_R2): result 1 deletes, then tries to delete a
     * preceding "ig" (not after "e", in R2); result 2 deletes unless preceded
     * by "e"; result 3 deletes, then tries to delete a preceding "er"/"en"
     * (in R1); result 4 deletes, then tries to delete a preceding a_3 suffix
     * (in R2).
     *
     * @return always true
     */
    private boolean r_standard_suffix() {
        int among_var;
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        int v_5;
        int v_6;
        int v_7;
        int v_8;
        int v_9;
        int v_10;
        // (, line 78
        // do, line 79
        v_1 = limit - cursor;
        lab0: do {
            // (, line 79
            // [, line 80
            ket = cursor;
            // substring, line 80
            among_var = find_among_b(a_1, 7);
            if (among_var == 0) {
                break lab0;
            }
            // ], line 80
            bra = cursor;
            // call R1, line 80
            if (!r_R1()) {
                break lab0;
            }
            switch (among_var) {
            case 0:
                break lab0;
            case 1:
                // (, line 82
                // delete, line 82
                slice_del();
                break;
            case 2:
                // (, line 85
                // delete, line 85
                slice_del();
                // try, line 86
                v_2 = limit - cursor;
                lab1: do {
                    // (, line 86
                    // [, line 86
                    ket = cursor;
                    // literal, line 86
                    if (!(eq_s_b(1, "s"))) {
                        cursor = limit - v_2;
                        break lab1;
                    }
                    // ], line 86
                    bra = cursor;
                    // literal, line 86
                    if (!(eq_s_b(3, "nis"))) {
                        cursor = limit - v_2;
                        break lab1;
                    }
                    // delete, line 86
                    slice_del();
                } while (false);
                break;
            case 3:
                // (, line 89
                if (!(in_grouping_b(g_s_ending, 98, 116))) {
                    break lab0;
                }
                // delete, line 89
                slice_del();
                break;
            }
        } while (false);
        cursor = limit - v_1;
        // do, line 93
        v_3 = limit - cursor;
        lab2: do {
            // (, line 93
            // [, line 94
            ket = cursor;
            // substring, line 94
            among_var = find_among_b(a_2, 4);
            if (among_var == 0) {
                break lab2;
            }
            // ], line 94
            bra = cursor;
            // call R1, line 94
            if (!r_R1()) {
                break lab2;
            }
            switch (among_var) {
            case 0:
                break lab2;
            case 1:
                // (, line 96
                // delete, line 96
                slice_del();
                break;
            case 2:
                // (, line 99
                if (!(in_grouping_b(g_st_ending, 98, 116))) {
                    break lab2;
                }
                // hop, line 99
                {
                    int c = cursor - 3;
                    if (limit_backward > c || c > limit) {
                        break lab2;
                    }
                    cursor = c;
                }
                // delete, line 99
                slice_del();
                break;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 103
        v_4 = limit - cursor;
        lab3: do {
            // (, line 103
            // [, line 104
            ket = cursor;
            // substring, line 104
            among_var = find_among_b(a_4, 8);
            if (among_var == 0) {
                break lab3;
            }
            // ], line 104
            bra = cursor;
            // call R2, line 104
            if (!r_R2()) {
                break lab3;
            }
            switch (among_var) {
            case 0:
                break lab3;
            case 1:
                // (, line 106
                // delete, line 106
                slice_del();
                // try, line 107
                v_5 = limit - cursor;
                lab4: do {
                    // (, line 107
                    // [, line 107
                    ket = cursor;
                    // literal, line 107
                    if (!(eq_s_b(2, "ig"))) {
                        cursor = limit - v_5;
                        break lab4;
                    }
                    // ], line 107
                    bra = cursor;
                    // not, line 107
                    {
                        v_6 = limit - cursor;
                        lab5: do {
                            // literal, line 107
                            if (!(eq_s_b(1, "e"))) {
                                break lab5;
                            }
                            cursor = limit - v_5;
                            break lab4;
                        } while (false);
                        cursor = limit - v_6;
                    }
                    // call R2, line 107
                    if (!r_R2()) {
                        cursor = limit - v_5;
                        break lab4;
                    }
                    // delete, line 107
                    slice_del();
                } while (false);
                break;
            case 2:
                // (, line 110
                // not, line 110
                {
                    v_7 = limit - cursor;
                    lab6: do {
                        // literal, line 110
                        if (!(eq_s_b(1, "e"))) {
                            break lab6;
                        }
                        break lab3;
                    } while (false);
                    cursor = limit - v_7;
                }
                // delete, line 110
                slice_del();
                break;
            case 3:
                // (, line 113
                // delete, line 113
                slice_del();
                // try, line 114
                v_8 = limit - cursor;
                lab7: do {
                    // (, line 114
                    // [, line 115
                    ket = cursor;
                    // or, line 115
                    lab8: do {
                        v_9 = limit - cursor;
                        lab9: do {
                            // literal, line 115
                            if (!(eq_s_b(2, "er"))) {
                                break lab9;
                            }
                            break lab8;
                        } while (false);
                        cursor = limit - v_9;
                        // literal, line 115
                        if (!(eq_s_b(2, "en"))) {
                            cursor = limit - v_8;
                            break lab7;
                        }
                    } while (false);
                    // ], line 115
                    bra = cursor;
                    // call R1, line 115
                    if (!r_R1()) {
                        cursor = limit - v_8;
                        break lab7;
                    }
                    // delete, line 115
                    slice_del();
                } while (false);
                break;
            case 4:
                // (, line 119
                // delete, line 119
                slice_del();
                // try, line 120
                v_10 = limit - cursor;
                lab10: do {
                    // (, line 120
                    // [, line 121
                    ket = cursor;
                    // substring, line 121
                    among_var = find_among_b(a_3, 2);
                    if (among_var == 0) {
                        cursor = limit - v_10;
                        break lab10;
                    }
                    // ], line 121
                    bra = cursor;
                    // call R2, line 121
                    if (!r_R2()) {
                        cursor = limit - v_10;
                        break lab10;
                    }
                    switch (among_var) {
                    case 0:
                        cursor = limit - v_10;
                        break lab10;
                    case 1:
                        // (, line 123
                        // delete, line 123
                        slice_del();
                        break;
                    }
                } while (false);
                break;
            }
        } while (false);
        cursor = limit - v_4;
        return true;
    }

    /**
     * Entry point: stems the word currently held in the inherited buffer.
     *
     * Runs r_prelude() and r_mark_regions() forwards, switches to backward
     * mode (limit_backward = cursor, cursor = limit) for
     * r_standard_suffix(), then returns to the start and runs r_postlude().
     * The result of each sub-step is ignored and the cursor is restored
     * after it.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 133
        // do, line 134
        v_1 = cursor;
        lab0: do {
            // call prelude, line 134
            if (!r_prelude()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // do, line 135
        v_2 = cursor;
        lab1: do {
            // call mark_regions, line 135
            if (!r_mark_regions()) {
                break lab1;
            }
        } while (false);
        cursor = v_2;
        // backwards, line 136
        limit_backward = cursor;
        cursor = limit;
        // do, line 137
        v_3 = limit - cursor;
        lab2: do {
            // call standard_suffix, line 137
            if (!r_standard_suffix()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        cursor = limit_backward; // do, line 138
        v_4 = cursor;
        lab3: do {
            // call postlude, line 138
            if (!r_postlude()) {
                break lab3;
            }
        } while (false);
        cursor = v_4;
        return true;
    }

    /** Any two germanStemmer instances compare equal, regardless of state. */
    public boolean equals(Object o) {
        return o instanceof germanStemmer;
    }

    /** Constant hash derived from the class name, consistent with equals(). */
    public int hashCode() {
        return germanStemmer.class.getName().hashCode();
    }

}

--------------------------------------------------------------------------------
/src/main/java/org/tartarus/snowball/ext/norwegianStemmer.java:
--------------------------------------------------------------------------------
// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

/**
 * This class was automatically generated by a Snowball to Java compiler It
 * implements the stemming algorithm defined by a snowball script.
 */

// Norwegian stemmer. The "// ..., line N" breadcrumbs refer to lines of the
// originating Snowball script. Buffer state (cursor, limit, limit_backward,
// bra, ket) and the helpers (in_grouping, find_among_b, slice_del, ...) are
// inherited and not defined in this file. Do not edit the logic by hand:
// statement order and cursor save/restore pairs are significant.
public class norwegianStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // Shared instance handed to every Among entry below; how Among uses it is
    // defined in org.tartarus.snowball.Among (not visible here).
    private final static norwegianStemmer methodObject = new norwegianStemmer();

    // Suffix table for r_main_suffix(). The third constructor argument
    // appears to be the value find_among_b() returns (it matches the switch
    // cases there) -- confirm against Among/SnowballProgram.
    private final static Among a_0[] = {
            new Among("a", -1, 1, "", methodObject),
            new Among("e", -1, 1, "", methodObject),
            new Among("ede", 1, 1, "", methodObject),
            new Among("ande", 1, 1, "", methodObject),
            new Among("ende", 1, 1, "", methodObject),
            new Among("ane", 1, 1, "", methodObject),
            new Among("ene", 1, 1, "", methodObject),
            new Among("hetene", 6, 1, "", methodObject),
            new Among("erte", 1, 3, "", methodObject),
            new Among("en", -1, 1, "", methodObject),
            new Among("heten", 9, 1, "", methodObject),
            new Among("ar", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("heter", 12, 1, "", methodObject),
            new Among("s", -1, 2, "", methodObject),
            new Among("as", 14, 1, "", methodObject),
            new Among("es", 14, 1, "", methodObject),
            new Among("edes", 16, 1, "", methodObject),
            new Among("endes", 16, 1, "", methodObject),
            new Among("enes", 16, 1, "", methodObject),
            new Among("hetenes", 19, 1, "", methodObject),
            new Among("ens", 14, 1, "", methodObject),
            new Among("hetens", 21, 1, "", methodObject),
            new Among("ers", 14, 1, "", methodObject),
            new Among("ets", 14, 1, "", methodObject),
            new Among("et", -1, 1, "", methodObject),
            new Among("het", 25, 1, "", methodObject),
            new Among("ert", -1, 3, "", methodObject),
            new Among("ast", -1, 1, "", methodObject) };

    // Endings tested by r_consonant_pair().
    private final static Among a_1[] = {
            new Among("dt", -1, -1, "", methodObject),
            new Among("vt", -1, -1, "", methodObject) };

    // Suffix table for r_other_suffix().
    private final static Among a_2[] = {
            new Among("leg", -1, 1, "", methodObject),
            new Among("eleg", 0, 1, "", methodObject),
            new Among("ig", -1, 1, "", methodObject),
            new Among("eig", 2, 1, "", methodObject),
            new Among("lig", 2, 1, "", methodObject),
            new Among("elig", 4, 1, "", methodObject),
            new Among("els", -1, 1, "", methodObject),
            new Among("lov", -1, 1, "", methodObject),
            new Among("elov", 7, 1, "", methodObject),
            new Among("slov", 7, 1, "", methodObject),
            new Among("hetslov", 9, 1, "", methodObject) };

    // Character-set table passed to in_grouping/out_grouping together with
    // the code-point bounds 97..248. Named "v" in the script; presumably the
    // vowel set -- confirm against the Snowball source.
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 48, 0, 128 };

    // Character-set table (bounds 98..122) that may precede a removable "s".
    private static final char g_s_ending[] = { 119, 125, 149, 1 };

    // Marks computed by r_mark_regions(): I_x is the position three
    // characters after the start; I_p1 bounds the region in which suffixes
    // may be removed.
    private int I_x;
    private int I_p1;

    /**
     * Copies the two region marks from another instance, then delegates to
     * the superclass copy_from for the remaining state.
     */
    private void copy_from(norwegianStemmer other) {
        I_x = other.I_x;
        I_p1 = other.I_p1;
        super.copy_from(other);
    }

    /**
     * Computes the marks I_x and I_p1.
     *
     * I_p1 starts at limit. I_x is set three characters after the cursor.
     * The cursor then advances to the first g_v character and on past the
     * next non-g_v character; that position becomes I_p1, raised to I_x if
     * it falls before it.
     *
     * @return false if fewer than three characters are available or a scan
     *         reaches limit (I_p1 then stays at limit); true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        int v_2;
        // (, line 26
        I_p1 = limit;
        // test, line 30
        v_1 = cursor;
        // (, line 30
        // hop, line 30
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 30
        I_x = cursor;
        cursor = v_1;
        // goto, line 31
        golab0: while (true) {
            v_2 = cursor;
            lab1: do {
                if (!(in_grouping(g_v, 97, 248))) {
                    break lab1;
                }
                cursor = v_2;
                break golab0;
            } while (false);
            cursor = v_2;
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 31
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 248))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 31
        I_p1 = cursor;
        // try, line 32
        lab4: do {
            // (, line 32
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        return true;
    }

    /**
     * Removes or rewrites a suffix from a_0, searching backwards with
     * limit_backward temporarily moved to I_p1.
     *
     * Result 1 deletes the suffix. Result 2 ("s") deletes it only when it is
     * preceded by a g_s_ending character, or by "k" that itself follows a
     * non-g_v character. Result 3 ("erte", "ert") replaces it with "er".
     *
     * @return false if the cursor is before I_p1, no suffix matches, or the
     *         "s" condition fails; true after a deletion or replacement
     */
    private boolean r_main_suffix() {
        int among_var;
        int v_1;
        int v_2;
        int v_3;
        // (, line 37
        // setlimit, line 38
        v_1 = limit - cursor;
        // tomark, line 38
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 38
        // [, line 38
        ket = cursor;
        // substring, line 38
        among_var = find_among_b(a_0, 29);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 38
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 44
            // delete, line 44
            slice_del();
            break;
        case 2:
            // (, line 46
            // or, line 46
            lab0: do {
                v_3 = limit - cursor;
                lab1: do {
                    if (!(in_grouping_b(g_s_ending, 98, 122))) {
                        break lab1;
                    }
                    break lab0;
                } while (false);
                cursor = limit - v_3;
                // (, line 46
                // literal, line 46
                if (!(eq_s_b(1, "k"))) {
                    return false;
                }
                if (!(out_grouping_b(g_v, 97, 248))) {
                    return false;
                }
            } while (false);
            // delete, line 46
            slice_del();
            break;
        case 3:
            // (, line 48
            // <-, line 48
            slice_from("er");
            break;
        }
        return true;
    }

    /**
     * If the word ends (within the I_p1 region) in one of the a_1 pairs
     * ("dt", "vt"), deletes the single character just before the starting
     * cursor: ket is set at the starting cursor and bra one position earlier.
     *
     * @return false if the cursor is before I_p1, no pair matches, or there
     *         is no character to step back over; true after the deletion
     */
    private boolean r_consonant_pair() {
        int v_1;
        int v_2;
        int v_3;
        // (, line 52
        // test, line 53
        v_1 = limit - cursor;
        // (, line 53
        // setlimit, line 54
        v_2 = limit - cursor;
        // tomark, line 54
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_3 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_2;
        // (, line 54
        // [, line 54
        ket = cursor;
        // substring, line 54
        if (find_among_b(a_1, 2) == 0) {
            limit_backward = v_3;
            return false;
        }
        // ], line 54
        bra = cursor;
        limit_backward = v_3;
        // "test" semantics: rewind to where the match started.
        cursor = limit - v_1;
        // next, line 59
        if (cursor <= limit_backward) {
            return false;
        }
        cursor--;
        // ], line 59
        bra = cursor;
        // delete, line 59
        slice_del();
        return true;
    }

    /**
     * Deletes a suffix from a_2, searching backwards with limit_backward
     * temporarily moved to I_p1.
     *
     * @return false if the cursor is before I_p1 or no suffix matches; true
     *         after the deletion
     */
    private boolean r_other_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // (, line 62
        // setlimit, line 63
        v_1 = limit - cursor;
        // tomark, line 63
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 63
        // [, line 63
        ket = cursor;
        // substring, line 63
        among_var = find_among_b(a_2, 11);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 63
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 67
            // delete, line 67
            slice_del();
            break;
        }
        return true;
    }

    /**
     * Entry point: stems the word currently held in the inherited buffer.
     *
     * Runs r_mark_regions() forwards, then switches to backward mode
     * (limit_backward = cursor, cursor = limit) and applies r_main_suffix(),
     * r_consonant_pair() and r_other_suffix() in that order. The result of
     * each sub-step is ignored and the cursor is restored after it.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 72
        // do, line 74
        v_1 = cursor;
        lab0: do {
            // call mark_regions, line 74
            if (!r_mark_regions()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // backwards, line 75
        limit_backward = cursor;
        cursor = limit;
        // (, line 75
        // do, line 76
        v_2 = limit - cursor;
        lab1: do {
            // call main_suffix, line 76
            if (!r_main_suffix()) {
                break lab1;
            }
        } while (false);
        cursor = limit - v_2;
        // do, line 77
        v_3 = limit - cursor;
        lab2: do {
            // call consonant_pair, line 77
            if (!r_consonant_pair()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 78
        v_4 = limit - cursor;
        lab3: do {
            // call other_suffix, line 78
            if (!r_other_suffix()) {
                break lab3;
            }
        } while (false);
        cursor = limit - v_4;
        cursor = limit_backward;
        return true;
    }

    /** Any two norwegianStemmer instances compare equal, regardless of state. */
    public boolean equals(Object o) {
        return o instanceof norwegianStemmer;
    }

    /** Constant hash derived from the class name, consistent with equals(). */
    public int hashCode() {
        return norwegianStemmer.class.getName().hashCode();
    }

}

--------------------------------------------------------------------------------
/src/main/java/org/tartarus/snowball/ext/swedishStemmer.java:
--------------------------------------------------------------------------------
// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

/**
 * This class was automatically generated by a Snowball to Java compiler It
 * implements the stemming algorithm defined by a snowball script.
 */

// Swedish stemmer. The "// ..., line N" breadcrumbs refer to lines of the
// originating Snowball script. Buffer state (cursor, limit, limit_backward,
// bra, ket) and the helpers (in_grouping, find_among_b, slice_del, ...) are
// inherited and not defined in this file. Do not edit the logic by hand:
// statement order and cursor save/restore pairs are significant.
public class swedishStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // Shared instance handed to every Among entry below; how Among uses it is
    // defined in org.tartarus.snowball.Among (not visible here).
    private final static swedishStemmer methodObject = new swedishStemmer();

    // Suffix table for r_main_suffix(). The third constructor argument
    // appears to be the value find_among_b() returns (it matches the switch
    // cases there) -- confirm against Among/SnowballProgram.
    private final static Among a_0[] = {
            new Among("a", -1, 1, "", methodObject),
            new Among("arna", 0, 1, "", methodObject),
            new Among("erna", 0, 1, "", methodObject),
            new Among("heterna", 2, 1, "", methodObject),
            new Among("orna", 0, 1, "", methodObject),
            new Among("ad", -1, 1, "", methodObject),
            new Among("e", -1, 1, "", methodObject),
            new Among("ade", 6, 1, "", methodObject),
            new Among("ande", 6, 1, "", methodObject),
            new Among("arne", 6, 1, "", methodObject),
            new Among("are", 6, 1, "", methodObject),
            new Among("aste", 6, 1, "", methodObject),
            new Among("en", -1, 1, "", methodObject),
            new Among("anden", 12, 1, "", methodObject),
            new Among("aren", 12, 1, "", methodObject),
            new Among("heten", 12, 1, "", methodObject),
            new Among("ern", -1, 1, "", methodObject),
            new Among("ar", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("heter", 18, 1, "", methodObject),
            new Among("or", -1, 1, "", methodObject),
            new Among("s", -1, 2, "", methodObject),
            new Among("as", 21, 1, "", methodObject),
            new Among("arnas", 22, 1, "", methodObject),
            new Among("ernas", 22, 1, "", methodObject),
            new Among("ornas", 22, 1, "", methodObject),
            new Among("es", 21, 1, "", methodObject),
            new Among("ades", 26, 1, "", methodObject),
            new Among("andes", 26, 1, "", methodObject),
            new Among("ens", 21, 1, "", methodObject),
            new Among("arens", 29, 1, "", methodObject),
            new Among("hetens", 29, 1, "", methodObject),
            new Among("erns", 21, 1, "", methodObject),
            new Among("at", -1, 1, "", methodObject),
            new Among("andet", -1, 1, "", methodObject),
            new Among("het", -1, 1, "", methodObject),
            new Among("ast", -1, 1, "", methodObject) };

    // Endings tested by r_consonant_pair().
    private final static Among a_1[] = {
            new Among("dd", -1, -1, "", methodObject),
            new Among("gd", -1, -1, "", methodObject),
            new Among("nn", -1, -1, "", methodObject),
            new Among("dt", -1, -1, "", methodObject),
            new Among("gt", -1, -1, "", methodObject),
            new Among("kt", -1, -1, "", methodObject),
            new Among("tt", -1, -1, "", methodObject) };

    // Suffix table for r_other_suffix().
    private final static Among a_2[] = {
            new Among("ig", -1, 1, "", methodObject),
            new Among("lig", 0, 1, "", methodObject),
            new Among("els", -1, 1, "", methodObject),
            new Among("fullt", -1, 3, "", methodObject),
            new Among("l\u00F6st", -1, 2, "", methodObject) };

    // Character-set table passed to in_grouping/out_grouping together with
    // the code-point bounds 97..246. Named "v" in the script; presumably the
    // vowel set -- confirm against the Snowball source.
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 24, 0, 32 };

    // Character-set table (bounds 98..121) that must precede a removable "s".
    private static final char g_s_ending[] = { 119, 127, 149 };

    // Marks computed by r_mark_regions(): I_x is the position three
    // characters after the start; I_p1 bounds the region in which suffixes
    // may be removed.
    private int I_x;
    private int I_p1;

    /**
     * Copies the two region marks from another instance, then delegates to
     * the superclass copy_from for the remaining state.
     */
    private void copy_from(swedishStemmer other) {
        I_x = other.I_x;
        I_p1 = other.I_p1;
        super.copy_from(other);
    }

    /**
     * Computes the marks I_x and I_p1.
     *
     * I_p1 starts at limit. I_x is set three characters after the cursor.
     * The cursor then advances to the first g_v character and on past the
     * next non-g_v character; that position becomes I_p1, raised to I_x if
     * it falls before it.
     *
     * @return false if fewer than three characters are available or a scan
     *         reaches limit (I_p1 then stays at limit); true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        int v_2;
        // (, line 26
        I_p1 = limit;
        // test, line 29
        v_1 = cursor;
        // (, line 29
        // hop, line 29
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 29
        I_x = cursor;
        cursor = v_1;
        // goto, line 30
        golab0: while (true) {
            v_2 = cursor;
            lab1: do {
                if (!(in_grouping(g_v, 97, 246))) {
                    break lab1;
                }
                cursor = v_2;
                break golab0;
            } while (false);
            cursor = v_2;
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 30
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 246))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 30
        I_p1 = cursor;
        // try, line 31
        lab4: do {
            // (, line 31
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        return true;
    }

    /**
     * Removes a suffix from a_0, searching backwards with limit_backward
     * temporarily moved to I_p1.
     *
     * Result 1 deletes the suffix. Result 2 ("s") deletes it only when it is
     * preceded by a g_s_ending character.
     *
     * @return false if the cursor is before I_p1, no suffix matches, or the
     *         "s" condition fails; true after a deletion
     */
    private boolean r_main_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // (, line 36
        // setlimit, line 37
        v_1 = limit - cursor;
        // tomark, line 37
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 37
        // [, line 37
        ket = cursor;
        // substring, line 37
        among_var = find_among_b(a_0, 37);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 37
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 44
            // delete, line 44
            slice_del();
            break;
        case 2:
            // (, line 46
            if (!(in_grouping_b(g_s_ending, 98, 121))) {
                return false;
            }
            // delete, line 46
            slice_del();
            break;
        }
        return true;
    }

    /**
     * If the word ends (within the I_p1 region) in one of the a_1 pairs,
     * deletes the single character just before the starting cursor: after the
     * match the cursor is rewound, ket is set there and bra one position
     * earlier. limit_backward is restored on every exit path after it has
     * been changed.
     *
     * @return false if the cursor is before I_p1, no pair matches, or there
     *         is no character to step back over; true after the deletion
     */
    private boolean r_consonant_pair() {
        int v_1;
        int v_2;
        int v_3;
        // setlimit, line 50
        v_1 = limit - cursor;
        // tomark, line 50
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 50
        // and, line 52
        v_3 = limit - cursor;
        // among, line 51
        if (find_among_b(a_1, 7) == 0) {
            limit_backward = v_2;
            return false;
        }
        cursor = limit - v_3;
        // (, line 52
        // [, line 52
        ket = cursor;
        // next, line 52
        if (cursor <= limit_backward) {
            limit_backward = v_2;
            return false;
        }
        cursor--;
        // ], line 52
        bra = cursor;
        // delete, line 52
        slice_del();
        limit_backward = v_2;
        return true;
    }

    /**
     * Removes or rewrites a suffix from a_2, searching backwards with
     * limit_backward temporarily moved to I_p1.
     *
     * Result 1 ("ig", "lig", "els") deletes the suffix; result 2 replaces
     * the a_2 entry ending in "st" with the same text minus the final "t";
     * result 3 replaces "fullt" with "full".
     *
     * @return false if the cursor is before I_p1 or no suffix matches; true
     *         after a deletion or replacement
     */
    private boolean r_other_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // setlimit, line 55
        v_1 = limit - cursor;
        // tomark, line 55
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 55
        // [, line 56
        ket = cursor;
        // substring, line 56
        among_var = find_among_b(a_2, 5);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 56
        bra = cursor;
        switch (among_var) {
        case 0:
            limit_backward = v_2;
            return false;
        case 1:
            // (, line 57
            // delete, line 57
            slice_del();
            break;
        case 2:
            // (, line 58
            // <-, line 58
            slice_from("l\u00F6s");
            break;
        case 3:
            // (, line 59
            // <-, line 59
            slice_from("full");
            break;
        }
        limit_backward = v_2;
        return true;
    }

    /**
     * Entry point: stems the word currently held in the inherited buffer.
     *
     * Runs r_mark_regions() forwards, then switches to backward mode
     * (limit_backward = cursor, cursor = limit) and applies r_main_suffix(),
     * r_consonant_pair() and r_other_suffix() in that order. The result of
     * each sub-step is ignored and the cursor is restored after it.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 64
        // do, line 66
        v_1 = cursor;
        lab0: do {
            // call mark_regions, line 66
            if (!r_mark_regions()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // backwards, line 67
        limit_backward = cursor;
        cursor = limit;
        // (, line 67
        // do, line 68
        v_2 = limit - cursor;
        lab1: do {
            // call main_suffix, line 68
            if (!r_main_suffix()) {
                break lab1;
            }
        } while (false);
        cursor = limit - v_2;
        // do, line 69
        v_3 = limit - cursor;
        lab2: do {
            // call consonant_pair, line 69
            if (!r_consonant_pair()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 70
        v_4 = limit - cursor;
        lab3: do {
            // call other_suffix, line 70
            if (!r_other_suffix()) {
                break lab3;
            }
        } while (false);
        cursor = limit - v_4;
        cursor = limit_backward;
        return true;
    }

    /** Any two swedishStemmer instances compare equal, regardless of state. */
    public boolean equals(Object o) {
        return o instanceof swedishStemmer;
    }

    /** Constant hash derived from the class name, consistent with equals(). */
    public int hashCode() {
        return swedishStemmer.class.getName().hashCode();
    }

}

--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/en-sent.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohaps/tldrzr/09a4c3a4c54cc6968b8ee17a9d97e9a69b82c725/src/main/webapp/WEB-INF/en-sent.bin
--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/en-token.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohaps/tldrzr/09a4c3a4c54cc6968b8ee17a9d97e9a69b82c725/src/main/webapp/WEB-INF/en-token.bin
--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 | 2 | 7 | 8 | 9 | index.html 10 | 11 | 12 | 13 | tldr 14 | com.mohaps.tldr.TLDRServlet 15 | 16 | 17 | 18 | tldr 19 | /tldr 20 | 21 | 22 | tldr 23 | /tldr/* 24 | 25 | 26 | 27 | 28 |
--------------------------------------------------------------------------------
/src/main/webapp/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohaps/tldrzr/09a4c3a4c54cc6968b8ee17a9d97e9a69b82c725/src/main/webapp/apple-touch-icon.png
--------------------------------------------------------------------------------
/src/main/webapp/css/main.css: -------------------------------------------------------------------------------- 1 | /* 2 | * HTML5 Boilerplate 3 | * 4 | * What follows is the result of much research on cross-browser styling. 5 | * Credit left inline and big thanks to Nicolas Gallagher, Jonathan Neal, 6 | * Kroc Camen, and the H5BP dev community and team. 7 | */ 8 | 9 | /* ========================================================================== 10 | Base styles: opinionated defaults 11 | ========================================================================== */ 12 | 13 | html, 14 | button, 15 | input, 16 | select, 17 | textarea { 18 | color: #222; 19 | } 20 | 21 | body { 22 | font-size: 1em; 23 | line-height: 1.4; 24 | } 25 | 26 | /* 27 | * Remove text-shadow in selection highlight: h5bp.com/i 28 | * These selection rule sets have to be separate. 29 | * Customize the background color to match your design. 30 | */ 31 | 32 | ::-moz-selection { 33 | background: #b3d4fc; 34 | text-shadow: none; 35 | } 36 | 37 | ::selection { 38 | background: #b3d4fc; 39 | text-shadow: none; 40 | } 41 | 42 | /* 43 | * A better looking default horizontal rule 44 | */ 45 | 46 | hr { 47 | display: block; 48 | height: 1px; 49 | border: 0; 50 | border-top: 1px solid #ccc; 51 | margin: 1em 0; 52 | padding: 0; 53 | } 54 | 55 | /* 56 | * Remove the gap between images and the bottom of their containers: h5bp.com/i/440 57 | */ 58 | 59 | img { 60 | vertical-align: middle; 61 | } 62 | 63 | /* 64 | * Remove default fieldset styles. 65 | */ 66 | 67 | fieldset { 68 | border: 0; 69 | margin: 0; 70 | padding: 0; 71 | } 72 | 73 | /* 74 | * Make textareas span the full width of their container.
75 | */ 76 | 77 | textarea { 78 | width:100%; 79 | } 80 | 81 | /* ========================================================================== 82 | Chrome Frame prompt 83 | ========================================================================== */ 84 | 85 | .chromeframe { 86 | margin: 0.2em 0; 87 | background: #ccc; 88 | color: #000; 89 | padding: 0.2em 0; 90 | } 91 | 92 | /* ========================================================================== 93 | Author's custom styles 94 | ========================================================================== */ 95 | 96 | .mypage { 97 | padding: 3px; 98 | background-color: rgb(255,255,255); 99 | max-width: 800px; 100 | } 101 | .topnav { 102 | font-size: 0.7em; 103 | } 104 | .footer { 105 | font-size: 0.7em; 106 | } 107 | .content_section { 108 | border: 1px dotted; 109 | margin-top: 6px; 110 | margin-bottom: 6px; 111 | margin-left: 2px; 112 | margin-right: 2px; 113 | padding: 5px; 114 | } 115 | .textWrapper 116 | { 117 | border: 0px; 118 | margin:5px 0; 119 | padding:3px; 120 | } 121 | 122 | .feedEntryItem { 123 | padding-left: 6px; 124 | padding-right: 6px; 125 | padding-top: 2px; 126 | padding-bottom: 2px; 127 | } 128 | 129 | .feedEntryTitle { 130 | font-weight: bolder; 131 | } 132 | .feedEntryAuthor { 133 | font-style: italic; 134 | } 135 | .feedEntryLink{ 136 | font-style: italic; 137 | font-size: 0.8em; 138 | text-align: right; 139 | } 140 | .feedEntrySummary { 141 | font-size: 1em; 142 | font-style: italic; 143 | } 144 | .feedEntryText { 145 | font-size: 1em; 146 | } 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | /* ========================================================================== 160 | Helper classes 161 | ========================================================================== */ 162 | 163 | /* 164 | * Image replacement 165 | */ 166 | 167 | .ir { 168 | background-color: transparent; 169 | border: 0; 170 | overflow: hidden; 171 | /* IE 6/7 fallback */ 172 | 
*text-indent: -9999px; 173 | } 174 | 175 | .ir:before { 176 | content: ""; 177 | display: block; 178 | width: 0; 179 | height: 150%; 180 | } 181 | 182 | /* 183 | * Hide from both screenreaders and browsers: h5bp.com/u 184 | */ 185 | 186 | .hidden { 187 | display: none !important; 188 | visibility: hidden; 189 | } 190 | 191 | /* 192 | * Hide only visually, but have it available for screenreaders: h5bp.com/v 193 | */ 194 | 195 | .visuallyhidden { 196 | border: 0; 197 | clip: rect(0 0 0 0); 198 | height: 1px; 199 | margin: -1px; 200 | overflow: hidden; 201 | padding: 0; 202 | position: absolute; 203 | width: 1px; 204 | } 205 | 206 | /* 207 | * Extends the .visuallyhidden class to allow the element to be focusable 208 | * when navigated to via the keyboard: h5bp.com/p 209 | */ 210 | 211 | .visuallyhidden.focusable:active, 212 | .visuallyhidden.focusable:focus { 213 | clip: auto; 214 | height: auto; 215 | margin: 0; 216 | overflow: visible; 217 | position: static; 218 | width: auto; 219 | } 220 | 221 | /* 222 | * Hide visually and from screenreaders, but maintain layout 223 | */ 224 | 225 | .invisible { 226 | visibility: hidden; 227 | } 228 | 229 | /* 230 | * Clearfix: contain floats 231 | * 232 | * For modern browsers 233 | * 1. The space content is one way to avoid an Opera bug when the 234 | * `contenteditable` attribute is included anywhere else in the document. 235 | * Otherwise it causes space to appear at the top and bottom of elements 236 | * that receive the `clearfix` class. 237 | * 2. The use of `table` rather than `block` is only necessary if using 238 | * `:before` to contain the top-margins of child elements. 239 | */ 240 | 241 | .clearfix:before, 242 | .clearfix:after { 243 | content: " "; /* 1 */ 244 | display: table; /* 2 */ 245 | } 246 | 247 | .clearfix:after { 248 | clear: both; 249 | } 250 | 251 | /* 252 | * For IE 6/7 only 253 | * Include this rule to trigger hasLayout and contain floats. 
254 | */ 255 | 256 | .clearfix { 257 | *zoom: 1; 258 | } 259 | 260 | /* ========================================================================== 261 | EXAMPLE Media Queries for Responsive Design. 262 | These examples override the primary ('mobile first') styles. 263 | Modify as content requires. 264 | ========================================================================== */ 265 | 266 | @media only screen and (min-width: 35em) { 267 | /* Style adjustments for viewports that meet the condition */ 268 | } 269 | 270 | @media print, 271 | (-o-min-device-pixel-ratio: 5/4), 272 | (-webkit-min-device-pixel-ratio: 1.25), 273 | (min-resolution: 120dpi) { 274 | /* Style adjustments for high resolution devices */ 275 | } 276 | 277 | /* ========================================================================== 278 | Print styles. 279 | Inlined to avoid required HTTP connection: h5bp.com/r 280 | ========================================================================== */ 281 | 282 | @media print { 283 | * { 284 | background: transparent !important; 285 | color: #000 !important; /* Black prints faster: h5bp.com/s */ 286 | box-shadow: none !important; 287 | text-shadow: none !important; 288 | } 289 | 290 | a, 291 | a:visited { 292 | text-decoration: underline; 293 | } 294 | 295 | a[href]:after { 296 | content: " (" attr(href) ")"; 297 | } 298 | 299 | abbr[title]:after { 300 | content: " (" attr(title) ")"; 301 | } 302 | 303 | /* 304 | * Don't show links for images, or javascript/internal links 305 | */ 306 | 307 | .ir a:after, 308 | a[href^="javascript:"]:after, 309 | a[href^="#"]:after { 310 | content: ""; 311 | } 312 | 313 | pre, 314 | blockquote { 315 | border: 1px solid #999; 316 | page-break-inside: avoid; 317 | } 318 | 319 | thead { 320 | display: table-header-group; /* h5bp.com/t */ 321 | } 322 | 323 | tr, 324 | img { 325 | page-break-inside: avoid; 326 | } 327 | 328 | img { 329 | max-width: 100% !important; 330 | } 331 | 332 | @page { 333 | margin: 0.5cm; 334 | } 
335 | 336 | p, 337 | h2, 338 | h3 { 339 | orphans: 3; 340 | widows: 3; 341 | } 342 | 343 | h2, 344 | h3 { 345 | page-break-after: avoid; 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /src/main/webapp/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v1.1.0 | MIT License | git.io/normalize */ 2 | 3 | /* ========================================================================== 4 | HTML5 display definitions 5 | ========================================================================== */ 6 | 7 | /** 8 | * Correct `block` display not defined in IE 6/7/8/9 and Firefox 3. 9 | */ 10 | 11 | article, 12 | aside, 13 | details, 14 | figcaption, 15 | figure, 16 | footer, 17 | header, 18 | hgroup, 19 | main, 20 | nav, 21 | section, 22 | summary { 23 | display: block; 24 | } 25 | 26 | /** 27 | * Correct `inline-block` display not defined in IE 6/7/8/9 and Firefox 3. 28 | */ 29 | 30 | audio, 31 | canvas, 32 | video { 33 | display: inline-block; 34 | *display: inline; 35 | *zoom: 1; 36 | } 37 | 38 | /** 39 | * Prevent modern browsers from displaying `audio` without controls. 40 | * Remove excess height in iOS 5 devices. 41 | */ 42 | 43 | audio:not([controls]) { 44 | display: none; 45 | height: 0; 46 | } 47 | 48 | /** 49 | * Address styling not present in IE 7/8/9, Firefox 3, and Safari 4. 50 | * Known issue: no IE 6 support. 51 | */ 52 | 53 | [hidden] { 54 | display: none; 55 | } 56 | 57 | /* ========================================================================== 58 | Base 59 | ========================================================================== */ 60 | 61 | /** 62 | * 1. Correct text resizing oddly in IE 6/7 when body `font-size` is set using 63 | * `em` units. 64 | * 2. Prevent iOS text size adjust after orientation change, without disabling 65 | * user zoom. 
66 | */ 67 | 68 | html { 69 | font-size: 100%; /* 1 */ 70 | -webkit-text-size-adjust: 100%; /* 2 */ 71 | -ms-text-size-adjust: 100%; /* 2 */ 72 | } 73 | 74 | /** 75 | * Address `font-family` inconsistency between `textarea` and other form 76 | * elements. 77 | */ 78 | 79 | html, 80 | button, 81 | input, 82 | select, 83 | textarea { 84 | font-family: sans-serif; 85 | } 86 | 87 | /** 88 | * Address margins handled incorrectly in IE 6/7. 89 | */ 90 | 91 | body { 92 | margin: 0; 93 | } 94 | 95 | /* ========================================================================== 96 | Links 97 | ========================================================================== */ 98 | 99 | /** 100 | * Address `outline` inconsistency between Chrome and other browsers. 101 | */ 102 | 103 | a:focus { 104 | outline: thin dotted; 105 | } 106 | 107 | /** 108 | * Improve readability when focused and also mouse hovered in all browsers. 109 | */ 110 | 111 | a:active, 112 | a:hover { 113 | outline: 0; 114 | } 115 | 116 | /* ========================================================================== 117 | Typography 118 | ========================================================================== */ 119 | 120 | /** 121 | * Address font sizes and margins set differently in IE 6/7. 122 | * Address font sizes within `section` and `article` in Firefox 4+, Safari 5, 123 | * and Chrome. 124 | */ 125 | 126 | h1 { 127 | font-size: 2em; 128 | margin: 0.67em 0; 129 | } 130 | 131 | h2 { 132 | font-size: 1.5em; 133 | margin: 0.83em 0; 134 | } 135 | 136 | h3 { 137 | font-size: 1.17em; 138 | margin: 1em 0; 139 | } 140 | 141 | h4 { 142 | font-size: 1em; 143 | margin: 1.33em 0; 144 | } 145 | 146 | h5 { 147 | font-size: 0.83em; 148 | margin: 1.67em 0; 149 | } 150 | 151 | h6 { 152 | font-size: 0.67em; 153 | margin: 2.33em 0; 154 | } 155 | 156 | /** 157 | * Address styling not present in IE 7/8/9, Safari 5, and Chrome. 
158 | */ 159 | 160 | abbr[title] { 161 | border-bottom: 1px dotted; 162 | } 163 | 164 | /** 165 | * Address style set to `bolder` in Firefox 3+, Safari 4/5, and Chrome. 166 | */ 167 | 168 | b, 169 | strong { 170 | font-weight: bold; 171 | } 172 | 173 | blockquote { 174 | margin: 1em 40px; 175 | } 176 | 177 | /** 178 | * Address styling not present in Safari 5 and Chrome. 179 | */ 180 | 181 | dfn { 182 | font-style: italic; 183 | } 184 | 185 | /** 186 | * Address differences between Firefox and other browsers. 187 | * Known issue: no IE 6/7 normalization. 188 | */ 189 | 190 | hr { 191 | -moz-box-sizing: content-box; 192 | box-sizing: content-box; 193 | height: 0; 194 | } 195 | 196 | /** 197 | * Address styling not present in IE 6/7/8/9. 198 | */ 199 | 200 | mark { 201 | background: #ff0; 202 | color: #000; 203 | } 204 | 205 | /** 206 | * Address margins set differently in IE 6/7. 207 | */ 208 | 209 | p, 210 | pre { 211 | margin: 1em 0; 212 | } 213 | 214 | /** 215 | * Correct font family set oddly in IE 6, Safari 4/5, and Chrome. 216 | */ 217 | 218 | code, 219 | kbd, 220 | pre, 221 | samp { 222 | font-family: monospace, serif; 223 | _font-family: 'courier new', monospace; 224 | font-size: 1em; 225 | } 226 | 227 | /** 228 | * Improve readability of pre-formatted text in all browsers. 229 | */ 230 | 231 | pre { 232 | white-space: pre; 233 | white-space: pre-wrap; 234 | word-wrap: break-word; 235 | } 236 | 237 | /** 238 | * Address CSS quotes not supported in IE 6/7. 239 | */ 240 | 241 | q { 242 | quotes: none; 243 | } 244 | 245 | /** 246 | * Address `quotes` property not supported in Safari 4. 247 | */ 248 | 249 | q:before, 250 | q:after { 251 | content: ''; 252 | content: none; 253 | } 254 | 255 | /** 256 | * Address inconsistent and variable font size in all browsers. 257 | */ 258 | 259 | small { 260 | font-size: 80%; 261 | } 262 | 263 | /** 264 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 
265 | */ 266 | 267 | sub, 268 | sup { 269 | font-size: 75%; 270 | line-height: 0; 271 | position: relative; 272 | vertical-align: baseline; 273 | } 274 | 275 | sup { 276 | top: -0.5em; 277 | } 278 | 279 | sub { 280 | bottom: -0.25em; 281 | } 282 | 283 | /* ========================================================================== 284 | Lists 285 | ========================================================================== */ 286 | 287 | /** 288 | * Address margins set differently in IE 6/7. 289 | */ 290 | 291 | dl, 292 | menu, 293 | ol, 294 | ul { 295 | margin: 1em 0; 296 | } 297 | 298 | dd { 299 | margin: 0 0 0 40px; 300 | } 301 | 302 | /** 303 | * Address paddings set differently in IE 6/7. 304 | */ 305 | 306 | menu, 307 | ol, 308 | ul { 309 | padding: 0 0 0 40px; 310 | } 311 | 312 | /** 313 | * Correct list images handled incorrectly in IE 7. 314 | */ 315 | 316 | nav ul, 317 | nav ol { 318 | list-style: none; 319 | list-style-image: none; 320 | } 321 | 322 | /* ========================================================================== 323 | Embedded content 324 | ========================================================================== */ 325 | 326 | /** 327 | * 1. Remove border when inside `a` element in IE 6/7/8/9 and Firefox 3. 328 | * 2. Improve image quality when scaled in IE 7. 329 | */ 330 | 331 | img { 332 | border: 0; /* 1 */ 333 | -ms-interpolation-mode: bicubic; /* 2 */ 334 | } 335 | 336 | /** 337 | * Correct overflow displayed oddly in IE 9. 338 | */ 339 | 340 | svg:not(:root) { 341 | overflow: hidden; 342 | } 343 | 344 | /* ========================================================================== 345 | Figures 346 | ========================================================================== */ 347 | 348 | /** 349 | * Address margin not present in IE 6/7/8/9, Safari 5, and Opera 11. 
350 | */ 351 | 352 | figure { 353 | margin: 0; 354 | } 355 | 356 | /* ========================================================================== 357 | Forms 358 | ========================================================================== */ 359 | 360 | /** 361 | * Correct margin displayed oddly in IE 6/7. 362 | */ 363 | 364 | form { 365 | margin: 0; 366 | } 367 | 368 | /** 369 | * Define consistent border, margin, and padding. 370 | */ 371 | 372 | fieldset { 373 | border: 1px solid #c0c0c0; 374 | margin: 0 2px; 375 | padding: 0.35em 0.625em 0.75em; 376 | } 377 | 378 | /** 379 | * 1. Correct color not being inherited in IE 6/7/8/9. 380 | * 2. Correct text not wrapping in Firefox 3. 381 | * 3. Correct alignment displayed oddly in IE 6/7. 382 | */ 383 | 384 | legend { 385 | border: 0; /* 1 */ 386 | padding: 0; 387 | white-space: normal; /* 2 */ 388 | *margin-left: -7px; /* 3 */ 389 | } 390 | 391 | /** 392 | * 1. Correct font size not being inherited in all browsers. 393 | * 2. Address margins set differently in IE 6/7, Firefox 3+, Safari 5, 394 | * and Chrome. 395 | * 3. Improve appearance and consistency in all browsers. 396 | */ 397 | 398 | button, 399 | input, 400 | select, 401 | textarea { 402 | font-size: 100%; /* 1 */ 403 | margin: 0; /* 2 */ 404 | vertical-align: baseline; /* 3 */ 405 | *vertical-align: middle; /* 3 */ 406 | } 407 | 408 | /** 409 | * Address Firefox 3+ setting `line-height` on `input` using `!important` in 410 | * the UA stylesheet. 411 | */ 412 | 413 | button, 414 | input { 415 | line-height: normal; 416 | } 417 | 418 | /** 419 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 420 | * All other form control elements do not inherit `text-transform` values. 421 | * Correct `button` style inheritance in Chrome, Safari 5+, and IE 6+. 422 | * Correct `select` style inheritance in Firefox 4+ and Opera. 423 | */ 424 | 425 | button, 426 | select { 427 | text-transform: none; 428 | } 429 | 430 | /** 431 | * 1. 
Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 432 | * and `video` controls. 433 | * 2. Correct inability to style clickable `input` types in iOS. 434 | * 3. Improve usability and consistency of cursor style between image-type 435 | * `input` and others. 436 | * 4. Remove inner spacing in IE 7 without affecting normal text inputs. 437 | * Known issue: inner spacing remains in IE 6. 438 | */ 439 | 440 | button, 441 | html input[type="button"], /* 1 */ 442 | input[type="reset"], 443 | input[type="submit"] { 444 | -webkit-appearance: button; /* 2 */ 445 | cursor: pointer; /* 3 */ 446 | *overflow: visible; /* 4 */ 447 | } 448 | 449 | /** 450 | * Re-set default cursor for disabled elements. 451 | */ 452 | 453 | button[disabled], 454 | html input[disabled] { 455 | cursor: default; 456 | } 457 | 458 | /** 459 | * 1. Address box sizing set to content-box in IE 8/9. 460 | * 2. Remove excess padding in IE 8/9. 461 | * 3. Remove excess padding in IE 7. 462 | * Known issue: excess padding remains in IE 6. 463 | */ 464 | 465 | input[type="checkbox"], 466 | input[type="radio"] { 467 | box-sizing: border-box; /* 1 */ 468 | padding: 0; /* 2 */ 469 | *height: 13px; /* 3 */ 470 | *width: 13px; /* 3 */ 471 | } 472 | 473 | /** 474 | * 1. Address `appearance` set to `searchfield` in Safari 5 and Chrome. 475 | * 2. Address `box-sizing` set to `border-box` in Safari 5 and Chrome 476 | * (include `-moz` to future-proof). 477 | */ 478 | 479 | input[type="search"] { 480 | -webkit-appearance: textfield; /* 1 */ 481 | -moz-box-sizing: content-box; 482 | -webkit-box-sizing: content-box; /* 2 */ 483 | box-sizing: content-box; 484 | } 485 | 486 | /** 487 | * Remove inner padding and search cancel button in Safari 5 and Chrome 488 | * on OS X. 
489 | */ 490 | 491 | input[type="search"]::-webkit-search-cancel-button, 492 | input[type="search"]::-webkit-search-decoration { 493 | -webkit-appearance: none; 494 | } 495 | 496 | /** 497 | * Remove inner padding and border in Firefox 3+. 498 | */ 499 | 500 | button::-moz-focus-inner, 501 | input::-moz-focus-inner { 502 | border: 0; 503 | padding: 0; 504 | } 505 | 506 | /** 507 | * 1. Remove default vertical scrollbar in IE 6/7/8/9. 508 | * 2. Improve readability and alignment in all browsers. 509 | */ 510 | 511 | textarea { 512 | overflow: auto; /* 1 */ 513 | vertical-align: top; /* 2 */ 514 | } 515 | 516 | /* ========================================================================== 517 | Tables 518 | ========================================================================== */ 519 | 520 | /** 521 | * Remove most spacing between table cells. 522 | */ 523 | 524 | table { 525 | border-collapse: collapse; 526 | border-spacing: 0; 527 | } 528 | -------------------------------------------------------------------------------- /src/main/webapp/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/tldrzr/09a4c3a4c54cc6968b8ee17a9d97e9a69b82c725/src/main/webapp/favicon.ico -------------------------------------------------------------------------------- /src/main/webapp/feed_summary.jsp: -------------------------------------------------------------------------------- 1 | 2 | 10 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | TL;DRzr - an algorithmic summary generation demo 32 | 33 | 34 | 37 |
38 |
39 | 40 | 57 |
58 |
59 |

Summarized Feed

60 |

61 | Feed Url : ">
64 |

65 | Entry Count: ${summarized_feed.itemCount}
66 |

67 | Feed Entries 68 |

69 | 70 |
72 |
73 | 74 |
75 |
76 | by 77 |
78 |
79 |
80 |

81 | Summary (Generated) 82 |

83 |
84 | 85 |
86 |
87 | 93 |
94 |

95 | Text (Original) 96 |

97 |
98 | 99 |
100 |
101 | 106 |
107 | 108 | 109 |
110 | 111 |
112 |
113 |

114 | Generated in ${summarized_feed.millis} milliseconds 115 |

116 |
117 |
117 | 118 | back to TL;DRzr 119 |
120 | 133 |
134 |
135 |
136 | 140 | 141 | 142 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /src/main/webapp/images/tldrzr_logo_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/tldrzr/09a4c3a4c54cc6968b8ee17a9d97e9a69b82c725/src/main/webapp/images/tldrzr_logo_header.png -------------------------------------------------------------------------------- /src/main/webapp/index.html: -------------------------------------------------------------------------------- 1 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | TL;DRzr - an algorithmic summary generation demo 30 | 31 | 32 | 35 |
36 |
37 | 38 | 55 |
56 | Wanna see TL;DRzr in action in a larger app? Check out my other hack, XTractor, which extracts page content, cleans it, and summarizes it using TL;DRzr code. 57 |
58 |
59 |

Try it out!

60 | 1. Summarize a Feed (RSS/Atom) or a URL
61 |
62 |
64 | 66 | Sentences : 78 |
79 |
80 |

81 | Defaults to the TechCrunch Feed Burner RSS url. Hit 82 | the Summarize Feed button and you're good to go. This will 83 | fetch/parse/summarize the text, so be a bit patient. :) 84 |

85 |

86 | [NEW] You can also put in 87 | a web url and it will fetch the page and try to summarize the page 88 | content. 89 |

90 | 2. Summarize Text
91 | 92 |
94 |
95 |
96 | 98 |
99 |
100 | Sentences : 112 |
113 |

114 | If cutting/pasting, try to use the paste as 115 | plaintext option 116 |

117 |
118 | 119 | 139 |
140 |
141 | 142 |

What's New?

143 |
    144 |
  • 2013-04-11 - TL;DRzr is now open source. (Hacker 146 | News announcement). (tldrzr@github) 147 |
  • 148 |
  • 2013-04-11 - The OpenNLP-based tokenizer is now online. 149 | Earlier, due to a bug in the code, it was always falling back to 150 | the regular-expression-based tokenizer. This fix improves sentence 151 | quality.
  • 152 |
  • 2013-04-10 - Thanks to BoilerPipe, the URL passed to 153 | TL;DRzr no longer has to be a feed URL. It can now extract and 154 | parse general web page content.
  • 155 |
  • 2013-04-09 - Summarized URLs can now be saved as links 156 | of the form /tldr/?feed_url=url_goes_here.
  • 157 |
  • 2013-04-09 - There's a POST-based API to summarize text 158 | (up to 4MB) via an HTTP POST to /tldr/api/summarize. The parameters 159 | are input_text (mandatory) and sentence_count (optional: defaults 160 | to 5). This is running on a single Heroku dyno, so feel free to 161 | use this API, but please be gentle. :)
  • 162 |
163 |
164 |
165 | 166 |
167 |
168 | 169 |

How does it work?

170 |

171 | TL;DRzr uses an algorithm derived from Classifier4J. I used the 173 | basic algorithm from Classifier4J, optimized it, and added some 174 | refinements. 175 |

176 |

The basic algorithm for summarization works like this. It first 177 | tokenizes the text into words and then calculates the top N most 178 | frequent words (discarding stop words and single-occurrence words). 179 | It then scans the sentences and gets the first N sentences which 180 | feature any or all of the most frequent words. The sentences are 181 | sorted based on first occurrence in the original text and concatenated 182 | to create the summary. The user has control over how long the 183 | generated summary should be in terms of sentence count.

184 | 185 |

186 | TL;DRzr is written in Java and uses Jsoup for HTML text scraping, ROME for RSS feed parsing 189 | (which depends on JDOM). The parsing 190 | of sentences and word tokenization use OpenNLP. It uses the Porter2 192 | stemmer algorithm from here 194 | to process the tokens emitted by the tokenizer. The new "summarize 195 | any URL" feature uses BoilerPipe. 197 |

198 |
199 |
200 | 201 | 202 |
203 |
204 | 205 |

Credits

206 | 207 |

208 | TL;DRzr is a weekend project/quick hack demo created by Saurav Mohapatra. I wrote this 210 | as a fun weekend hack after reading about the Summly 212 | acquisition by Yahoo!. I had drunk too many Red Bulls and sleep 213 | was not too forthcoming. :) I always wished to try out Heroku and 214 | after a couple of hours of googling + coding, I put this together. 215 |

216 | 217 |

218 | The algorithm is a keyword-density-based one. As this is my 219 | current hobby project, I shall work on improving the algorithm. I 220 | plan on open-sourcing this codebase on GitHub. 222 |

223 | 224 |
225 |
226 | 227 | 228 | 240 |
241 | 242 |
243 | 244 | 248 | 249 | 250 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /src/main/webapp/js/main.js: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/main/webapp/js/plugins.js: -------------------------------------------------------------------------------- 1 | // Avoid `console` errors in browsers that lack a console. 2 | (function() { 3 | var method; 4 | var noop = function () {}; 5 | var methods = [ 6 | 'assert', 'clear', 'count', 'debug', 'dir', 'dirxml', 'error', 7 | 'exception', 'group', 'groupCollapsed', 'groupEnd', 'info', 'log', 8 | 'markTimeline', 'profile', 'profileEnd', 'table', 'time', 'timeEnd', 9 | 'timeStamp', 'trace', 'warn' 10 | ]; 11 | var length = methods.length; 12 | var console = (window.console = window.console || {}); 13 | 14 | while (length--) { 15 | method = methods[length]; 16 | 17 | // Only stub undefined methods. 18 | if (!console[method]) { 19 | console[method] = noop; 20 | } 21 | } 22 | }()); 23 | 24 | // Place any jQuery/helper plugins in here. 
25 | -------------------------------------------------------------------------------- /src/main/webapp/js/vendor/modernizr-2.6.2.min.js: -------------------------------------------------------------------------------- 1 | /* Modernizr 2.6.2 (Custom Build) | MIT & BSD 2 | * Build: http://modernizr.com/download/#-fontface-backgroundsize-borderimage-borderradius-boxshadow-flexbox-hsla-multiplebgs-opacity-rgba-textshadow-cssanimations-csscolumns-generatedcontent-cssgradients-cssreflections-csstransforms-csstransforms3d-csstransitions-applicationcache-canvas-canvastext-draganddrop-hashchange-history-audio-video-indexeddb-input-inputtypes-localstorage-postmessage-sessionstorage-websockets-websqldatabase-webworkers-geolocation-inlinesvg-smil-svg-svgclippaths-touch-webgl-shiv-mq-cssclasses-addtest-prefixed-teststyles-testprop-testallprops-hasevent-prefixes-domprefixes-load 3 | */ 4 | ;window.Modernizr=function(a,b,c){function D(a){j.cssText=a}function E(a,b){return D(n.join(a+";")+(b||""))}function F(a,b){return typeof a===b}function G(a,b){return!!~(""+a).indexOf(b)}function H(a,b){for(var d in a){var e=a[d];if(!G(e,"-")&&j[e]!==c)return b=="pfx"?e:!0}return!1}function I(a,b,d){for(var e in a){var f=b[a[e]];if(f!==c)return d===!1?a[e]:F(f,"function")?f.bind(d||b):f}return!1}function J(a,b,c){var d=a.charAt(0).toUpperCase()+a.slice(1),e=(a+" "+p.join(d+" ")+d).split(" ");return F(b,"string")||F(b,"undefined")?H(e,b):(e=(a+" "+q.join(d+" ")+d).split(" "),I(e,b,c))}function K(){e.input=function(c){for(var d=0,e=c.length;d',a,""].join(""),l.id=h,(m?l:n).innerHTML+=f,n.appendChild(l),m||(n.style.background="",n.style.overflow="hidden",k=g.style.overflow,g.style.overflow="hidden",g.appendChild(n)),i=c(l,a),m?l.parentNode.removeChild(l):(n.parentNode.removeChild(n),g.style.overflow=k),!!i},z=function(b){var c=a.matchMedia||a.msMatchMedia;if(c)return c(b).matches;var d;return y("@media "+b+" { #"+h+" { position: absolute; } 
}",function(b){d=(a.getComputedStyle?getComputedStyle(b,null):b.currentStyle)["position"]=="absolute"}),d},A=function(){function d(d,e){e=e||b.createElement(a[d]||"div"),d="on"+d;var f=d in e;return f||(e.setAttribute||(e=b.createElement("div")),e.setAttribute&&e.removeAttribute&&(e.setAttribute(d,""),f=F(e[d],"function"),F(e[d],"undefined")||(e[d]=c),e.removeAttribute(d))),e=null,f}var a={select:"input",change:"input",submit:"form",reset:"form",error:"img",load:"img",abort:"img"};return d}(),B={}.hasOwnProperty,C;!F(B,"undefined")&&!F(B.call,"undefined")?C=function(a,b){return B.call(a,b)}:C=function(a,b){return b in a&&F(a.constructor.prototype[b],"undefined")},Function.prototype.bind||(Function.prototype.bind=function(b){var c=this;if(typeof c!="function")throw new TypeError;var d=w.call(arguments,1),e=function(){if(this instanceof e){var a=function(){};a.prototype=c.prototype;var f=new a,g=c.apply(f,d.concat(w.call(arguments)));return Object(g)===g?g:f}return c.apply(b,d.concat(w.call(arguments)))};return e}),s.flexbox=function(){return J("flexWrap")},s.canvas=function(){var a=b.createElement("canvas");return!!a.getContext&&!!a.getContext("2d")},s.canvastext=function(){return!!e.canvas&&!!F(b.createElement("canvas").getContext("2d").fillText,"function")},s.webgl=function(){return!!a.WebGLRenderingContext},s.touch=function(){var c;return"ontouchstart"in a||a.DocumentTouch&&b instanceof DocumentTouch?c=!0:y(["@media (",n.join("touch-enabled),("),h,")","{#modernizr{top:9px;position:absolute}}"].join(""),function(a){c=a.offsetTop===9}),c},s.geolocation=function(){return"geolocation"in navigator},s.postmessage=function(){return!!a.postMessage},s.websqldatabase=function(){return!!a.openDatabase},s.indexedDB=function(){return!!J("indexedDB",a)},s.hashchange=function(){return A("hashchange",a)&&(b.documentMode===c||b.documentMode>7)},s.history=function(){return!!a.history&&!!history.pushState},s.draganddrop=function(){var a=b.createElement("div");return"draggable"in 
a||"ondragstart"in a&&"ondrop"in a},s.websockets=function(){return"WebSocket"in a||"MozWebSocket"in a},s.rgba=function(){return D("background-color:rgba(150,255,150,.5)"),G(j.backgroundColor,"rgba")},s.hsla=function(){return D("background-color:hsla(120,40%,100%,.5)"),G(j.backgroundColor,"rgba")||G(j.backgroundColor,"hsla")},s.multiplebgs=function(){return D("background:url(https://),url(https://),red url(https://)"),/(url\s*\(.*?){3}/.test(j.background)},s.backgroundsize=function(){return J("backgroundSize")},s.borderimage=function(){return J("borderImage")},s.borderradius=function(){return J("borderRadius")},s.boxshadow=function(){return J("boxShadow")},s.textshadow=function(){return b.createElement("div").style.textShadow===""},s.opacity=function(){return E("opacity:.55"),/^0.55$/.test(j.opacity)},s.cssanimations=function(){return J("animationName")},s.csscolumns=function(){return J("columnCount")},s.cssgradients=function(){var a="background-image:",b="gradient(linear,left top,right bottom,from(#9f9),to(white));",c="linear-gradient(left top,#9f9, white);";return D((a+"-webkit- ".split(" ").join(b+a)+n.join(c+a)).slice(0,-a.length)),G(j.backgroundImage,"gradient")},s.cssreflections=function(){return J("boxReflect")},s.csstransforms=function(){return!!J("transform")},s.csstransforms3d=function(){var a=!!J("perspective");return a&&"webkitPerspective"in g.style&&y("@media (transform-3d),(-webkit-transform-3d){#modernizr{left:9px;position:absolute;height:3px;}}",function(b,c){a=b.offsetLeft===9&&b.offsetHeight===3}),a},s.csstransitions=function(){return J("transition")},s.fontface=function(){var a;return y('@font-face {font-family:"font";src:url("https://")}',function(c,d){var e=b.getElementById("smodernizr"),f=e.sheet||e.styleSheet,g=f?f.cssRules&&f.cssRules[0]?f.cssRules[0].cssText:f.cssText||"":"";a=/src/i.test(g)&&g.indexOf(d.split(" ")[0])===0}),a},s.generatedcontent=function(){var a;return y(["#",h,"{font:0/0 
a}#",h,':after{content:"',l,'";visibility:hidden;font:3px/1 a}'].join(""),function(b){a=b.offsetHeight>=3}),a},s.video=function(){var a=b.createElement("video"),c=!1;try{if(c=!!a.canPlayType)c=new Boolean(c),c.ogg=a.canPlayType('video/ogg; codecs="theora"').replace(/^no$/,""),c.h264=a.canPlayType('video/mp4; codecs="avc1.42E01E"').replace(/^no$/,""),c.webm=a.canPlayType('video/webm; codecs="vp8, vorbis"').replace(/^no$/,"")}catch(d){}return c},s.audio=function(){var a=b.createElement("audio"),c=!1;try{if(c=!!a.canPlayType)c=new Boolean(c),c.ogg=a.canPlayType('audio/ogg; codecs="vorbis"').replace(/^no$/,""),c.mp3=a.canPlayType("audio/mpeg;").replace(/^no$/,""),c.wav=a.canPlayType('audio/wav; codecs="1"').replace(/^no$/,""),c.m4a=(a.canPlayType("audio/x-m4a;")||a.canPlayType("audio/aac;")).replace(/^no$/,"")}catch(d){}return c},s.localstorage=function(){try{return localStorage.setItem(h,h),localStorage.removeItem(h),!0}catch(a){return!1}},s.sessionstorage=function(){try{return sessionStorage.setItem(h,h),sessionStorage.removeItem(h),!0}catch(a){return!1}},s.webworkers=function(){return!!a.Worker},s.applicationcache=function(){return!!a.applicationCache},s.svg=function(){return!!b.createElementNS&&!!b.createElementNS(r.svg,"svg").createSVGRect},s.inlinesvg=function(){var a=b.createElement("div");return a.innerHTML="",(a.firstChild&&a.firstChild.namespaceURI)==r.svg},s.smil=function(){return!!b.createElementNS&&/SVGAnimate/.test(m.call(b.createElementNS(r.svg,"animate")))},s.svgclippaths=function(){return!!b.createElementNS&&/SVGClipPath/.test(m.call(b.createElementNS(r.svg,"clipPath")))};for(var L in s)C(s,L)&&(x=L.toLowerCase(),e[x]=s[L](),v.push((e[x]?"":"no-")+x));return e.input||K(),e.addTest=function(a,b){if(typeof a=="object")for(var d in a)C(a,d)&&e.addTest(d,a[d]);else{a=a.toLowerCase();if(e[a]!==c)return e;b=typeof b=="function"?b():b,typeof f!="undefined"&&f&&(g.className+=" "+(b?"":"no-")+a),e[a]=b}return e},D(""),i=k=null,function(a,b){function k(a,b){var 
c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function l(){var a=r.elements;return typeof a=="string"?a.split(" "):a}function m(a){var b=i[a[g]];return b||(b={},h++,a[g]=h,i[h]=b),b}function n(a,c,f){c||(c=b);if(j)return c.createElement(a);f||(f=m(c));var g;return f.cache[a]?g=f.cache[a].cloneNode():e.test(a)?g=(f.cache[a]=f.createElem(a)).cloneNode():g=f.createElem(a),g.canHaveChildren&&!d.test(a)?f.frag.appendChild(g):g}function o(a,c){a||(a=b);if(j)return a.createDocumentFragment();c=c||m(a);var d=c.frag.cloneNode(),e=0,f=l(),g=f.length;for(;e",f="hidden"in a,j=a.childNodes.length==1||function(){b.createElement("a");var a=b.createDocumentFragment();return typeof a.cloneNode=="undefined"||typeof a.createDocumentFragment=="undefined"||typeof a.createElement=="undefined"}()}catch(c){f=!0,j=!0}})();var r={elements:c.elements||"abbr article aside audio bdi canvas data datalist details figcaption figure footer header hgroup mark meter nav output progress section summary time video",shivCSS:c.shivCSS!==!1,supportsUnknownElements:j,shivMethods:c.shivMethods!==!1,type:"default",shivDocument:q,createElement:n,createDocumentFragment:o};a.html5=r,q(b)}(this,b),e._version=d,e._prefixes=n,e._domPrefixes=q,e._cssomPrefixes=p,e.mq=z,e.hasEvent=A,e.testProp=function(a){return H([a])},e.testAllProps=J,e.testStyles=y,e.prefixed=function(a,b,c){return b?J(a,b,c):J(a,"pfx")},g.className=g.className.replace(/(^|\s)no-js(\s|$)/,"$1$2")+(f?" 
js "+v.join(" "):""),e}(this,this.document),function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f 9 | 10 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | TL;DRzr - an algorithmic summary generation demo 31 | 32 | 33 | 36 |
37 |
38 | 39 | 59 |
60 |
61 |

62 | Generated Summary (up to 63 | ${summary.sentence_count} sentences) 64 |

65 |
66 |
67 | 68 |
69 |
70 |

71 | Generated in ${summary.millis} milliseconds. Read more : "> [ ">Summary 76 | Link ] 77 | 78 | 79 |

80 | 81 |

Original Text

82 |
83 |
84 | 85 |
86 |
87 | 88 | 89 |
90 |
91 | back to TL;DRzr 92 |
93 | 94 | 107 |
108 |
109 |
110 | 111 | 115 | 116 | 117 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /system.properties: -------------------------------------------------------------------------------- 1 | java.runtime.version=1.6 --------------------------------------------------------------------------------