├── src └── main │ ├── webapp │ ├── js │ │ ├── main.js │ │ ├── plugins.js │ │ └── vendor │ │ │ └── modernizr-2.6.2.min.js │ ├── favicon.ico │ ├── WEB-INF │ │ ├── en-sent.bin │ │ ├── en-token.bin │ │ └── web.xml │ ├── images │ │ └── xtractor_logo_header.png │ ├── xtractor.jsp │ ├── index.html │ └── css │ │ ├── main.css │ │ └── normalize.css │ └── java │ ├── org │ └── tartarus │ │ └── snowball │ │ ├── SnowballStemmer.java │ │ ├── Among.java │ │ ├── ext │ │ ├── norwegianStemmer.java │ │ ├── swedishStemmer.java │ │ ├── danishStemmer.java │ │ ├── germanStemmer.java │ │ └── dutchStemmer.java │ │ └── SnowballProgram.java │ ├── com │ └── mohaps │ │ ├── fetch │ │ ├── FetchException.java │ │ ├── Constants.java │ │ ├── HeadResult.java │ │ ├── FetchResult.java │ │ └── Fetcher.java │ │ ├── xtractor │ │ ├── Extractor.java │ │ ├── Main.java │ │ ├── ExtractorResult.java │ │ └── XTractorServlet.java │ │ └── tldr │ │ ├── summarize │ │ ├── IStopWords.java │ │ ├── ITokenizer.java │ │ ├── Factory.java │ │ ├── RegExTokenizer.java │ │ ├── Defaults.java │ │ ├── ISummarizer.java │ │ ├── SummaryCache.java │ │ ├── StopWords.java │ │ ├── OpenNLPTokenizer.java │ │ └── Summarizer.java │ │ └── utils │ │ └── Words.java │ └── de │ └── jetwick │ └── snacktory │ ├── ExtractedContent.java │ └── OutputFormatter.java ├── system.properties ├── .gitignore ├── Procfile ├── README.md └── pom.xml /src/main/webapp/js/main.js: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /system.properties: -------------------------------------------------------------------------------- 1 | java.runtime.version=1.6 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | .springBeans 5 | target 6 | .DS_Store 7 | 8 | 
-------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: java $JAVA_OPTS -cp target/classes:target/dependency/* com.mohaps.xtractor.Main 2 | -------------------------------------------------------------------------------- /src/main/webapp/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/xtractor/HEAD/src/main/webapp/favicon.ico -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/en-sent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/xtractor/HEAD/src/main/webapp/WEB-INF/en-sent.bin -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/xtractor/HEAD/src/main/webapp/WEB-INF/en-token.bin -------------------------------------------------------------------------------- /src/main/webapp/images/xtractor_logo_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohaps/xtractor/HEAD/src/main/webapp/images/xtractor_logo_header.png -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/SnowballStemmer.java: -------------------------------------------------------------------------------- 1 | package org.tartarus.snowball; 2 | 3 | 4 | 5 | public abstract class SnowballStemmer extends SnowballProgram { 6 | public abstract boolean stem(); 7 | }; 8 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/fetch/FetchException.java: 
-------------------------------------------------------------------------------- 1 | package com.mohaps.fetch; 2 | 3 | 4 | public class FetchException extends Exception { 5 | 6 | private static final long serialVersionUID = 1L; 7 | 8 | public FetchException() { 9 | super(); 10 | } 11 | public FetchException(String message) { 12 | super(message); 13 | } 14 | public FetchException(String message, Throwable error) { 15 | super(message, error); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/fetch/Constants.java: -------------------------------------------------------------------------------- 1 | package com.mohaps.fetch; 2 | 3 | public interface Constants { 4 | public static final String CHARSET_LATIN1 = "ISO-8859-1"; 5 | public static final String CHARSET_UTF8 = "UTF-8"; 6 | public static final String DEFAULT_CHARSET = CHARSET_LATIN1; 7 | public static final String DEFAULT_CONTENT_TYPE = "text/html"; 8 | public static final String BLANK_STRING = ""; 9 | public static final byte[] BLANK_BYTES = new byte[0]; 10 | } 11 | -------------------------------------------------------------------------------- /src/main/webapp/js/plugins.js: -------------------------------------------------------------------------------- 1 | // Avoid `console` errors in browsers that lack a console. 2 | (function() { 3 | var method; 4 | var noop = function () {}; 5 | var methods = [ 6 | 'assert', 'clear', 'count', 'debug', 'dir', 'dirxml', 'error', 7 | 'exception', 'group', 'groupCollapsed', 'groupEnd', 'info', 'log', 8 | 'markTimeline', 'profile', 'profileEnd', 'table', 'time', 'timeEnd', 9 | 'timeStamp', 'trace', 'warn' 10 | ]; 11 | var length = methods.length; 12 | var console = (window.console = window.console || {}); 13 | 14 | while (length--) { 15 | method = methods[length]; 16 | 17 | // Only stub undefined methods. 
18 | if (!console[method]) { 19 | console[method] = noop; 20 | } 21 | } 22 | }()); 23 | 24 | // Place any jQuery/helper plugins in here. 25 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | index.html 10 | 11 | 12 | 13 | xtractor 14 | com.mohaps.xtractor.XTractorServlet 15 | 16 | 17 | 18 | xtractor 19 | /xtractor 20 | 21 | 22 | xtractor 23 | /xtractor/* 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/Among.java: -------------------------------------------------------------------------------- 1 | package org.tartarus.snowball; 2 | 3 | import java.lang.reflect.Method; 4 | 5 | public class Among { 6 | public Among(String s, int substring_i, int result, String methodname, 7 | SnowballProgram methodobject) { 8 | this.s_size = s.length(); 9 | this.s = s.toCharArray(); 10 | this.substring_i = substring_i; 11 | this.result = result; 12 | this.methodobject = methodobject; 13 | if (methodname.length() == 0) { 14 | this.method = null; 15 | } else { 16 | try { 17 | this.method = methodobject.getClass().getDeclaredMethod( 18 | methodname, new Class[0]); 19 | } catch (NoSuchMethodException e) { 20 | throw new RuntimeException(e); 21 | } 22 | } 23 | } 24 | 25 | public final int s_size; /* search string */ 26 | public final char[] s; /* search string */ 27 | public final int substring_i; /* index to longest matching substring */ 28 | public final int result; /* result of the lookup */ 29 | public final Method method; /* method to use if substring matches */ 30 | public final SnowballProgram methodobject; /* object to invoke method on */ 31 | }; 32 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/fetch/HeadResult.java: 
-------------------------------------------------------------------------------- 1 | package com.mohaps.fetch; 2 | 3 | public class HeadResult { 4 | private boolean success; 5 | private String url; 6 | private String contentType; 7 | private long contentLength; 8 | private int statusCode; 9 | private HeadResult() { 10 | 11 | } 12 | public static HeadResult NewSuccess(String url, int statusCode, String contentType, long contentLength) { 13 | HeadResult result = new HeadResult(); 14 | result.url = url; 15 | result.success = true; 16 | result.contentType = contentType; 17 | result.contentLength = contentLength; 18 | result.statusCode = statusCode; 19 | return result; 20 | } 21 | public static HeadResult NewFailed(String url, int statusCode) { 22 | HeadResult result = new HeadResult(); 23 | result.url = url; 24 | result.success = false; 25 | result.statusCode = statusCode; 26 | return result; 27 | } 28 | @Override 29 | public String toString() { 30 | return "HeadResult [success=" + success + ", url=" + url 31 | + ", contentType=" + contentType + ", contentLength=" 32 | + contentLength + ", statusCode=" + statusCode + "]"; 33 | } 34 | public boolean isSuccess() { 35 | return success; 36 | } 37 | public String getUrl() { 38 | return url; 39 | } 40 | public String getContentType() { 41 | return contentType; 42 | } 43 | public long getContentLength() { 44 | return contentLength; 45 | } 46 | public int getStatusCode() { 47 | return statusCode; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/xtractor/Extractor.java: -------------------------------------------------------------------------------- 1 | package com.mohaps.xtractor; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStream; 5 | 6 | import org.jsoup.Jsoup; 7 | import org.jsoup.nodes.Document; 8 | 9 | import com.mohaps.fetch.Fetcher; 10 | 11 | import de.jetwick.snacktory.ArticleTextExtractor; 12 | import 
de.jetwick.snacktory.ExtractedContent; 13 | import de.jetwick.snacktory.OutputFormatter; 14 | 15 | public class Extractor { 16 | 17 | private OutputFormatter formatter = new OutputFormatter(); 18 | private Fetcher fetcher; 19 | public Extractor() { 20 | this.fetcher = null; 21 | } 22 | public Extractor(Fetcher fetcher) { 23 | this.fetcher = fetcher; 24 | } 25 | public ExtractorResult extract(byte[] content, String charset, 26 | String baseUrl) throws Exception { 27 | return extract(new ByteArrayInputStream(content), charset, baseUrl); 28 | } 29 | 30 | public ExtractorResult extract(InputStream input, String charset, 31 | String baseUrl) throws Exception { 32 | long startTime = System.currentTimeMillis(); 33 | ArticleTextExtractor extractor = new ArticleTextExtractor(fetcher); 34 | Document doc = Jsoup.parse(input, charset, baseUrl); 35 | ExtractedContent extracted = new ExtractedContent(); 36 | extracted.setOriginalUrl(baseUrl); 37 | extractor.extractContent(extracted, doc, formatter); 38 | // System.out.println(">> Extracted Image : "+extracted.getImageUrl()); 39 | long elapsed = System.currentTimeMillis() - startTime; 40 | ExtractorResult result = new ExtractorResult(extracted.getTitle(), 41 | extracted.getText(), 42 | baseUrl, 43 | extracted.getImageUrl(), 44 | extracted.getVideoUrl(), 45 | charset, 46 | elapsed, 47 | extracted.getKeywords()); 48 | return result; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/xtractor/Main.java: -------------------------------------------------------------------------------- 1 | package com.mohaps.xtractor; 2 | 3 | import org.eclipse.jetty.server.Server; 4 | import org.eclipse.jetty.webapp.WebAppContext; 5 | 6 | /** 7 | * 8 | * This class launches the web application in an embedded Jetty container. 9 | * This is the entry point to your application. The Java command that is used for 10 | * launching should fire this main method. 
11 | * 12 | */ 13 | public class Main { 14 | 15 | /** 16 | * @param args 17 | */ 18 | public static void main(String[] args) throws Exception{ 19 | String webappDirLocation = "src/main/webapp/"; 20 | 21 | //The port that we should run on can be set into an environment variable 22 | //Look for that variable and default to 8080 if it isn't there. 23 | String webPort = System.getenv("PORT"); 24 | if(webPort == null || webPort.isEmpty()) { 25 | webPort = "8080"; 26 | } 27 | 28 | Server server = new Server(Integer.valueOf(webPort)); 29 | WebAppContext root = new WebAppContext(); 30 | 31 | root.setContextPath("/"); 32 | root.setDescriptor(webappDirLocation+"/WEB-INF/web.xml"); 33 | root.setResourceBase(webappDirLocation); 34 | 35 | //Parent loader priority is a class loader setting that Jetty accepts. 36 | //By default Jetty will behave like most web containers in that it will 37 | //allow your application to replace non-server libraries that are part of the 38 | //container. Setting parent loader priority to true changes this behavior. 
39 | //Read more here: http://wiki.eclipse.org/Jetty/Reference/Jetty_Classloading 40 | root.setParentLoaderPriority(true); 41 | 42 | server.setHandler(root); 43 | 44 | server.start(); 45 | server.join(); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/xtractor/ExtractorResult.java: -------------------------------------------------------------------------------- 1 | package com.mohaps.xtractor; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collection; 5 | import java.util.List; 6 | 7 | public class ExtractorResult { 8 | private long timeTakenMillis; 9 | private String title; 10 | private String text; 11 | private String link; 12 | private String image; 13 | private String video; 14 | private String charset; 15 | private List keywords; 16 | private String summary; 17 | private String shortUrl; 18 | public ExtractorResult(String title, String text, String link, 19 | String image, String video, String charset, long timeTakenMillis, 20 | Collection keywords) { 21 | this.timeTakenMillis = timeTakenMillis; 22 | this.text = text; 23 | this.link = link; 24 | this.title = title; 25 | this.image = image; 26 | this.charset = charset; 27 | this.keywords = new ArrayList(keywords); 28 | this.video = video; 29 | this.shortUrl = link; 30 | } 31 | public void setShortUrl(String shortUrl){ this.shortUrl = shortUrl; } 32 | public void setSummary(String summary){ this.summary = summary; } 33 | public String getSummary(){ return summary; } 34 | public String getVideo() { 35 | return video; 36 | } 37 | public long getTimeTakenMillis() { 38 | return timeTakenMillis; 39 | } 40 | 41 | public String getTitle() { 42 | return title; 43 | } 44 | 45 | public String getText() { 46 | return text; 47 | } 48 | 49 | public String getLink() { 50 | return link; 51 | } 52 | 53 | public String getCharset() { 54 | return charset; 55 | } 56 | 57 | public String getImage() { 58 | return image; 59 | } 60 | 61 | 
public Collection getKeywords() { 62 | return this.keywords; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "ExtractorResult [\n" 68 | +" timeTakenMillis=" + timeTakenMillis 69 | + ",\n title=" + title 70 | + ",\n link=" + link 71 | + ",\n image=" + image 72 | + ",\n video=" +video 73 | + "]"; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/IStopWords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | /** 36 | * Stopwords filter 37 | * @author mohaps 38 | * 39 | */ 40 | public interface IStopWords { 41 | boolean isStopWord(String word); 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/ITokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 
25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | /** 36 | * tokenize a sentence into words 37 | * @author mohaps 38 | * 39 | */ 40 | public interface ITokenizer { 41 | String[] tokenize(String input) throws Exception; 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/fetch/FetchResult.java: -------------------------------------------------------------------------------- 1 | package com.mohaps.fetch; 2 | 3 | public class FetchResult { 4 | 5 | private String originalUrl; 6 | private String actualUrl; 7 | private int httpStatusCode; 8 | private long contentLength; 9 | private String contentType; 10 | private String charset; 11 | private long timeTakenMillis; 12 | private byte[] content; 13 | private ContentFlavor flavor; 14 | public enum ContentFlavor { 15 | PAGE, 16 | FEED, 17 | CUSTOM 18 | } 19 | 20 | public FetchResult(String originalUrl, String actualUrl, int httpStatusCode, long contentLength, String contentType, ContentFlavor flavor, String charset, long timeTakenMillis, byte[] content) { 21 | this.originalUrl = originalUrl; 22 | this.actualUrl = actualUrl; 23 | this.httpStatusCode = 
httpStatusCode; 24 | this.contentLength = contentLength; 25 | this.contentType = contentType; 26 | this.charset = charset; 27 | this.flavor = flavor; 28 | this.timeTakenMillis = timeTakenMillis; 29 | this.content = content; 30 | } 31 | public boolean isFeed() { 32 | return this.flavor.equals(ContentFlavor.FEED); 33 | } 34 | public boolean isPage() { 35 | return this.flavor.equals(ContentFlavor.PAGE); 36 | } 37 | public String getOriginalUrl() { 38 | return originalUrl; 39 | } 40 | public String getActualUrl() { 41 | return actualUrl; 42 | } 43 | public int getHttpStatusCode() { 44 | return httpStatusCode; 45 | } 46 | public long getContentLength() { 47 | return contentLength; 48 | } 49 | public String getContentType() { 50 | return contentType; 51 | } 52 | public String getCharset() { 53 | return charset; 54 | } 55 | public long getTimeTakenMillis() { 56 | return timeTakenMillis; 57 | } 58 | public byte[] getContent() { 59 | return content; 60 | } 61 | @Override 62 | public String toString() { 63 | return "FetchResult \n[\n originalUrl=" + originalUrl + ",\n actualUrl=" 64 | + actualUrl + ",\n httpStatusCode=" + httpStatusCode 65 | + ",\n contentLength=" + contentLength + ",\n contentType=" 66 | + contentType + ",\n charset=" + charset+ ",\n flavor=" + flavor + ",\n timeTakenMillis=" 67 | + timeTakenMillis + "\n]"; 68 | } 69 | 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/Factory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 
9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | /** 36 | * All the default implementations in one handy place (pre-initialized) 37 | * @author mohaps 38 | * 39 | */ 40 | public final class Factory { 41 | public static final IStopWords DEFAULT_STOPWORDS = new StopWords(); 42 | public static final ITokenizer DEFAULT_TOKENIZER = new OpenNLPTokenizer(); 43 | public static final ISummarizer DEFAULT_SUMMARIZER = new Summarizer(DEFAULT_STOPWORDS, DEFAULT_TOKENIZER); 44 | public static final IStopWords getStopWords() { 45 | return DEFAULT_STOPWORDS; 46 | } 47 | public static final ITokenizer getTokenizer() { 48 | return DEFAULT_TOKENIZER; 49 | } 50 | public static final ISummarizer getSummarizer() { 51 | return DEFAULT_SUMMARIZER; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/RegExTokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 
25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | /** 35 | * Simple regex based tokenizer (used as a fallback) 36 | * @author mohaps 37 | * 38 | */ 39 | public class RegExTokenizer implements ITokenizer { 40 | private String tokenRegEx; 41 | public RegExTokenizer() { 42 | this(Defaults.REGEX_WORDS); 43 | } 44 | public RegExTokenizer(String tokenRegEx) { 45 | this.tokenRegEx = tokenRegEx; 46 | if(this.tokenRegEx == null) { 47 | this.tokenRegEx = Defaults.REGEX_WORDS; 48 | } 49 | } 50 | public String[] tokenize(String input) throws Exception{ 51 | if(input == null || input.length() == 0){ 52 | return Defaults.BLANK_STRING_ARRAY; 53 | } else { 54 | return input.split(tokenRegEx); 55 | } 56 | } 57 | public String toString() { 58 | return new StringBuilder("RegExTokenizer(regex=\"").append(tokenRegEx).append("\"").toString(); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/Defaults.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav 
Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | /** 36 | * Default values and constants 37 | * @author mohaps 38 | * 39 | */ 40 | public final class Defaults { 41 | public static final int MAX_SENTENCES = 4; 42 | public static final int MAX_MOST_FREQUENT_WORDS = 20; 43 | public static final int MIN_WORDS_PER_SENTENCE = 5; 44 | public static final int AVG_WORDS_PER_SENTENCE = 20; 45 | 46 | public static final String REGEX_WHITESPACE = "\\W"; 47 | public static final String REGEX_WORDS = "\\s"; 48 | public static final String REGEX_SENTENCES = "(\\.|!|\\?)+(\\s|\\z)+"; 49 | public static final String BLANK_STRING = ""; 50 | public static final byte[] BLANK_BYTES = new byte[0]; 51 | public static final boolean SHOULD_IGNORE_SINGLE_OCCURENCES = true; 52 | public static final String[] BLANK_STRING_ARRAY = new String[0]; 53 | public static final int SUMMARY_LENGTH = 4; 54 | public static final int MAX_API_INPUT_LENGTH = 4*1024; 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/ISummarizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 
21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.util.Set; 36 | 37 | /** 38 | * The summarizer interface 39 | * @author mohaps 40 | * 41 | */ 42 | public interface ISummarizer { 43 | /** 44 | * summarize given text (upto sentenceCount sentences) 45 | * @param input - the input text to summarize 46 | * @param sentenceCount - macimum sentence length of the summary 47 | * @param maxFrequentWords - how many keywords to extract at the most 48 | * @param shouldIgnoreSingleOccurences - if we should consider words with occurence count 1 49 | * @return 50 | * @throws Exception 51 | */ 52 | String summarize(final String input, int sentenceCount, int maxFrequentWords, boolean shouldIgnoreSingleOccurences) throws Exception; 53 | String summarize(String input, int sentenceCount) throws Exception; 54 | 55 | /** 56 | * WIP: extract keywords from a given input text 57 | * @param input 58 | * @param maxKeyWords - number of keywords to extract 59 | * @return 60 | * @throws Exception 61 | */ 62 | Set keywords(final String input, int maxKeyWords) throws Exception; 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/SummaryCache.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 
17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.util.*; 36 | /** 37 | * A summary cache (psuedo-LRU, doesn't refresh keys) 38 | * In a real world service, this would be in memcached/redis type store. 
/**
 * A small, process-wide, in-memory LRU cache of generated summaries, keyed by
 * a hash of the input text.
 * <p>
 * Reads refresh an entry's position, so eviction removes the least recently
 * used entries. Eviction is batched: once the cache grows past
 * {@code MAX_CACHE_SIZE + 10} entries it is trimmed back to
 * {@link #MAX_CACHE_SIZE}. All access is synchronized because the singleton
 * is shared between request threads.
 * <p>
 * In a real world service, this would live in a memcached/redis type store.
 *
 * @author mohaps
 */
public final class SummaryCache {
	/** Number of entries kept after an eviction pass. */
	public static final int MAX_CACHE_SIZE = 40;
	/** How far the cache may grow beyond MAX_CACHE_SIZE before it is trimmed. */
	private static final int EVICTION_SLACK = 10;

	private static final SummaryCache sInstance = new SummaryCache();

	/** @return the shared cache instance */
	public static final SummaryCache instance() { return sInstance; }

	/** Map key wrapping a byte[] so that equality and hashing are by content, not identity. */
	static final class Key {
		private final byte[] key;

		Key(byte[] key) { this.key = key; }

		@Override
		public boolean equals(Object o) {
			// instanceof is false for null, so no separate null check is needed
			return (o instanceof Key) && Arrays.equals(((Key) o).key, key);
		}

		@Override
		public int hashCode() { return Arrays.hashCode(key); }
	}

	// accessOrder=true: iteration runs from least to most recently used entry
	private final Map<Key, String> cache = new LinkedHashMap<Key, String>(64, 0.75f, true);

	/**
	 * Stores a summary under the given input hash, evicting the least recently
	 * used entries if the cache has grown too large.
	 *
	 * @param textHash hash of the summarized input (compared by content)
	 * @param summary the summary to cache
	 */
	public synchronized void put(byte[] textHash, String summary) {
		cache.put(new Key(textHash), summary);
		if (cache.size() > MAX_CACHE_SIZE + EVICTION_SLACK) {
			Iterator<Key> eldestFirst = cache.keySet().iterator();
			while (cache.size() > MAX_CACHE_SIZE && eldestFirst.hasNext()) {
				eldestFirst.next();
				eldestFirst.remove();
			}
		}
	}

	/**
	 * Looks up a cached summary and marks it as most recently used.
	 *
	 * @param inputHash hash of the input text (compared by content)
	 * @return the cached summary, or null if there is none
	 */
	public synchronized String get(byte[] inputHash) {
		return cache.get(new Key(inputHash));
	}
}
8 | 9 | I wrote this close to 3 years ago (https://news.ycombinator.com/item?id=5580719) 10 | 11 | It uses snacktory for DOM manipulation and tldrzr (http://github.com/mohaps/tldrzr) for summarization. 12 | 13 | 14 | I was never happy with the code quality and always intended to get back to it. I received an email from someone who wished to use the code in an open source project, so I am open-sourcing this under the BSD license. Feel free to hack on it and make it better. :) 15 | 16 | While reading the code, please keep in mind that it was written within a couple of hours of hackathon-style coding :) with the sole goal of getting something working end to end. If you have questions... well, it's been so long, I might not even remember why I did some things the way I did. :) But feel free to drop me a line at mohaps AT gmail DOT com 17 | 18 | Have fun! Keep hacking. 19 | 20 | ## Running the application locally 21 | 22 | First build with: 23 | 24 | $mvn clean install 25 | 26 | Then run it with: 27 | 28 | $java -cp target/classes:target/dependency/* com.mohaps.xtractor.Main 29 | 30 | ## Library usage example 31 | 32 | ### With Fetcher 33 | 34 | Fetcher fetcher = new Fetcher(); 35 | Extractor extractor = new Extractor(fetcher); 36 | ISummarizer summarizer = Factory.getSummarizer(); 37 | 38 | int summarySentenceNb = 1; 39 | long timeout = 1000; // 1s 40 | String url = "http://www.bbc.com/news/science-environment-34510869"; 41 | FetchResult fResult = fetcher.fetch(url, timeout); 42 | ExtractorResult eResult = extractor.extract(fResult.getContent(), fResult.getCharset(), fResult.getActualUrl()); 43 | String summary = summarizer.summarize(eResult.getText(), summarySentenceNb); 44 | 45 | System.out.print("title: \t"+ SHelper.replaceSmartQuotes(eResult.getTitle()) + "\n"); 46 | System.out.print("summary:\t"+ SHelper.replaceSmartQuotes(summary) + "\n"); 47 | System.out.print("image: \t"+ eResult.getImage() + "\n"); 48 | System.out.print("video: \t"+ eResult.getVideo() + "\n"); 49 | 
System.out.print("body: \t"+ SHelper.replaceSmartQuotes(eResult.getText()) + "\n"); 50 | 51 | fetcher.shutdown(); // Must be shut down so the process can exit 52 | 53 | ### Without Fetcher 54 | 55 | Extractor extractor = new Extractor(); 56 | ISummarizer summarizer = Factory.getSummarizer(); 57 | 58 | int summarySentenceNb = 1; 59 | String charset = "UTF-8"; 60 | String url = "http://www.random.com"; 61 | String html = "
Hello World
" 62 | ExtractorResult eResult = extractor.extract(content, charset, url); 63 | String summary = summarizer.summarize(eResult.getText(), summarySentenceNb); 64 | 65 | System.out.print("title: \t"+ SHelper.replaceSmartQuotes(eResult.getTitle()) + "\n"); 66 | System.out.print("summary:\t"+ SHelper.replaceSmartQuotes(summary) + "\n"); 67 | System.out.print("body: \t"+ SHelper.replaceSmartQuotes(eResult.getText()) + "\n"); 68 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/StopWords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.util.HashSet; 36 | import java.util.Set; 37 | /** 38 | * Default set of stop words 39 | * @author mohaps 40 | * 41 | */ 42 | public class StopWords implements IStopWords { 43 | public static final String[] STOPWORDS = new String[] { "a", "able", 44 | "about", "across", "after", "all", "almost", "also", "am", "among", 45 | "an", "and", "any", "are", "as", "at", "be", "because", "been", 46 | "but", "by", "can", "cannot", "can\'t", "could", "dear", "did", 47 | "do", "does", "either", "else", "ever", "every", "for", "from", 48 | "get", "got", "had", "has", "have", "he", "her", "hers", "him", 49 | "his", "how", "however", "i", "if", "in", "into", "is", "it", 50 | "its", "just", "least", "let", "like", "likely", "may", "me", 51 | "might", "most", "must", "my", "neither", "no", "nor", "not", "of", 52 | "off", "often", "on", "only", "or", "other", "our", "own", 53 | "rather", "said", "say", "says", "she", "should", "since", "so", 54 | "some", "than", "that", "the", "their", "them", "then", "there", 55 | "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", 56 | "was", "we", "were", "what", "when", "where", "which", "while", 57 | "who", "whom", "why", "will", "with", "would", "won\'t", "yet", 58 | "you", "your" }; 59 | 60 | private Set stopWords = new HashSet(); 61 | 62 | public StopWords() { 63 | for(int i = 0; i < STOPWORDS.length; i++) { 64 | 
stopWords.add(STOPWORDS[i]); 65 | } 66 | } 67 | 68 | public boolean isStopWord(String word) { 69 | return word == null || word.length() < 2 || stopWords.contains(word); 70 | } 71 | 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/OpenNLPTokenizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.io.IOException; 36 | import java.io.InputStream; 37 | 38 | import com.mohaps.tldr.utils.Words; 39 | 40 | import opennlp.tools.tokenize.Tokenizer; 41 | import opennlp.tools.tokenize.TokenizerME; 42 | import opennlp.tools.tokenize.TokenizerModel; 43 | 44 | /** 45 | * Uses the OpenNLP tokenizer (falls back to RegExTokenizer if it can't find the model files for OpenNLP) 46 | * @author mohaps 47 | * 48 | */ 49 | public class OpenNLPTokenizer implements ITokenizer { 50 | 51 | private static TokenizerModel TOKENIZER_MODEL; 52 | private static RegExTokenizer FALLBACK; 53 | static { 54 | try { 55 | InputStream inputFile = Words.class.getClassLoader().getResourceAsStream("en-token.bin"); 56 | if(inputFile != null) { 57 | try { 58 | TOKENIZER_MODEL = new TokenizerModel(inputFile); 59 | System.out.println(">> OpenNLP Tokenizer Model loaded!"); 60 | } finally { 61 | if(inputFile != null) { 62 | try { inputFile.close(); } catch (Throwable t){} 63 | } 64 | } 65 | } 66 | } catch (IOException ex) { 67 | System.err.println("Failed to load token model for OpenNLP. error = "+ex.getLocalizedMessage()+". 
Will fall back to regex based token parsing"); 68 | ex.printStackTrace(); 69 | } finally { 70 | if(TOKENIZER_MODEL == null) { 71 | FALLBACK = new RegExTokenizer(); 72 | } 73 | } 74 | } 75 | 76 | public String[] tokenize(String input) throws Exception { 77 | if(TOKENIZER_MODEL != null) { 78 | Tokenizer tokenizer = new TokenizerME(TOKENIZER_MODEL); 79 | return tokenizer.tokenize(input); 80 | } else { 81 | return FALLBACK.tokenize(input); 82 | } 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | com.mohaps 6 | 1.0-SNAPSHOT 7 | XTractor 8 | jar 9 | 10 | 11 | 1.6 12 | UTF-8 13 | 14 | 15 | 16 | 17 | maven2-repository.dev.java.net 18 | Java.net Repository for Maven 19 | http://download.java.net/maven/2/ 20 | default 21 | 22 | 23 | repo1.maven.org 24 | OpenNLP repository 25 | http://repo1.maven.org/maven2/org/apache/opennlp/ 26 | default 27 | 28 | 29 | 30 | 31 | 32 | javax.servlet 33 | servlet-api 34 | 2.5 35 | 36 | 37 | 38 | 39 | org.eclipse.jetty 40 | jetty-servlet 41 | 7.6.0.v20120127 42 | 43 | 44 | org.eclipse.jetty 45 | jetty-webapp 46 | 7.6.0.v20120127 47 | 48 | 49 | org.mortbay.jetty 50 | jsp-2.1-glassfish 51 | 2.1.v20100127 52 | 53 | 54 | org.apache.httpcomponents 55 | httpasyncclient 56 | 4.0-beta3 57 | 58 | 59 | org.jsoup 60 | jsoup 61 | 1.7.2 62 | 63 | 64 | 65 | com.rosaloves 66 | bitlyj 67 | 2.0.0 68 | 69 | 70 | 71 | org.apache.opennlp 72 | opennlp-tools 73 | 1.5.2-incubating 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.apache.maven.plugins 81 | maven-dependency-plugin 82 | 2.4 83 | 84 | 85 | copy-dependencies 86 | package 87 | 88 | copy-dependencies 89 | 90 | 91 | 92 | 93 | 94 | org.apache.maven.plugins 95 | maven-compiler-plugin 96 | 97 | UTF-8 98 | 99 | 100 | 101 | 102 | xtractor 103 | http://xtractor.herokuapp.com 104 | A demo of article text extraction from webpages (uses the readability/snacktory 
codebase as a starting point and adds improved heuristics) 105 | 106 | -------------------------------------------------------------------------------- /src/main/java/de/jetwick/snacktory/ExtractedContent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Peter Karich 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package de.jetwick.snacktory; 17 | 18 | import java.io.Serializable; 19 | import java.util.Collection; 20 | 21 | /** 22 | * Parsed result from web page containing important title, text and image. 
/**
 * Value object holding what was extracted from a web page: title, text,
 * image and related URLs. Setters other than {@link #setKeywords} return
 * {@code this} so calls can be chained.
 *
 * @author Peter Karich
 */
public class ExtractedContent implements Serializable {

    private static final long serialVersionUID = 1L;

    private String title;
    private String text;
    private String description;
    private String dateString;
    private String url;
    private String originalUrl;
    private String canonicalUrl;
    private String imageUrl;
    private String videoUrl;
    private String rssUrl;
    private String faviconUrl;
    private Collection keywords;

    public ExtractedContent() {
    }

    /** Maps null to the empty string; used by the getters that never return null. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }

    /** @return the url, or "" if unset */
    public String getUrl() {
        return orEmpty(url);
    }

    public ExtractedContent setUrl(String url) {
        this.url = url;
        return this;
    }

    public ExtractedContent setOriginalUrl(String originalUrl) {
        this.originalUrl = originalUrl;
        return this;
    }

    /** @return the original url; may be null */
    public String getOriginalUrl() {
        return originalUrl;
    }

    public ExtractedContent setCanonicalUrl(String canonicalUrl) {
        this.canonicalUrl = canonicalUrl;
        return this;
    }

    /** @return the canonical url; may be null */
    public String getCanonicalUrl() {
        return canonicalUrl;
    }

    /** @return the favicon url, or "" if unset */
    public String getFaviconUrl() {
        return orEmpty(faviconUrl);
    }

    public ExtractedContent setFaviconUrl(String faviconUrl) {
        this.faviconUrl = faviconUrl;
        return this;
    }

    public ExtractedContent setRssUrl(String rssUrl) {
        this.rssUrl = rssUrl;
        return this;
    }

    /** @return the rss url, or "" if unset */
    public String getRssUrl() {
        return orEmpty(rssUrl);
    }

    /** @return the description, or "" if unset */
    public String getDescription() {
        return orEmpty(description);
    }

    public ExtractedContent setDescription(String description) {
        this.description = description;
        return this;
    }

    /** @return the image url, or "" if unset */
    public String getImageUrl() {
        return orEmpty(imageUrl);
    }

    public ExtractedContent setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
        return this;
    }

    /** @return the extracted text, or "" if unset */
    public String getText() {
        return orEmpty(text);
    }

    public ExtractedContent setText(String text) {
        this.text = text;
        return this;
    }

    /** @return the title, or "" if unset */
    public String getTitle() {
        return orEmpty(title);
    }

    public ExtractedContent setTitle(String title) {
        this.title = title;
        return this;
    }

    /** @return the video url, or "" if unset */
    public String getVideoUrl() {
        return orEmpty(videoUrl);
    }

    public ExtractedContent setVideoUrl(String videoUrl) {
        this.videoUrl = videoUrl;
        return this;
    }

    public ExtractedContent setDate(String date) {
        this.dateString = date;
        return this;
    }

    /** @return the keywords; may be null */
    public Collection getKeywords() {
        return keywords;
    }

    public void setKeywords(Collection keywords) {
        this.keywords = keywords;
    }

    /**
     * @return get date from url or guessed from text; may be null
     */
    public String getDate() {
        return dateString;
    }

    /** Short debug form; note that the text is printed raw, so an unset text shows as "null". */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder();
        out.append("title:").append(getTitle());
        out.append(" imageUrl:").append(getImageUrl());
        out.append(" text:").append(text);
        return out.toString();
    }
}
javax.servlet.ServletOutputStream; 11 | import javax.servlet.http.HttpServlet; 12 | import javax.servlet.http.HttpServletRequest; 13 | import javax.servlet.http.HttpServletResponse; 14 | 15 | import com.mohaps.fetch.FetchResult; 16 | import com.mohaps.fetch.Fetcher; 17 | import com.mohaps.tldr.summarize.Defaults; 18 | import com.mohaps.tldr.summarize.Factory; 19 | import com.mohaps.tldr.summarize.ISummarizer; 20 | import com.mohaps.tldr.summarize.Summarizer; 21 | 22 | import de.jetwick.snacktory.SHelper; 23 | import com.rosaloves.bitlyj.*; 24 | import static com.rosaloves.bitlyj.Bitly.*; 25 | public class XTractorServlet extends HttpServlet { 26 | 27 | private static final Logger LOG = Logger.getLogger(XTractorServlet.class 28 | .getName()); 29 | private static Fetcher fetcher; 30 | static { 31 | try { 32 | fetcher = new Fetcher(); 33 | } catch (Exception ex) { 34 | LOG.log(Level.SEVERE, "Failed to initialize fetcher", ex); 35 | } 36 | } 37 | /** 38 | * 39 | */ 40 | private static final long serialVersionUID = 1L; 41 | private static final String BITLY_APP_KEY = "R_3f011cf0d9540bd4e89456a006a1f388"; 42 | private static final String BITLY_APP_ID = "tldrzr"; 43 | 44 | @Override 45 | protected void doGet(HttpServletRequest req, HttpServletResponse resp) 46 | throws ServletException, IOException { 47 | String pathInfo = req.getPathInfo(); 48 | if (pathInfo == null || pathInfo.length() == 0 || pathInfo.equals("/")) { 49 | String pageUrl = req.getParameter("url"); 50 | 51 | 52 | if (pageUrl != null && pageUrl.length() > 0) { 53 | extractAndShowPage(pageUrl, req, resp); 54 | } else { 55 | showHomePage(req, resp); 56 | } 57 | } else { 58 | resp.sendError(404, "Path " + pathInfo + " not found"); 59 | } 60 | } 61 | 62 | protected void extractAndShowPage(String pageUrl, HttpServletRequest req, 63 | HttpServletResponse resp) throws ServletException, IOException { 64 | 65 | try { 66 | 67 | ExtractorResult result = fetchAndExtractFromUrl(pageUrl); 68 | String text2 = 
SHelper.replaceSmartQuotes(result.getText()); 69 | String[] paras = text2.split("\\n"); 70 | String fixed_title = SHelper.replaceSmartQuotes(result.getTitle()); 71 | req.setAttribute("extracted_title", fixed_title); 72 | req.setAttribute("extracted", result); 73 | req.setAttribute("extracted_paragraphs", paras); 74 | String absUrl = "http://xtractor.herokuapp.com/xtractor/?url="+URLEncoder.encode(pageUrl,result.getCharset()); 75 | //System.out.println(">>> Absolute Url : "+absUrl); 76 | String shortUrl = shortenUrl(absUrl); 77 | //System.out.println(">> Short url : "+shortUrl); 78 | result.setShortUrl(shortUrl); 79 | String tweetText = fixed_title.trim(); 80 | if(tweetText.length() > 100) { 81 | tweetText = tweetText.substring(0,100)+"..."; 82 | } 83 | tweetText="\""+tweetText+"\" [via XTractor/@tldrzr]"; 84 | String tweetItUrl = "https://twitter.com/share?text="+URLEncoder.encode(tweetText, result.getCharset())+"&url="+URLEncoder.encode(shortUrl, result.getCharset())+"&related=tldrzr"; 85 | //System.out.println("Tweet it url : "+tweetItUrl); 86 | req.setAttribute("tweet_it_url", tweetItUrl); 87 | String encodedPageUrl = URLEncoder.encode(pageUrl, result.getCharset()); 88 | req.setAttribute("encoded_page_url", encodedPageUrl); 89 | 90 | 91 | 92 | 93 | req.getRequestDispatcher("/xtractor.jsp").forward(req, resp); 94 | } catch (Exception ex) { 95 | resp.sendError(500, "Failed to extract from url " + pageUrl 96 | + ". 
error = " + ex.getLocalizedMessage()); 97 | //ex.printStackTrace(); 98 | } 99 | 100 | } 101 | 102 | protected String shortenUrl(String longUrl) { 103 | 104 | try { 105 | Url url = as(BITLY_APP_ID,BITLY_APP_KEY).call(shorten(longUrl)); 106 | return url.getShortUrl(); 107 | } catch (Exception ex) { 108 | System.err.println(">> Failed to shorten url "+longUrl+" on bitly : "+ex.getLocalizedMessage()); 109 | return longUrl; 110 | } 111 | 112 | } 113 | protected void showHomePage(HttpServletRequest req, HttpServletResponse resp) 114 | throws ServletException, IOException { 115 | resp.sendRedirect("/"); 116 | } 117 | 118 | private ExtractorResult fetchAndExtractFromUrl(String urlString) 119 | throws Exception { 120 | FetchResult fResult; 121 | fResult = fetcher.fetch(urlString); 122 | if (fResult != null && fResult.isPage()) { 123 | Extractor extractor = new Extractor(fetcher); 124 | ExtractorResult eResult = extractor.extract(fResult.getContent(), 125 | fResult.getCharset(), fResult.getActualUrl()); 126 | 127 | try { 128 | ISummarizer summarizer = Factory.getSummarizer(); 129 | 130 | String summary = summarizer.summarize(eResult.getText(), 131 | Defaults.MAX_SENTENCES + 1); 132 | if(summary != null) { 133 | eResult.setSummary(SHelper.replaceSmartQuotes(summary)); 134 | } 135 | } catch (Exception ex) { 136 | LOG.warning("Failed to summarize extracted text. error = " 137 | + ex.getLocalizedMessage()); 138 | } 139 | return eResult; 140 | } else if (fResult.isFeed()) { 141 | throw new Exception( 142 | "Only pages supported for now. 
Feed parsing not implemented yet!"); 143 | } else { 144 | throw new Exception("Extraction failed for " + fResult); 145 | } 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /src/main/java/de/jetwick/snacktory/OutputFormatter.java: -------------------------------------------------------------------------------- 1 | package de.jetwick.snacktory; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.select.Elements; 6 | 7 | import java.util.Arrays; 8 | import java.util.List; 9 | import java.util.regex.Pattern; 10 | import org.jsoup.nodes.Node; 11 | import org.jsoup.nodes.TextNode; 12 | 13 | /** 14 | * @author goose | jim 15 | * @author karussell 16 | * 17 | * this class will be responsible for taking our top node and stripping out junk we don't want and 18 | * getting it ready for how we want it presented to the user 19 | */ 20 | public class OutputFormatter { 21 | 22 | public static final int MIN_PARAGRAPH_TEXT = 10; 23 | private static final List NODES_TO_REPLACE = Arrays.asList("strong", "b", "i"); 24 | private Pattern unlikelyPattern = Pattern.compile("display\\:none|visibility\\:hidden"); 25 | protected final int minParagraphText; 26 | protected final List nodesToReplace; 27 | 28 | public OutputFormatter() { 29 | this(MIN_PARAGRAPH_TEXT, NODES_TO_REPLACE); 30 | } 31 | 32 | public OutputFormatter(int minParagraphText) { 33 | this(minParagraphText, NODES_TO_REPLACE); 34 | } 35 | 36 | public OutputFormatter(int minParagraphText, List nodesToReplace) { 37 | this.minParagraphText = minParagraphText; 38 | this.nodesToReplace = nodesToReplace; 39 | } 40 | 41 | /** 42 | * takes an element and turns the P tags into \n\n 43 | */ 44 | public String getFormattedText(Element topNode) { 45 | removeNodesWithNegativeScores(topNode); 46 | StringBuilder sb = new StringBuilder(); 47 | append(topNode, sb, "p"); 48 | String strPre = sb.toString(); 49 | String str = SHelper.innerTrim(strPre); 
50 | if (str.length() > 100) { 51 | 52 | return str; 53 | } 54 | 55 | // no subelements 56 | if (str.isEmpty() || !topNode.text().isEmpty() && str.length() <= topNode.ownText().length()) 57 | str = topNode.text(); 58 | 59 | //mohaps: trying to preserve linebreaks 60 | String[] paras = str.split("\\n"); 61 | StringBuilder ret = new StringBuilder(); 62 | for(String para : paras) { 63 | ret.append(Jsoup.parse(para).text()); 64 | ret.append("\n\n"); 65 | } 66 | // if jsoup failed to parse the whole html now parse this smaller 67 | // snippet again to avoid html tags disturbing our text: 68 | return ret.toString(); 69 | } 70 | 71 | 72 | /** 73 | * If there are elements inside our top node that have a negative gravity score remove them 74 | */ 75 | protected void removeNodesWithNegativeScores(Element topNode) { 76 | Elements gravityItems = topNode.select("*[gravityScore]"); 77 | for (Element item : gravityItems) { 78 | int score = Integer.parseInt(item.attr("gravityScore")); 79 | if (score < 0 || item.text().length() < minParagraphText) 80 | item.remove(); 81 | } 82 | } 83 | 84 | protected void append(Element node, StringBuilder sb, String tagName) { 85 | // is select more costly then getElementsByTag? 
86 | MAIN: 87 | for (Element e : node.select(tagName)) { 88 | Element tmpEl = e; 89 | // check all elements until 'node' 90 | while (tmpEl != null && !tmpEl.equals(node)) { 91 | if (unlikely(tmpEl)) 92 | continue MAIN; 93 | tmpEl = tmpEl.parent(); 94 | } 95 | 96 | String text = node2Text(e); 97 | if (text.isEmpty() || text.length() < minParagraphText || text.length() > SHelper.countLetters(text) * 2) 98 | continue; 99 | 100 | sb.append(text); 101 | sb.append("\n\n"); 102 | } 103 | } 104 | 105 | boolean unlikely(Node e) { 106 | if (e.attr("class") != null && e.attr("class").toLowerCase().contains("caption")) 107 | return true; 108 | 109 | String style = e.attr("style"); 110 | String clazz = e.attr("class"); 111 | if (unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find()) 112 | return true; 113 | return false; 114 | } 115 | 116 | void appendTextSkipHidden(Element e, StringBuilder accum) { 117 | for (Node child : e.childNodes()) { 118 | if (unlikely(child)) 119 | continue; 120 | if (child instanceof TextNode) { 121 | TextNode textNode = (TextNode) child; 122 | String txt = textNode.text(); 123 | accum.append(txt); 124 | } else if (child instanceof Element) { 125 | Element element = (Element) child; 126 | if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum)) 127 | accum.append(" "); 128 | else if (element.tagName().equals("br")) 129 | accum.append(" "); 130 | appendTextSkipHidden(element, accum); 131 | } 132 | } 133 | } 134 | 135 | boolean lastCharIsWhitespace(StringBuilder accum) { 136 | if (accum.length() == 0) 137 | return false; 138 | return Character.isWhitespace(accum.charAt(accum.length() - 1)); 139 | } 140 | 141 | protected String node2TextOld(Element el) { 142 | return el.text(); 143 | } 144 | 145 | protected String node2Text(Element el) { 146 | StringBuilder sb = new StringBuilder(200); 147 | appendTextSkipHidden(el, sb); 148 | return sb.toString(); 149 | } 150 | 151 | public OutputFormatter 
setUnlikelyPattern(String unlikelyPattern) { 152 | this.unlikelyPattern = Pattern.compile(unlikelyPattern); 153 | return this; 154 | } 155 | 156 | public OutputFormatter appendUnlikelyPattern(String str) { 157 | return setUnlikelyPattern(unlikelyPattern.toString() + "|" + str); 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/summarize/Summarizer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission. 25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.summarize; 34 | 35 | import java.security.MessageDigest; 36 | import java.util.Comparator; 37 | import java.util.HashMap; 38 | import java.util.Iterator; 39 | import java.util.Set; 40 | import java.util.TreeSet; 41 | 42 | import com.mohaps.tldr.utils.Words; 43 | /** 44 | * The summarizer implementation (uses keyword density to create a summary) 45 | * @author mohaps 46 | * 47 | */ 48 | public class Summarizer implements ISummarizer { 49 | private IStopWords stopWords; 50 | private ITokenizer tokenizer; 51 | 52 | public Summarizer(IStopWords stopWords, ITokenizer tokenizer) { 53 | this.stopWords = stopWords; 54 | this.tokenizer = tokenizer; 55 | } 56 | 57 | public String summarize(final String inputRaw, int sentenceCount, 58 | int maxFrequentWords, boolean shouldIgnoreSingleOccurences) 59 | throws Exception { 60 | 61 | // for short bursts just return the input itself 62 | if (inputRaw.length() < sentenceCount * Defaults.AVG_WORDS_PER_SENTENCE) { 63 | return inputRaw; 64 | } else { 65 | 66 | // check summary cache for input hit (this optimizes repeated summarize calls) 67 | byte[] inputHash = sha1(inputRaw, ":sentences=", Integer.toString(sentenceCount)); 68 | String cached = SummaryCache.instance().get(inputHash); 69 | if (cached != null) { 70 | return cached; 71 | } else { 72 | // change U.S. to US etc. 
73 | final String input = Words.dotCorrection(inputRaw); 74 | // get top 100 most frequent words that are not stop words 75 | Set frequentWords = Words.getMostFrequent(input, 76 | tokenizer, stopWords, maxFrequentWords, 77 | shouldIgnoreSingleOccurences?2:1); 78 | // now let's get the unique sentences 79 | Set sentences = Words.parseSentences(input, tokenizer, 80 | Defaults.MIN_WORDS_PER_SENTENCE); 81 | 82 | // hashmap to cache sentence indices 83 | final HashMap sentenceIndex = new HashMap(); 84 | // we'll sort the sentences based on their appearance in the 85 | // input 86 | // text 87 | Set outputSentences = new TreeSet( 88 | new Comparator() { 89 | public int compare(String sentence1, 90 | String sentence2) { 91 | int index1 = -1; 92 | int index2 = -1; 93 | // check cache for sentence 1 94 | Integer index1Obj = sentenceIndex 95 | .get(sentence1); 96 | if (index1Obj == null) { 97 | index1 = input.indexOf(sentence1); 98 | sentenceIndex.put(sentence1, new Integer( 99 | index1)); 100 | } else { 101 | index1 = index1Obj.intValue(); 102 | } 103 | // check cache for sentence 2 104 | Integer index2Obj = sentenceIndex 105 | .get(sentence2); 106 | if (index2Obj == null) { 107 | index2 = input.indexOf(sentence2); 108 | sentenceIndex.put(sentence2, new Integer( 109 | index2)); 110 | } else { 111 | index2 = index2Obj.intValue(); 112 | } 113 | 114 | return index1 - index2; 115 | } 116 | }); 117 | 118 | // now look through the sentences and build summary ( not 119 | // exceeding 120 | // sentenceCount sentences ) 121 | Iterator iter = sentences.iterator(); 122 | while (iter.hasNext() 123 | && outputSentences.size() < sentenceCount) { 124 | String actualSentence = iter.next(); 125 | String workingSentence = actualSentence.toLowerCase(); 126 | Iterator words = frequentWords.iterator(); 127 | while (words.hasNext()) { 128 | String word = words.next(); 129 | if (workingSentence.indexOf(word) >= 0) { 130 | outputSentences.add(actualSentence); 131 | } 132 | if 
(outputSentences.size() >= sentenceCount) { 133 | break; 134 | } 135 | } 136 | 137 | } 138 | // clear the sentence index cache 139 | sentenceIndex.clear(); 140 | // build the paragraph 141 | StringBuilder sb = new StringBuilder(); 142 | Iterator summarySentences = outputSentences.iterator(); 143 | while (summarySentences.hasNext()) { 144 | sb.append(summarySentences.next()).append("."); 145 | if (summarySentences.hasNext()) { 146 | sb.append(" "); 147 | } 148 | } 149 | // bob's your uncle :) 150 | String summary = sb.toString(); 151 | // update summary cache 152 | SummaryCache.instance().put(inputHash, summary); 153 | return summary; 154 | } 155 | } 156 | } 157 | 158 | public static final byte[] sha1(String ...inputs) throws Exception { 159 | MessageDigest md = MessageDigest.getInstance("SHA1"); 160 | for(String input : inputs) { 161 | md.update(input.getBytes()); 162 | } 163 | return md.digest(); 164 | } 165 | 166 | public String summarize(final String input, int sentenceCount) 167 | throws Exception { 168 | return summarize(input, sentenceCount, 169 | Defaults.MAX_MOST_FREQUENT_WORDS, 170 | Defaults.SHOULD_IGNORE_SINGLE_OCCURENCES); 171 | 172 | } 173 | 174 | public Set keywords(String input, int maxKeyWords) throws Exception { 175 | 176 | return Words.getMostFrequent(input, 177 | tokenizer, stopWords, maxKeyWords, 178 | 4); 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /src/main/webapp/xtractor.jsp: -------------------------------------------------------------------------------- 1 | 9 | 10 | <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%> 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 21 | 22 | "> 23 | 24 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 36 | [<c:out value="${extracted_title}" />] via XTractor 37 | 38 | 39 | 40 | 43 |
44 |
45 | 46 | 61 |
62 |
63 |
64 | "> 66 |
67 | 68 | 73 | 74 |
75 | Generated Summary(via TL;DRzr) 77 | from [">original article] | ">permalink | ">tweet it 80 | 81 | 82 |
83 | 84 |
85 |
86 |
87 | 89 |
90 | 91 |
92 |
93 |
94 | 95 | 100 |
101 | 102 |
103 |
105 | "> 108 |
109 |
110 | 130 |
131 |
132 |
133 | 134 | 140 | 143 | 147 | 148 | 149 | 165 | 166 | -------------------------------------------------------------------------------- /src/main/java/com/mohaps/fetch/Fetcher.java: --------------------------------------------------------------------------------
package com.mohaps.fetch;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.impl.nio.client.DefaultHttpAsyncClient;
import org.apache.http.nio.client.HttpAsyncClient;

import de.jetwick.snacktory.SHelper;

/**
 * Fetches web resources (GET) or just their headers (HEAD) through an Apache
 * asynchronous HTTP client, optionally bounded by a timeout.
 */
public class Fetcher {

    /** Chunk size used while draining a response body. */
    private static final int READ_BUFFER_SIZE = 8192;

    // volatile: shutdown() and isActive() read this field outside the lock
    private volatile HttpAsyncClient client;
    private boolean enforcePageOrFeed;

    /** Creates a fetcher that rejects anything that is not a page or a feed. */
    public Fetcher() throws Exception {
        this(true);
    }

    /**
     * @param enforcePageOrFeed when true, {@link #fetch(String, long)} throws
     *                          for content that is neither html/plain text nor xml
     * @throws Exception if the underlying client cannot be created or started
     */
    public Fetcher(boolean enforcePageOrFeed) throws Exception {
        this.enforcePageOrFeed = enforcePageOrFeed;
        client = new DefaultHttpAsyncClient();
        client.start();
    }

    /** Shuts the client down (best effort); the fetcher is inactive afterwards. */
    public void shutdown() {
        if (client != null) {
            synchronized (this) {
                if (client != null) {
                    try {
                        client.shutdown();
                    } catch (Exception ex) {
                        // Deliberately best-effort: there is nothing useful a
                        // caller could do. Keep the interrupt status, though.
                        if (ex instanceof InterruptedException) {
                            Thread.currentThread().interrupt();
                        }
                    } finally {
                        client = null;
                    }
                }
            }
        }
    }

    /** @return true until {@link #shutdown()} has been called */
    public boolean isActive() {
        return client != null;
    }

    /** Returns the live client or throws when the fetcher has been shut down. */
    private HttpAsyncClient requireActiveClient() throws FetchException {
        HttpAsyncClient active = client;
        if (active == null) {
            throw new FetchException("Fetcher is not active!");
        }
        return active;
    }

    /**
     * Unwraps Facebook and Google redirect URLs via {@code SHelper}.
     *
     * @param urlString URL as supplied by the caller
     * @return the unwrapped URL, or {@code urlString} when nothing was unwrapped
     */
    protected String cleanupUrl(String urlString) {
        String cleanUrl = SHelper.getUrlFromUglyFacebookRedirect(urlString);
        if (cleanUrl != null) {
            cleanUrl = SHelper.getUrlFromUglyGoogleRedirect(cleanUrl);
        } else {
            cleanUrl = SHelper.getUrlFromUglyGoogleRedirect(urlString);
        }
        return cleanUrl == null ? urlString : cleanUrl;
    }

    /** Fetches without a timeout; see {@link #fetch(String, long)}. */
    public FetchResult fetch(String urlString) throws Exception {
        return fetch(urlString, 0);
    }

    /**
     * Performs a GET request and returns status, metadata and body.
     *
     * @param urlString     URL to fetch (redirect wrappers are removed first)
     * @param timeoutMillis maximum wait in milliseconds; values <= 0 wait indefinitely
     * @return the fetch result
     * @throws Exception a {@link FetchException} when the fetcher is inactive,
     *                   the request cannot be started, the wait times out, no
     *                   response arrives, or (with enforcePageOrFeed) the
     *                   content type is unsupported; otherwise whatever the
     *                   client or the stream raises
     */
    public FetchResult fetch(String urlString, long timeoutMillis)
            throws Exception {
        HttpAsyncClient activeClient = requireActiveClient();
        String actualUrl = cleanupUrl(urlString);
        HttpGet get = new HttpGet(actualUrl);
        long startTime = System.currentTimeMillis();
        Future<HttpResponse> responseFuture = activeClient.execute(get, null);
        if (responseFuture == null) {
            throw new FetchException("Failed to execute get request for ["
                    + urlString + "]");
        }
        HttpResponse response = awaitResponse(responseFuture, timeoutMillis,
                "fetch for [" + urlString + "] timed out");

        long timeTakenMillis = System.currentTimeMillis() - startTime;

        if (response == null) {
            throw new FetchException("failed to get response while fetching [" + urlString + "]");
        }

        int httpStatusCode = response.getStatusLine().getStatusCode();
        long contentLength = getContentLength(response);
        String rawContentType = getContentType(response);
        String charset = getCharset(rawContentType);
        String contentType = mediaType(rawContentType);
        // media types are case-insensitive, so compare a normalized copy
        String normalizedType = contentType.toLowerCase(Locale.ROOT);
        boolean htmlPage = normalizedType.equals("text/html")
                || normalizedType.equals("text/plain");
        boolean feedPage = !htmlPage && normalizedType.endsWith("xml");
        if (this.enforcePageOrFeed && !htmlPage && !feedPage) {
            throw new FetchException("Fetched content from ["+urlString+"] has content type "+contentType+" which is neither an html or text page nor a valid xml feed");
        }

        FetchResult.ContentFlavor flavor = FetchResult.ContentFlavor.CUSTOM;
        if (htmlPage) {
            flavor = FetchResult.ContentFlavor.PAGE;
        } else if (feedPage) {
            flavor = FetchResult.ContentFlavor.FEED;
        }
        byte[] content = getContent(response);
        if (contentLength <= 0 && content != null) {
            contentLength = content.length;
        }

        return new FetchResult(urlString, actualUrl, httpStatusCode,
                contentLength, contentType, flavor, charset, timeTakenMillis,
                content);
    }

    /**
     * Performs a HEAD request.
     *
     * @param urlString     URL to probe (redirect wrappers are removed first)
     * @param timeoutMillis maximum wait in milliseconds; values <= 0 wait indefinitely
     * @return a success result carrying content type and length for status 200,
     *         a failed result carrying the status code otherwise
     * @throws Exception a {@link FetchException} when the fetcher is inactive,
     *                   the request cannot be started, the wait times out or
     *                   no response arrives
     */
    public HeadResult fetchHead(String urlString, long timeoutMillis) throws Exception {
        HttpAsyncClient activeClient = requireActiveClient();
        String actualUrl = cleanupUrl(urlString);
        HttpHead head = new HttpHead(actualUrl);
        Future<HttpResponse> responseFuture = activeClient.execute(head, null);
        if (responseFuture == null) {
            throw new FetchException("Failed to execute head request for ["
                    + urlString + "]");
        }
        HttpResponse response = awaitResponse(responseFuture, timeoutMillis,
                "fetch head for [" + urlString + "] timed out");

        if (response == null) {
            throw new FetchException("failed to get response while fetching head for [" + urlString + "]");
        }
        int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            long contentLength = getContentLength(response);
            String contentType = getContentType(response);
            return HeadResult.NewSuccess(urlString, httpStatusCode, contentType, contentLength);
        }
        return HeadResult.NewFailed(urlString, httpStatusCode);
    }

    /**
     * Waits for the response; with a positive timeout the request is cancelled
     * and a {@link FetchException} with {@code timeoutMessage} is thrown on expiry.
     */
    private HttpResponse awaitResponse(Future<HttpResponse> responseFuture,
            long timeoutMillis, String timeoutMessage) throws Exception {
        if (timeoutMillis <= 0) {
            return responseFuture.get();
        }
        try {
            return responseFuture.get(timeoutMillis, TimeUnit.MILLISECONDS);
        } catch (TimeoutException ex) {
            responseFuture.cancel(true);
            throw new FetchException(timeoutMessage, ex);
        }
    }

    /** Returns the part of a Content-Type value before the first ';', trimmed. */
    private static String mediaType(String contentType) {
        int semicolon = contentType.indexOf(';');
        String type = semicolon >= 0 ? contentType.substring(0, semicolon) : contentType;
        return type.trim();
    }

    /**
     * @return the entity's content length; when the response has no entity
     *         (typically the case for HEAD responses), the value of the
     *         Content-Length header; -1 when neither is available
     */
    protected long getContentLength(HttpResponse response) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            return entity.getContentLength();
        }
        Header header = response.getFirstHeader("Content-Length");
        if (header != null && header.getValue() != null) {
            try {
                return Long.parseLong(header.getValue().trim());
            } catch (NumberFormatException ex) {
                // unparsable header: report "unknown" below
            }
        }
        return -1;
    }

    /**
     * @return the Content-Type value taken from the entity or, failing that,
     *         from the response headers; {@code Constants.DEFAULT_CONTENT_TYPE}
     *         when none is present
     */
    protected String getContentType(HttpResponse response) {
        HttpEntity entity = response.getEntity();
        Header header = entity != null ? entity.getContentType() : null;
        if (header == null) {
            header = response.getFirstHeader("Content-Type");
        }
        if (header != null && header.getValue() != null) {
            return header.getValue();
        }
        return Constants.DEFAULT_CONTENT_TYPE;
    }

    /**
     * Extracts the charset parameter from a Content-Type value.
     *
     * @param contentType full header value, e.g. {@code text/html; charset=utf-8}
     * @return the charset in upper case, without surrounding quotes or
     *         whitespace; {@code Constants.DEFAULT_CHARSET} when the parameter
     *         is missing or empty
     */
    protected String getCharset(String contentType) {
        if (contentType != null && contentType.length() > 0) {
            String[] tokens = contentType.split(";");
            for (int i = 1; i < tokens.length; i++) {
                String[] nv = tokens[i].split("=", 2);
                // a bare "charset" without '=' has no value to return
                if (nv.length == 2 && nv[0].trim().equalsIgnoreCase("charset")) {
                    String value = nv[1].trim();
                    if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                        value = value.substring(1, value.length() - 1).trim();
                    }
                    if (value.length() > 0) {
                        return value.toUpperCase(Locale.ROOT);
                    }
                }
            }
        }
        return Constants.DEFAULT_CHARSET;
    }

    /**
     * Reads the complete response body.
     *
     * The stream is drained until end-of-stream. {@code InputStream.available()}
     * is not used for sizing because it only reports how many bytes can be read
     * without blocking, not the length of the body.
     *
     * @return the body bytes; an empty array when the response has no entity
     * @throws Exception if reading fails
     */
    protected byte[] getContent(HttpResponse response) throws Exception {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return new byte[0];
        }
        InputStream in = entity.getContent();
        if (in == null) {
            return new byte[0];
        }
        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buf = new byte[READ_BUFFER_SIZE];
            int bread;
            while ((bread = in.read(buf)) != -1) {
                out.write(buf, 0, bread);
            }
            return out.toByteArray();
        } finally {
            try {
                in.close();
            } catch (IOException ex) {
                // the body has been read (or reading already failed); a close
                // failure must not mask that outcome
            }
        }
    }

}
-------------------------------------------------------------------------------- /src/main/webapp/index.html: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | XTractor - an algorithmic text extraction from webpages
30 | demo 31 | 32 | 33 | 36 |
37 |
38 | 39 | 55 |
56 |

Try it out!

57 |
58 |
60 | 62 | 63 |
64 |
65 |
66 | 67 |
68 | 69 |

What is XTractor?

70 |
71 |

72 | XTractor is an algorithmic text extractor from web pages written in 73 | Java. It builds upon the "commonly used web design practices" 74 | approach (from readability.js, 76 | goose and snacktory) to 78 | create a set of heuristics for fast article text extraction. It 79 | adds several features like 80 |

    81 |
  • paragraph preservation,
  • 82 |
  • better image detection heuristics,
  • 83 |
  • sibling score based enhancements to article detection
  • 84 |
85 | 86 | If you find a link which misbehaves with this, please do let me know at mohaps AT gmail DOT com or tweet it to me at @mohaps. 87 |

88 |

89 | XTractor is built using JSoup and Apache 91 | HTTP Async Client 92 |

93 | . 94 |

95 |
96 |

Live Samples

97 |
98 |

Here are some sample links of extracted text from around the 99 | web: 100 |

112 |

113 | 114 |

How does XTractor work?

115 |
116 |

Similar to the readability approach, XTractor uses heuristics 117 | based on common web design practices to extract text. It walks 118 | through the DOM of the HTML removing nodes that are unlikely to be 119 | the main article text (and fluff like ads, banners, shout outs etc.). It then 120 | assigns a score to the container elements to find the best possible 121 | match for article text. It also extracts one significant image 122 | representing that page. The resultant output is sanitized and 123 | tokenized into paragraphs and is returned to the caller.

124 | 125 |

Who built this?

126 |

127 | XTractor is a weekend 128 | project/quick hack demo created by Saurav Mohapatra. If you like 130 | this, you might also like my other hack/weekend project TL;DRzr - an 132 | algorithmic summary generator. 133 |

134 |

135 | I wrote this to improve the article detection of my algorithmic summarizer demo. 137 |

138 |
139 | 140 | 149 |
150 |
151 | 152 | 156 | 157 | 158 | 174 | 175 | -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/ext/norwegianStemmer.java: --------------------------------------------------------------------------------
// This file was generated automatically by the Snowball to Java compiler

package org.tartarus.snowball.ext;

import org.tartarus.snowball.Among;

/**
 * This class was automatically generated by a Snowball to Java compiler It
 * implements the stemming algorithm defined by a snowball script.
 *
 * Generated code: the "line N" comments refer to lines of the Snowball script,
 * not of this file. The statement order (cursor/limit save and restore) is
 * significant; do not hand-edit.
 */

public class norwegianStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // instance handed to every Among entry below
    private final static norwegianStemmer methodObject = new norwegianStemmer();

    // Suffix table searched backwards by r_main_suffix. The third constructor
    // argument is the value r_main_suffix switches on (1 = delete,
    // 2 = conditional delete of "s", 3 = replace by "er").
    // NOTE(review): the second argument looks like an index of a shorter entry
    // in this table - confirm against Among.
    private final static Among a_0[] = {
            new Among("a", -1, 1, "", methodObject),
            new Among("e", -1, 1, "", methodObject),
            new Among("ede", 1, 1, "", methodObject),
            new Among("ande", 1, 1, "", methodObject),
            new Among("ende", 1, 1, "", methodObject),
            new Among("ane", 1, 1, "", methodObject),
            new Among("ene", 1, 1, "", methodObject),
            new Among("hetene", 6, 1, "", methodObject),
            new Among("erte", 1, 3, "", methodObject),
            new Among("en", -1, 1, "", methodObject),
            new Among("heten", 9, 1, "", methodObject),
            new Among("ar", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("heter", 12, 1, "", methodObject),
            new Among("s", -1, 2, "", methodObject),
            new Among("as", 14, 1, "", methodObject),
            new Among("es", 14, 1, "", methodObject),
            new Among("edes", 16, 1, "", methodObject),
            new Among("endes", 16, 1, "", methodObject),
            new Among("enes", 16, 1, "", methodObject),
            new Among("hetenes", 19, 1, "", methodObject),
            new Among("ens", 14, 1, "", methodObject),
            new Among("hetens", 21, 1, "", methodObject),
            new Among("ers", 14, 1, "", methodObject),
            new Among("ets", 14, 1, "", methodObject),
            new Among("et", -1, 1, "", methodObject),
            new Among("het", 25, 1, "", methodObject),
            new Among("ert", -1, 3, "", methodObject),
            new Among("ast", -1, 1, "", methodObject) };

    // Endings tested by r_consonant_pair.
    private final static Among a_1[] = {
            new Among("dt", -1, -1, "", methodObject),
            new Among("vt", -1, -1, "", methodObject) };

    // Suffix table searched backwards by r_other_suffix (all entries: delete).
    private final static Among a_2[] = {
            new Among("leg", -1, 1, "", methodObject),
            new Among("eleg", 0, 1, "", methodObject),
            new Among("ig", -1, 1, "", methodObject),
            new Among("eig", 2, 1, "", methodObject),
            new Among("lig", 2, 1, "", methodObject),
            new Among("elig", 4, 1, "", methodObject),
            new Among("els", -1, 1, "", methodObject),
            new Among("lov", -1, 1, "", methodObject),
            new Among("elov", 7, 1, "", methodObject),
            new Among("slov", 7, 1, "", methodObject),
            new Among("hetslov", 9, 1, "", methodObject) };

    // Character grouping used with the code range 97..248 in r_mark_regions
    // and r_main_suffix (presumably the vowel set - confirm against the
    // Snowball script).
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 48, 0, 128 };

    // Character grouping used with the code range 98..122 when deciding
    // whether a final "s" may be removed.
    private static final char g_s_ending[] = { 119, 125, 149, 1 };

    // mark set by r_mark_regions three characters past the start position
    private int I_x;
    // mark set by r_mark_regions; the suffix routines use it as their backward limit
    private int I_p1;

    /** Copies the marks of {@code other} and then the inherited state. */
    private void copy_from(norwegianStemmer other) {
        I_x = other.I_x;
        I_p1 = other.I_p1;
        super.copy_from(other);
    }

    /**
     * Computes the marks I_x and I_p1.
     *
     * I_x is the position three characters after the cursor. I_p1 is the
     * position reached by scanning forward to the first character inside g_v
     * and then past the first character outside g_v, raised to I_x when it
     * would be smaller.
     *
     * @return false when fewer than three characters are available or a scan
     *         reaches the limit first (I_p1 is then left at limit); true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        int v_2;
        // (, line 26
        I_p1 = limit;
        // test, line 30
        v_1 = cursor;
        // (, line 30
        // hop, line 30
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 30
        I_x = cursor;
        cursor = v_1;
        // goto, line 31
        golab0: while (true) {
            v_2 = cursor;
            lab1: do {
                if (!(in_grouping(g_v, 97, 248))) {
                    break lab1;
                }
                cursor = v_2;
                break golab0;
            } while (false);
            cursor = v_2;
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 31
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 248))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 31
        I_p1 = cursor;
        // try, line 32
        lab4: do {
            // (, line 32
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        return true;
    }

    /**
     * Handles the a_0 suffixes, searching no further left than I_p1.
     *
     * Result 1 deletes the suffix. Result 2 (suffix "s") deletes it only when
     * the preceding character is in g_s_ending, or is "k" preceded by a
     * character outside g_v. Result 3 replaces the suffix with "er".
     *
     * @return false when the cursor is left of I_p1, no suffix matches, or the
     *         condition for result 2 fails; true after a change
     */
    private boolean r_main_suffix() {
        int among_var;
        int v_1;
        int v_2;
        int v_3;
        // (, line 37
        // setlimit, line 38
        v_1 = limit - cursor;
        // tomark, line 38
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 38
        // [, line 38
        ket = cursor;
        // substring, line 38
        among_var = find_among_b(a_0, 29);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 38
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 44
            // delete, line 44
            slice_del();
            break;
        case 2:
            // (, line 46
            // or, line 46
            lab0: do {
                v_3 = limit - cursor;
                lab1: do {
                    if (!(in_grouping_b(g_s_ending, 98, 122))) {
                        break lab1;
                    }
                    break lab0;
                } while (false);
                cursor = limit - v_3;
                // (, line 46
                // literal, line 46
                if (!(eq_s_b(1, "k"))) {
                    return false;
                }
                if (!(out_grouping_b(g_v, 97, 248))) {
                    return false;
                }
            } while (false);
            // delete, line 46
            slice_del();
            break;
        case 3:
            // (, line 48
            // <-, line 48
            slice_from("er");
            break;
        }
        return true;
    }

    /**
     * When the text ends in one of the a_1 pairs ("dt", "vt") within the
     * region right of I_p1, deletes the final character.
     *
     * @return false when the cursor is left of I_p1, no pair matches, or the
     *         cursor is already at the backward limit; true after the deletion
     */
    private boolean r_consonant_pair() {
        int v_1;
        int v_2;
        int v_3;
        // (, line 52
        // test, line 53
        v_1 = limit - cursor;
        // (, line 53
        // setlimit, line 54
        v_2 = limit - cursor;
        // tomark, line 54
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_3 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_2;
        // (, line 54
        // [, line 54
        ket = cursor;
        // substring, line 54
        if (find_among_b(a_1, 2) == 0) {
            limit_backward = v_3;
            return false;
        }
        // ], line 54
        bra = cursor;
        limit_backward = v_3;
        cursor = limit - v_1;
        // next, line 59
        if (cursor <= limit_backward) {
            return false;
        }
        cursor--;
        // ], line 59
        bra = cursor;
        // delete, line 59
        slice_del();
        return true;
    }

    /**
     * Deletes an a_2 suffix found within the region right of I_p1.
     *
     * @return false when the cursor is left of I_p1 or no suffix matches;
     *         true after the deletion
     */
    private boolean r_other_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // (, line 62
        // setlimit, line 63
        v_1 = limit - cursor;
        // tomark, line 63
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 63
        // [, line 63
        ket = cursor;
        // substring, line 63
        among_var = find_among_b(a_2, 11);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 63
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 67
            // delete, line 67
            slice_del();
            break;
        }
        return true;
    }

    /**
     * Stems the current word in place.
     *
     * Runs r_mark_regions from the start, then switches to backward processing
     * (cursor at limit) and applies r_main_suffix, r_consonant_pair and
     * r_other_suffix in that order. The cursor is restored after every step,
     * whether or not the step succeeded.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 72
        // do, line 74
        v_1 = cursor;
        lab0: do {
            // call mark_regions, line 74
            if (!r_mark_regions()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // backwards, line 75
        limit_backward = cursor;
        cursor = limit;
        // (, line 75
        // do, line 76
        v_2 = limit - cursor;
        lab1: do {
            // call main_suffix, line 76
            if (!r_main_suffix()) {
                break lab1;
            }
        } while (false);
        cursor = limit - v_2;
        // do, line 77
        v_3 = limit - cursor;
        lab2: do {
            // call consonant_pair, line 77
            if (!r_consonant_pair()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 78
        v_4 = limit - cursor;
        lab3: do {
            // call other_suffix, line 78
            if (!r_other_suffix()) {
                break lab3;
            }
        } while (false);
        cursor = limit - v_4;
        cursor = limit_backward;
        return true;
    }

    /** Any two instances of this class are considered equal. */
    public boolean equals(Object o) {
        return o instanceof norwegianStemmer;
    }

    /** Hash of the class name, consistent with {@link #equals(Object)}. */
    public int hashCode() {
        return norwegianStemmer.class.getName().hashCode();
    }

}
-------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/ext/swedishStemmer.java: -------------------------------------------------------------------------------- 1 | // This file was generated automatically by the Snowball to Java compiler 2 | 3 | package org.tartarus.snowball.ext; 4 | 5 | import org.tartarus.snowball.Among; 6 | 7 | /** 8 | * This class was automatically generated by a Snowball to Java compiler It 9 | * implements the stemming algorithm defined by a snowball script.
10 | */

// Generated code: the "line N" comments below refer to lines of the Snowball
// script, not of this file. The statement order (cursor/limit save and
// restore) is significant; do not hand-edit.
public class swedishStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // instance handed to every Among entry below
    private final static swedishStemmer methodObject = new swedishStemmer();

    // Suffix table searched backwards by r_main_suffix. The third constructor
    // argument is the value r_main_suffix switches on (1 = delete,
    // 2 = conditional delete of "s").
    // NOTE(review): the second argument looks like an index of a shorter entry
    // in this table - confirm against Among.
    private final static Among a_0[] = {
            new Among("a", -1, 1, "", methodObject),
            new Among("arna", 0, 1, "", methodObject),
            new Among("erna", 0, 1, "", methodObject),
            new Among("heterna", 2, 1, "", methodObject),
            new Among("orna", 0, 1, "", methodObject),
            new Among("ad", -1, 1, "", methodObject),
            new Among("e", -1, 1, "", methodObject),
            new Among("ade", 6, 1, "", methodObject),
            new Among("ande", 6, 1, "", methodObject),
            new Among("arne", 6, 1, "", methodObject),
            new Among("are", 6, 1, "", methodObject),
            new Among("aste", 6, 1, "", methodObject),
            new Among("en", -1, 1, "", methodObject),
            new Among("anden", 12, 1, "", methodObject),
            new Among("aren", 12, 1, "", methodObject),
            new Among("heten", 12, 1, "", methodObject),
            new Among("ern", -1, 1, "", methodObject),
            new Among("ar", -1, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("heter", 18, 1, "", methodObject),
            new Among("or", -1, 1, "", methodObject),
            new Among("s", -1, 2, "", methodObject),
            new Among("as", 21, 1, "", methodObject),
            new Among("arnas", 22, 1, "", methodObject),
            new Among("ernas", 22, 1, "", methodObject),
            new Among("ornas", 22, 1, "", methodObject),
            new Among("es", 21, 1, "", methodObject),
            new Among("ades", 26, 1, "", methodObject),
            new Among("andes", 26, 1, "", methodObject),
            new Among("ens", 21, 1, "", methodObject),
            new Among("arens", 29, 1, "", methodObject),
            new Among("hetens", 29, 1, "", methodObject),
            new Among("erns", 21, 1, "", methodObject),
            new Among("at", -1, 1, "", methodObject),
            new Among("andet", -1, 1, "", methodObject),
            new Among("het", -1, 1, "", methodObject),
            new Among("ast", -1, 1, "", methodObject) };

    // Endings tested by r_consonant_pair.
    private final static Among a_1[] = {
            new Among("dd", -1, -1, "", methodObject),
            new Among("gd", -1, -1, "", methodObject),
            new Among("nn", -1, -1, "", methodObject),
            new Among("dt", -1, -1, "", methodObject),
            new Among("gt", -1, -1, "", methodObject),
            new Among("kt", -1, -1, "", methodObject),
            new Among("tt", -1, -1, "", methodObject) };

    // Suffix table searched backwards by r_other_suffix (1 = delete,
    // 2 = replace by "l\u00F6s", 3 = replace by "full").
    private final static Among a_2[] = {
            new Among("ig", -1, 1, "", methodObject),
            new Among("lig", 0, 1, "", methodObject),
            new Among("els", -1, 1, "", methodObject),
            new Among("fullt", -1, 3, "", methodObject),
            new Among("l\u00F6st", -1, 2, "", methodObject) };

    // Character grouping used with the code range 97..246 in r_mark_regions
    // (presumably the vowel set - confirm against the Snowball script).
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 24, 0, 32 };

    // Character grouping used with the code range 98..121 when deciding
    // whether a final "s" may be removed.
    private static final char g_s_ending[] = { 119, 127, 149 };

    // mark set by r_mark_regions three characters past the start position
    private int I_x;
    // mark set by r_mark_regions; the suffix routines use it as their backward limit
    private int I_p1;

    /** Copies the marks of {@code other} and then the inherited state. */
    private void copy_from(swedishStemmer other) {
        I_x = other.I_x;
        I_p1 = other.I_p1;
        super.copy_from(other);
    }

    /**
     * Computes the marks I_x and I_p1.
     *
     * I_x is the position three characters after the cursor. I_p1 is the
     * position reached by scanning forward to the first character inside g_v
     * and then past the first character outside g_v, raised to I_x when it
     * would be smaller.
     *
     * @return false when fewer than three characters are available or a scan
     *         reaches the limit first (I_p1 is then left at limit); true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        int v_2;
        // (, line 26
        I_p1 = limit;
        // test, line 29
        v_1 = cursor;
        // (, line 29
        // hop, line 29
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 29
        I_x = cursor;
        cursor = v_1;
        // goto, line 30
        golab0: while (true) {
            v_2 = cursor;
            lab1: do {
                if (!(in_grouping(g_v, 97, 246))) {
                    break lab1;
                }
                cursor = v_2;
                break golab0;
            } while (false);
            cursor = v_2;
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 30
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 246))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 30
        I_p1 = cursor;
        // try, line 31
        lab4: do {
            // (, line 31
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        return true;
    }

    /**
     * Handles the a_0 suffixes, searching no further left than I_p1.
     *
     * Result 1 deletes the suffix. Result 2 (suffix "s") deletes it only when
     * the preceding character is in g_s_ending.
     *
     * @return false when the cursor is left of I_p1, no suffix matches, or the
     *         condition for result 2 fails; true after a deletion
     */
    private boolean r_main_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // (, line 36
        // setlimit, line 37
        v_1 = limit - cursor;
        // tomark, line 37
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 37
        // [, line 37
        ket = cursor;
        // substring, line 37
        among_var = find_among_b(a_0, 37);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 37
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            return false;
        case 1:
            // (, line 44
            // delete, line 44
            slice_del();
            break;
        case 2:
            // (, line 46
            if (!(in_grouping_b(g_s_ending, 98, 121))) {
                return false;
            }
            // delete, line 46
            slice_del();
            break;
        }
        return true;
    }

    /**
     * When the text ends in one of the a_1 pairs within the region right of
     * I_p1, deletes the final character.
     *
     * @return false when the cursor is left of I_p1, no pair matches, or the
     *         cursor is already at the backward limit; true after the deletion
     */
    private boolean r_consonant_pair() {
        int v_1;
        int v_2;
        int v_3;
        // setlimit, line 50
        v_1 = limit - cursor;
        // tomark, line 50
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 50
        // and, line 52
        v_3 = limit - cursor;
        // among, line 51
        if (find_among_b(a_1, 7) == 0) {
            limit_backward = v_2;
            return false;
        }
        cursor = limit - v_3;
        // (, line 52
        // [, line 52
        ket = cursor;
        // next, line 52
        if (cursor <= limit_backward) {
            limit_backward = v_2;
            return false;
        }
        cursor--;
        // ], line 52
        bra = cursor;
        // delete, line 52
        slice_del();
        limit_backward = v_2;
        return true;
    }

    /**
     * Handles the a_2 suffixes within the region right of I_p1: result 1
     * deletes the suffix, result 2 replaces it with "l\u00F6s", result 3
     * replaces it with "full".
     *
     * @return false when the cursor is left of I_p1 or no suffix matches;
     *         true after a change
     */
    private boolean r_other_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // setlimit, line 55
        v_1 = limit - cursor;
        // tomark, line 55
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 55
        // [, line 56
        ket = cursor;
        // substring, line 56
        among_var = find_among_b(a_2, 5);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 56
        bra = cursor;
        switch (among_var) {
        case 0:
            limit_backward = v_2;
            return false;
        case 1:
            // (, line 57
            // delete, line 57
            slice_del();
            break;
        case 2:
            // (, line 58
            // <-, line 58
            slice_from("l\u00F6s");
            break;
        case 3:
            // (, line 59
            // <-, line 59
            slice_from("full");
            break;
        }
        limit_backward = v_2;
        return true;
    }

    /**
     * Stems the current word in place.
     *
     * Runs r_mark_regions from the start, then switches to backward processing
     * (cursor at limit) and applies r_main_suffix, r_consonant_pair and
     * r_other_suffix in that order. The cursor is restored after every step,
     * whether or not the step succeeded.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 64
        // do, line 66
        v_1 = cursor;
        lab0: do {
            // call mark_regions, line 66
            if (!r_mark_regions()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // backwards, line 67
        limit_backward = cursor;
        cursor = limit;
        // (, line 67
        // do, line 68
        v_2 = limit - cursor;
        lab1: do {
            // call main_suffix, line 68
            if (!r_main_suffix()) {
                break lab1;
            }
        } while (false);
        cursor = limit - v_2;
        // do, line 69
        v_3 = limit - cursor;
        lab2: do {
            // call consonant_pair, line 69
            if (!r_consonant_pair()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 70
        v_4 = limit - cursor;
        lab3: do {
            // call other_suffix, line 70
            if (!r_other_suffix()) {
                break lab3;
            }
        } while (false);
        cursor = limit - v_4;
        cursor = limit_backward;
        return true;
    }

    /** Any two instances of this class are considered equal. */
    public boolean equals(Object o) {
        return o instanceof swedishStemmer;
    }

    /** Hash of the class name, consistent with {@link #equals(Object)}. */
    public int hashCode() {
        return swedishStemmer.class.getName().hashCode();
    }

}
-------------------------------------------------------------------------------- /src/main/java/com/mohaps/tldr/utils/Words.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 3 | * TL;DRzr - A simple algorithmic summarizer 4 | * Website: http://tldrzr.com 5 | * Author: Saurav Mohapatra (mohaps@gmail.com) 6 | * 7 | * Copyright (c) 2013, Saurav Mohapatra 8 | * All rights reserved. 9 | * 10 | * 11 | * 12 | * Redistribution and use in source and binary forms, with or without modification, are permitted 13 | * provided that the following conditions are met: 14 | * 15 | * a) Redistributions of source code must retain the above copyright notice, 16 | * this list of conditions and the following disclaimer. 17 | * 18 | * b) Redistributions in binary form must reproduce the above copyright notice, 19 | * this list of conditions and the following disclaimer in the documentation 20 | * and/or other materials provided with the distribution. 21 | * 22 | * c) Neither the name of TL;DRzr nor the names of its contributors may be used 23 | * to endorse or promote products derived from this software without specific 24 | * prior written permission.
25 | * 26 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 27 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 28 | * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | package com.mohaps.tldr.utils; 34 | 35 | import java.util.ArrayList; 36 | import java.util.Collections; 37 | import java.util.Comparator; 38 | import java.util.HashMap; 39 | import java.util.HashSet; 40 | import java.util.Iterator; 41 | import java.util.Set; 42 | 43 | import opennlp.tools.sentdetect.SentenceDetectorME; 44 | import opennlp.tools.sentdetect.SentenceModel; 45 | 46 | import com.mohaps.tldr.summarize.Defaults; 47 | import com.mohaps.tldr.summarize.IStopWords; 48 | import com.mohaps.tldr.summarize.ITokenizer; 49 | 50 | import java.io.*; 51 | 52 | import org.tartarus.snowball.SnowballStemmer; 53 | import org.tartarus.snowball.ext.englishStemmer; 54 | /** 55 | * Utility methods to use for word operations 56 | * Will try to use OpenNLP by default, failing that will fall back to regex based manipulation/extraction 57 | * @author mohaps 58 | * 59 | */ 60 | public final class Words { 61 | private static SentenceModel SENTENCE_MODEL; 62 | static { 63 | try { 64 | InputStream inputFile = Words.class.getClassLoader() 65 | .getResourceAsStream("en-sent.bin"); 66 | if (inputFile != null) { 67 | try { 68 | SENTENCE_MODEL = new SentenceModel(inputFile); 69 | 
System.out.println(">> OpenNLP Sentence Model loaded!"); 70 | } finally { 71 | if (inputFile != null) { 72 | try { 73 | inputFile.close(); 74 | } catch (Throwable t) { 75 | } 76 | } 77 | } 78 | } 79 | } catch (IOException ex) { 80 | System.err 81 | .println("Failed to load sentence model for OpenNLP. error = " 82 | + ex.getLocalizedMessage() 83 | + ". Will fall back to regex based sentence parsing"); 84 | ex.printStackTrace(); 85 | } 86 | } 87 | 88 | private static class Word { 89 | private String word; 90 | private int frequency; 91 | 92 | public Word(String word) { 93 | this.word = word.toLowerCase(); 94 | this.frequency = 1; 95 | } 96 | 97 | public String getWord() { 98 | return word; 99 | } 100 | 101 | public int getFrequency() { 102 | return frequency; 103 | } 104 | 105 | public int increment() { 106 | return ++frequency; 107 | } 108 | 109 | public int hashCode() { 110 | return word.hashCode(); 111 | } 112 | 113 | public String toString() { 114 | return new StringBuilder(word).append("(").append(frequency) 115 | .append(")").toString(); 116 | } 117 | } 118 | public static final Set getMostFrequent(String input, 119 | ITokenizer tokenizer, IStopWords stopWords, int maxCount, 120 | int minimumOccurences) throws Exception { 121 | 122 | HashMap words = new HashMap(); 123 | ArrayList wordList = new ArrayList(); 124 | String[] wordTokens = tokenizer.tokenize(input); 125 | SnowballStemmer stemmer = new englishStemmer(); 126 | for (int i = 0; i < wordTokens.length; i++) { 127 | if(isWord(wordTokens[i]) && wordTokens[i].length() > 4) { 128 | stemmer.setCurrent(wordTokens[i]); 129 | stemmer.stem(); 130 | String wordToken = stemmer.getCurrent(); 131 | if (isWord(wordToken) && !stopWords.isStopWord(wordToken) && wordToken.length() > 4) { 132 | Word w = words.get(wordToken); 133 | if (w != null) { 134 | w.increment(); 135 | } else { 136 | w = new Word(wordToken); 137 | words.put(wordToken, w); 138 | wordList.add(w); 139 | } 140 | } 141 | } 142 | } 143 | 
Collections.sort(wordList, new Comparator() { 144 | 145 | public int compare(Word w1, Word w2) { 146 | if (w1.getFrequency() > w2.getFrequency()) { 147 | return -1; 148 | } else if (w1.getFrequency() < w2.getFrequency()) { 149 | return 1; 150 | } else { 151 | String s1 = w1.getWord(); 152 | String s2 = w2.getWord(); 153 | 154 | for (int i = 0; i < s1.length() && i < s2.length(); i++) { 155 | if (s1.charAt(i) > s2.charAt(i)) { 156 | return -1; 157 | } else if (s1.charAt(i) < s2.charAt(i)) { 158 | return 1; 159 | } 160 | } 161 | 162 | if (s1.length() > s2.length()) { 163 | return -1; 164 | } else if (s1.length() < s2.length()) { 165 | return 1; 166 | } else { 167 | return 0; 168 | } 169 | } 170 | 171 | } 172 | 173 | }); 174 | HashSet ret = new HashSet(); 175 | Iterator iter = wordList.iterator(); 176 | while (iter.hasNext() && ret.size() <= maxCount) { 177 | Word w = iter.next(); 178 | if(w.getFrequency() >= minimumOccurences) { 179 | ret.add(w.getWord()); 180 | } 181 | } 182 | return ret; 183 | } 184 | 185 | public static final boolean isWord(String word) { 186 | return (word != null && word.trim().length() > 0); 187 | } 188 | 189 | public static Set parseSentences(String input, 190 | ITokenizer tokenizer, int minimumWordsInASentence) throws Exception { 191 | if (SENTENCE_MODEL != null) { 192 | return parseSentencesNLP(input, tokenizer, minimumWordsInASentence); 193 | } else { 194 | return parseSentencesRegEx(input, tokenizer, 195 | minimumWordsInASentence); 196 | } 197 | } 198 | 199 | public static Set parseSentencesNLP(String input, 200 | ITokenizer tokenizer, int minimumWordsInASentence) throws Exception { 201 | SentenceDetectorME sentenceDetector = new SentenceDetectorME( 202 | SENTENCE_MODEL); 203 | String[] rawSentences = sentenceDetector.sentDetect(input); 204 | HashSet sentences = new HashSet(); 205 | for (int i = 0; i < rawSentences.length; i++) { 206 | String rawSentence = rawSentences[i]; 207 | String[] words = tokenizer.tokenize(rawSentence); 208 | if 
(words.length >= minimumWordsInASentence) { 209 | sentences.add(rawSentence); 210 | } 211 | } 212 | return sentences; 213 | } 214 | 215 | public static Set parseSentencesRegEx(String input, 216 | ITokenizer tokenizer, int minimumWordsInASentence) throws Exception { 217 | String[] rawSentences = input.split(Defaults.REGEX_SENTENCES); 218 | HashSet sentences = new HashSet(); 219 | for (int i = 0; i < rawSentences.length; i++) { 220 | String rawSentence = rawSentences[i]; 221 | String[] words = tokenizer.tokenize(rawSentence); 222 | if (words.length >= minimumWordsInASentence) { 223 | sentences.add(rawSentence); 224 | } 225 | } 226 | return sentences; 227 | 228 | } 229 | 230 | public static final String replaceSmartQuotes(String s) { 231 | return s.replace('\u2018', '\'') 232 | .replace('\u2019', '\'') 233 | .replace('\u201c', '\"') 234 | .replace('\u201b', '\'') 235 | .replace('\u201d', '\"') 236 | .replace('\u2026', '-') 237 | .replace('\u2013', '-') 238 | .replace('\u2014', '-') 239 | .replaceAll("–", "-") 240 | .replaceAll("“", "\"") 241 | .replaceAll("”", "\"") 242 | .replaceAll("‘", "\'") 243 | .replaceAll("’", "\'") 244 | .replaceAll("‛", "\'") 245 | .replaceAll("'", "\'") 246 | .replaceAll("…", "...") 247 | .replaceAll("—", "-"); 248 | } 249 | 250 | public static void main(String[] args) { 251 | String s = "-than estimated by Umeng-"; 252 | for(int i = 0; i < s.length(); i++){ 253 | System.out.println(">> Char Code "+(short)s.charAt(i)+" (0x"+Integer.toHexString((short)s.charAt(i))+") - {"+s.charAt(i)+"}"); 254 | } 255 | } 256 | 257 | //TODO: ugly hack. 
find something more efficient and elegant to replace well-known contractions with longer synonyms 258 | public static String dotCorrection(String inputRaw) { 259 | return inputRaw.replace("U.S.", "US").replace("U.K.", "UK").replace("Mass.", "Massachusetts").replace("Mr.", "Mr"); 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /src/main/webapp/css/main.css: -------------------------------------------------------------------------------- 1 | /* 2 | * HTML5 Boilerplate 3 | * 4 | * What follows is the result of much research on cross-browser styling. 5 | * Credit left inline and big thanks to Nicolas Gallagher, Jonathan Neal, 6 | * Kroc Camen, and the H5BP dev community and team. 7 | */ 8 | 9 | /* ========================================================================== 10 | Base styles: opinionated defaults 11 | ========================================================================== */ 12 | html,button,input,select,textarea { 13 | color: #222; 14 | } 15 | 16 | body { 17 | font-size: 1em; 18 | line-height: 1.4; 19 | } 20 | 21 | /* 22 | * Remove text-shadow in selection highlight: h5bp.com/i 23 | * These selection rule sets have to be separate. 24 | * Customize the background color to match your design. 25 | */ 26 | ::-moz-selection { 27 | background: #b3d4fc; 28 | text-shadow: none; 29 | } 30 | 31 | ::selection { 32 | background: #b3d4fc; 33 | text-shadow: none; 34 | } 35 | 36 | /* 37 | * A better looking default horizontal rule 38 | */ 39 | hr { 40 | display: block; 41 | height: 1px; 42 | border: 0; 43 | border-top: 1px solid #ccc; 44 | margin: 1em 0; 45 | padding: 0; 46 | } 47 | 48 | /* 49 | * Remove the gap between images and the bottom of their containers: h5bp.com/i/440 50 | */ 51 | img { 52 | vertical-align: middle; 53 | } 54 | 55 | /* 56 | * Remove default fieldset styles. 
/*
 * Submit button: red pill-shaped button with light bold text.
 * `padding-bottom` was previously misspelled (`padding-bottm`), so that
 * declaration was ignored by browsers.
 */
input[type=submit] {
    font-size: 1.2em;
    height: 2.5em;
    padding-top: 2px;
    padding-bottom: 2px;
    padding-left: 8px;
    padding-right: 8px;
    -webkit-border-radius: 55px;
    -moz-border-radius: 55px;
    border-radius: 55px;
    background-color: rgba(220,10, 10, 1);
    font-weight: bolder;
    color: rgba(250, 250,250, 1.0);
}
font-size: 0.7em; 136 | } 137 | 138 | .content_section { 139 | margin-top: 6px; 140 | margin-bottom: 6px; 141 | margin-left: 2px; 142 | margin-right: 2px; 143 | padding: 5px; 144 | } 145 | 146 | .textWrapper { 147 | border: 0px; 148 | margin: 5px 0; 149 | padding: 3px; 150 | } 151 | 152 | a:link { 153 | text-decoration: none; 154 | color: #832323; 155 | } 156 | 157 | a:visited { 158 | text-decoration: none; 159 | color: #832323; 160 | } 161 | 162 | a:hover { 163 | text-decoration: underline; 164 | color: #832323; 165 | } 166 | 167 | a:active { 168 | text-decoration: underline; 169 | color: #832323; 170 | } 171 | 172 | .extracted_title { 173 | font-size: 1.8em; 174 | font-family: Open Sans, sans-serif; 175 | font-weight: 900; 176 | } 177 | .extracted_summary{ 178 | border-bottom: 1px dashed rgba(3,3,3, 0.5); 179 | border-top: 1px dashed rgba(3,3,3, 0.5); 180 | padding-top: 1.3em; 181 | margin-bottom: 1.2em; 182 | } 183 | .extracted_text { 184 | font-size: 1.1em; 185 | font-family: Open Sans, sans-serif; 186 | font-weight: 400; 187 | padding-left: 0.02em; 188 | padding-right: 0.02em; 189 | } 190 | .extracted_paragraph { 191 | padding-top: 1.5em; 192 | } 193 | .extracted_footer { 194 | margin-top: 1.2em; 195 | border-top: 1px dashed #020202; 196 | border-bottom: 1px dashed #020202; 197 | padding-top: 0.2em; 198 | padding-bottom: 0.2em; 199 | padding-left: 0.02em; 200 | padding-right: 0.02em; 201 | font-size: 0.8em; 202 | font-weight: bold; 203 | font-style: italic; 204 | color: #4f4f4f; 205 | } 206 | .extracted_image { 207 | padding: 0.4em; 208 | max-width: 100%; 209 | } 210 | 211 | .extracted_image_container { 212 | padding: 0.4em; 213 | max-width: 100%; 214 | } 215 | 216 | .extracted_outer { 217 | max-width: 800px; 218 | padding: 0.2em; 219 | } 220 | 221 | .extracted_keywords { 222 | padding-top: 2px; 223 | padding-bottom: 2px; 224 | padding-left: 1px; 225 | padding-right: 1px; 226 | font-family: Open Sans, sans-serif; 227 | font-weight: 400; 228 | font-size: 0.8em; 
229 | } 230 | 231 | /* ========================================================================== 232 | Helper classes 233 | ========================================================================== */ 234 | 235 | /* 236 | * Image replacement 237 | */ 238 | .ir { 239 | background-color: transparent; 240 | border: 0; 241 | overflow: hidden; 242 | /* IE 6/7 fallback */ 243 | *text-indent: -9999px; 244 | } 245 | 246 | .ir:before { 247 | content: ""; 248 | display: block; 249 | width: 0; 250 | height: 150%; 251 | } 252 | 253 | /* 254 | * Hide from both screenreaders and browsers: h5bp.com/u 255 | */ 256 | .hidden { 257 | display: none !important; 258 | visibility: hidden; 259 | } 260 | 261 | /* 262 | * Hide only visually, but have it available for screenreaders: h5bp.com/v 263 | */ 264 | .visuallyhidden { 265 | border: 0; 266 | clip: rect(0, 0, 0, 0); 267 | height: 1px; 268 | margin: -1px; 269 | overflow: hidden; 270 | padding: 0; 271 | position: absolute; 272 | width: 1px; 273 | } 274 | 275 | /* 276 | * Extends the .visuallyhidden class to allow the element to be focusable 277 | * when navigated to via the keyboard: h5bp.com/p 278 | */ 279 | .visuallyhidden.focusable:active,.visuallyhidden.focusable:focus { 280 | clip: auto; 281 | height: auto; 282 | margin: 0; 283 | overflow: visible; 284 | position: static; 285 | width: auto; 286 | } 287 | 288 | /* 289 | * Hide visually and from screenreaders, but maintain layout 290 | */ 291 | .invisible { 292 | visibility: hidden; 293 | } 294 | 295 | /* 296 | * Clearfix: contain floats 297 | * 298 | * For modern browsers 299 | * 1. The space content is one way to avoid an Opera bug when the 300 | * `contenteditable` attribute is included anywhere else in the document. 301 | * Otherwise it causes space to appear at the top and bottom of elements 302 | * that receive the `clearfix` class. 303 | * 2. The use of `table` rather than `block` is only necessary if using 304 | * `:before` to contain the top-margins of child elements. 
305 | */ 306 | .clearfix:before,.clearfix:after { 307 | content: " "; /* 1 */ 308 | display: table; /* 2 */ 309 | } 310 | 311 | .clearfix:after { 312 | clear: both; 313 | } 314 | 315 | /* 316 | * For IE 6/7 only 317 | * Include this rule to trigger hasLayout and contain floats. 318 | */ 319 | .clearfix { 320 | *zoom: 1; 321 | } 322 | 323 | /* ========================================================================== 324 | EXAMPLE Media Queries for Responsive Design. 325 | These examples override the primary ('mobile first') styles. 326 | Modify as content requires. 327 | ========================================================================== */ 328 | @media only screen and (min-width: 35em) { 329 | /* Style adjustments for viewports that meet the condition */ 330 | } 331 | 332 | @media print , ( -o-min-device-pixel-ratio : 5/4) , ( 333 | -webkit-min-device-pixel-ratio : 1.25) , ( min-resolution : 120dpi) { 334 | /* Style adjustments for high resolution devices */ 335 | } 336 | 337 | /* ========================================================================== 338 | Print styles. 
339 | Inlined to avoid required HTTP connection: h5bp.com/r 340 | ========================================================================== */ 341 | @media print { 342 | * { 343 | background: transparent !important; 344 | color: #000 !important; /* Black prints faster: h5bp.com/s */ 345 | box-shadow: none !important; 346 | text-shadow: none !important; 347 | } 348 | a,a:visited { 349 | text-decoration: underline; 350 | } 351 | a[href]:after { 352 | content: " (" attr(href) ")"; 353 | } 354 | abbr[title]:after { 355 | content: " (" attr(title) ")"; 356 | } 357 | 358 | /* 359 | * Don't show links for images, or javascript/internal links 360 | */ 361 | .ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after { 362 | content: ""; 363 | } 364 | pre,blockquote { 365 | border: 1px solid #999; 366 | page-break-inside: avoid; 367 | } 368 | thead { 369 | display: table-header-group; /* h5bp.com/t */ 370 | } 371 | tr,img { 372 | page-break-inside: avoid; 373 | } 374 | img { 375 | max-width: 100% !important; 376 | } 377 | @page { 378 | margin: 0.5cm; 379 | } 380 | p,h2,h3 { 381 | orphans: 3; 382 | widows: 3; 383 | } 384 | h2,h3 { 385 | page-break-after: avoid; 386 | } 387 | } -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/ext/danishStemmer.java: -------------------------------------------------------------------------------- 1 | // This file was generated automatically by the Snowball to Java compiler 2 | 3 | package org.tartarus.snowball.ext; 4 | 5 | import org.tartarus.snowball.Among; 6 | 7 | /** 8 | * This class was automatically generated by a Snowball to Java compiler It 9 | * implements the stemming algorithm defined by a snowball script. 
/**
 * Danish stemmer generated by the Snowball to Java compiler.
 *
 * Usage follows the SnowballProgram base class: set the word, call
 * {@link #stem()}, then read the result back. The "line N" comments inside
 * the methods are emitted by the generator and refer to lines of the
 * Snowball script, not of this file.
 */
public class danishStemmer extends org.tartarus.snowball.SnowballStemmer {

    private static final long serialVersionUID = 1L;

    // Instance handed to every Among entry below.
    // NOTE(review): presumably the receiver for per-entry callbacks; every
    // entry here passes an empty method name - confirm against Among.
    private final static danishStemmer methodObject = new danishStemmer();

    // Suffix table searched by r_main_suffix. The third constructor argument
    // is the value r_main_suffix switches on: 1 = delete unconditionally,
    // 2 (only "s") = delete if the preceding character is in g_s_ending.
    // NOTE(review): the second argument looks like the index of the longest
    // shorter entry that is a suffix of this one (-1 for none) - confirm
    // against Among / find_among_b.
    private final static Among a_0[] = {
            new Among("hed", -1, 1, "", methodObject),
            new Among("ethed", 0, 1, "", methodObject),
            new Among("ered", -1, 1, "", methodObject),
            new Among("e", -1, 1, "", methodObject),
            new Among("erede", 3, 1, "", methodObject),
            new Among("ende", 3, 1, "", methodObject),
            new Among("erende", 5, 1, "", methodObject),
            new Among("ene", 3, 1, "", methodObject),
            new Among("erne", 3, 1, "", methodObject),
            new Among("ere", 3, 1, "", methodObject),
            new Among("en", -1, 1, "", methodObject),
            new Among("heden", 10, 1, "", methodObject),
            new Among("eren", 10, 1, "", methodObject),
            new Among("er", -1, 1, "", methodObject),
            new Among("heder", 13, 1, "", methodObject),
            new Among("erer", 13, 1, "", methodObject),
            new Among("s", -1, 2, "", methodObject),
            new Among("heds", 16, 1, "", methodObject),
            new Among("es", 16, 1, "", methodObject),
            new Among("endes", 18, 1, "", methodObject),
            new Among("erendes", 19, 1, "", methodObject),
            new Among("enes", 18, 1, "", methodObject),
            new Among("ernes", 18, 1, "", methodObject),
            new Among("eres", 18, 1, "", methodObject),
            new Among("ens", 16, 1, "", methodObject),
            new Among("hedens", 24, 1, "", methodObject),
            new Among("erens", 24, 1, "", methodObject),
            new Among("ers", 16, 1, "", methodObject),
            new Among("ets", 16, 1, "", methodObject),
            new Among("erets", 28, 1, "", methodObject),
            new Among("et", -1, 1, "", methodObject),
            new Among("eret", 30, 1, "", methodObject) };

    // Two-letter endings tested by r_consonant_pair; only "matched or not"
    // is used there, the result value is never inspected.
    private final static Among a_1[] = {
            new Among("gd", -1, -1, "", methodObject),
            new Among("dt", -1, -1, "", methodObject),
            new Among("gt", -1, -1, "", methodObject),
            new Among("kt", -1, -1, "", methodObject) };

    // Suffix table searched by r_other_suffix: result 1 = delete and then
    // retry r_consonant_pair, result 2 = replace with "l\u00F8s".
    private final static Among a_2[] = {
            new Among("ig", -1, 1, "", methodObject),
            new Among("lig", 0, 1, "", methodObject),
            new Among("elig", 1, 1, "", methodObject),
            new Among("els", -1, 1, "", methodObject),
            new Among("l\u00F8st", -1, 2, "", methodObject) };

    // Bitmap passed to the *_grouping helpers with the range 97..248
    // (one bit per character code, offset by 97).
    // NOTE(review): presumably the Danish vowel set, judging by the name.
    private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 48, 0, 128 };

    // Bitmap passed to in_grouping_b with the range 97..229: the characters
    // that must precede a final "s" for r_main_suffix to delete it.
    private static final char g_s_ending[] = { 239, 254, 42, 3, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 16 };

    // Position three characters past the start of the word (set by the
    // "hop" in r_mark_regions); lower bound for I_p1.
    private int I_x;
    // Region mark computed by r_mark_regions; every suffix routine refuses
    // to look further back than this position.
    private int I_p1;
    // Scratch buffer holding the character captured by r_undouble.
    private java.lang.StringBuilder S_ch = new java.lang.StringBuilder();

    /**
     * Copies the stemmer state from another instance.
     * Note that S_ch is assigned by reference, so both instances share the
     * same buffer afterwards.
     */
    private void copy_from(danishStemmer other) {
        I_x = other.I_x;
        I_p1 = other.I_p1;
        S_ch = other.S_ch;
        super.copy_from(other);
    }

    /**
     * Computes the marks I_x and I_p1 for the current word.
     *
     * I_p1 defaults to limit. I_x becomes cursor + 3. The cursor then scans
     * forward to the first character accepted by g_v and on past the first
     * following character rejected by g_v; that position becomes I_p1,
     * raised to I_x if it would be smaller.
     *
     * @return false when the word is shorter than three characters or the
     *         scan runs into limit (I_p1 then keeps the value it had at
     *         that point); true otherwise
     */
    private boolean r_mark_regions() {
        int v_1;
        int v_2;
        // (, line 29
        I_p1 = limit;
        // test, line 33
        v_1 = cursor;
        // (, line 33
        // hop, line 33
        {
            int c = cursor + 3;
            if (0 > c || c > limit) {
                return false;
            }
            cursor = c;
        }
        // setmark x, line 33
        I_x = cursor;
        cursor = v_1;
        // goto, line 34
        golab0: while (true) {
            v_2 = cursor;
            lab1: do {
                if (!(in_grouping(g_v, 97, 248))) {
                    break lab1;
                }
                // Found a g_v character: step back in front of it and stop.
                cursor = v_2;
                break golab0;
            } while (false);
            cursor = v_2;
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // gopast, line 34
        golab2: while (true) {
            lab3: do {
                if (!(out_grouping(g_v, 97, 248))) {
                    break lab3;
                }
                break golab2;
            } while (false);
            if (cursor >= limit) {
                return false;
            }
            cursor++;
        }
        // setmark p1, line 34
        I_p1 = cursor;
        // try, line 35
        lab4: do {
            // (, line 35
            if (!(I_p1 < I_x)) {
                break lab4;
            }
            I_p1 = I_x;
        } while (false);
        return true;
    }

    /**
     * Main suffix step: searches backwards for an a_0 entry with the
     * backward limit temporarily raised to I_p1 and deletes the match,
     * either unconditionally (result 1) or, for result 2, only when the
     * preceding character is accepted by g_s_ending.
     *
     * @return false when the cursor lies before I_p1, nothing matches, or
     *         the g_s_ending test fails; true otherwise
     */
    private boolean r_main_suffix() {
        int among_var;
        int v_1;
        int v_2;
        // (, line 40
        // setlimit, line 41
        v_1 = limit - cursor;
        // tomark, line 41
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 41
        // [, line 41
        ket = cursor;
        // substring, line 41
        among_var = find_among_b(a_0, 32);
        if (among_var == 0) {
            limit_backward = v_2;
            return false;
        }
        // ], line 41
        bra = cursor;
        limit_backward = v_2;
        switch (among_var) {
        case 0:
            // Not reachable: a zero result already returned above.
            return false;
        case 1:
            // (, line 48
            // delete, line 48
            slice_del();
            break;
        case 2:
            // (, line 50
            if (!(in_grouping_b(g_s_ending, 97, 229))) {
                return false;
            }
            // delete, line 50
            slice_del();
            break;
        }
        return true;
    }

    /**
     * Consonant pair step: if the text before the cursor ends in one of the
     * a_1 pairs (searched with the backward limit raised to I_p1), the
     * cursor is put back where it started and slice_del() is applied to the
     * single character in front of it.
     *
     * @return false when the cursor lies before I_p1, no pair matches, or
     *         there is no character left to step over; true otherwise
     */
    private boolean r_consonant_pair() {
        int v_1;
        int v_2;
        int v_3;
        // (, line 54
        // test, line 55
        v_1 = limit - cursor;
        // (, line 55
        // setlimit, line 56
        v_2 = limit - cursor;
        // tomark, line 56
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_3 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_2;
        // (, line 56
        // [, line 56
        ket = cursor;
        // substring, line 56
        if (find_among_b(a_1, 4) == 0) {
            limit_backward = v_3;
            return false;
        }
        // ], line 56
        bra = cursor;
        limit_backward = v_3;
        // End of the "test": rewind to the starting position; ket still
        // marks that position.
        cursor = limit - v_1;
        // next, line 62
        if (cursor <= limit_backward) {
            return false;
        }
        cursor--;
        // ], line 62
        bra = cursor;
        // delete, line 62
        slice_del();
        return true;
    }

    /**
     * Other suffix step. First, independently of the result below, applies
     * slice_del() to a trailing "st" that is preceded by "ig". Then searches
     * backwards for an a_2 entry with the backward limit raised to I_p1:
     * result 1 deletes the match and retries r_consonant_pair, result 2
     * replaces the match with "l\u00F8s".
     *
     * @return false when the cursor lies before I_p1 or no a_2 entry
     *         matches; true otherwise
     */
    private boolean r_other_suffix() {
        int among_var;
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        // (, line 65
        // do, line 66
        v_1 = limit - cursor;
        lab0: do {
            // (, line 66
            // [, line 66
            ket = cursor;
            // literal, line 66
            if (!(eq_s_b(2, "st"))) {
                break lab0;
            }
            // ], line 66
            bra = cursor;
            // literal, line 66
            if (!(eq_s_b(2, "ig"))) {
                break lab0;
            }
            // delete, line 66
            slice_del();
        } while (false);
        cursor = limit - v_1;
        // setlimit, line 67
        v_2 = limit - cursor;
        // tomark, line 67
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_3 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_2;
        // (, line 67
        // [, line 67
        ket = cursor;
        // substring, line 67
        among_var = find_among_b(a_2, 5);
        if (among_var == 0) {
            limit_backward = v_3;
            return false;
        }
        // ], line 67
        bra = cursor;
        limit_backward = v_3;
        switch (among_var) {
        case 0:
            // Not reachable: a zero result already returned above.
            return false;
        case 1:
            // (, line 70
            // delete, line 70
            slice_del();
            // do, line 70
            v_4 = limit - cursor;
            lab1: do {
                // call consonant_pair, line 70
                if (!r_consonant_pair()) {
                    break lab1;
                }
            } while (false);
            cursor = limit - v_4;
            break;
        case 2:
            // (, line 72
            // <-, line 72
            slice_from("l\u00F8s");
            break;
        }
        return true;
    }

    /**
     * Undouble step: captures the character before the cursor into S_ch,
     * provided out_grouping_b(g_v, ...) accepts it and it lies at or after
     * I_p1, and applies slice_del() to it when the text in front of it ends
     * with that same character.
     *
     * @return false when the cursor lies before I_p1, the character is
     *         rejected, or it is not doubled; true otherwise
     */
    private boolean r_undouble() {
        int v_1;
        int v_2;
        // (, line 75
        // setlimit, line 76
        v_1 = limit - cursor;
        // tomark, line 76
        if (cursor < I_p1) {
            return false;
        }
        cursor = I_p1;
        v_2 = limit_backward;
        limit_backward = cursor;
        cursor = limit - v_1;
        // (, line 76
        // [, line 76
        ket = cursor;
        if (!(out_grouping_b(g_v, 97, 248))) {
            limit_backward = v_2;
            return false;
        }
        // ], line 76
        bra = cursor;
        // -> ch, line 76
        S_ch = slice_to(S_ch);
        limit_backward = v_2;
        // name ch, line 77
        if (!(eq_v_b(S_ch))) {
            return false;
        }
        // delete, line 78
        slice_del();
        return true;
    }

    /**
     * Stems the current word in place.
     *
     * Runs r_mark_regions forwards, then switches to backward mode and runs
     * r_main_suffix, r_consonant_pair, r_other_suffix and r_undouble in that
     * order. A step that returns false is simply skipped, and the cursor is
     * reset to the end of the word before each step.
     *
     * @return always true
     */
    public boolean stem() {
        int v_1;
        int v_2;
        int v_3;
        int v_4;
        int v_5;
        // (, line 82
        // do, line 84
        v_1 = cursor;
        lab0: do {
            // call mark_regions, line 84
            if (!r_mark_regions()) {
                break lab0;
            }
        } while (false);
        cursor = v_1;
        // backwards, line 85
        limit_backward = cursor;
        cursor = limit;
        // (, line 85
        // do, line 86
        v_2 = limit - cursor;
        lab1: do {
            // call main_suffix, line 86
            if (!r_main_suffix()) {
                break lab1;
            }
        } while (false);
        cursor = limit - v_2;
        // do, line 87
        v_3 = limit - cursor;
        lab2: do {
            // call consonant_pair, line 87
            if (!r_consonant_pair()) {
                break lab2;
            }
        } while (false);
        cursor = limit - v_3;
        // do, line 88
        v_4 = limit - cursor;
        lab3: do {
            // call other_suffix, line 88
            if (!r_other_suffix()) {
                break lab3;
            }
        } while (false);
        cursor = limit - v_4;
        // do, line 89
        v_5 = limit - cursor;
        lab4: do {
            // call undouble, line 89
            if (!r_undouble()) {
                break lab4;
            }
        } while (false);
        cursor = limit - v_5;
        cursor = limit_backward;
        return true;
    }

    /** All danishStemmer instances compare equal, whatever their state. */
    public boolean equals(Object o) {
        return o instanceof danishStemmer;
    }

    /** Constant per class, consistent with equals. */
    public int hashCode() {
        return danishStemmer.class.getName().hashCode();
    }

}
/src/main/java/org/tartarus/snowball/SnowballProgram.java: -------------------------------------------------------------------------------- 1 | package org.tartarus.snowball; 2 | 3 | import java.lang.reflect.InvocationTargetException; 4 | 5 | public class SnowballProgram { 6 | protected SnowballProgram() { 7 | // fix: (mohaps) changed StringBuffer to StringBuilder 8 | current = new StringBuilder(); 9 | setCurrent(""); 10 | } 11 | 12 | /** 13 | * Set the current string. 14 | */ 15 | public void setCurrent(String value) { 16 | current.replace(0, current.length(), value); 17 | cursor = 0; 18 | limit = current.length(); 19 | limit_backward = 0; 20 | bra = cursor; 21 | ket = limit; 22 | } 23 | 24 | /** 25 | * Get the current string. 26 | */ 27 | public String getCurrent() { 28 | String result = current.toString(); 29 | // Make a new StringBuffer. If we reuse the old one, and a user of 30 | // the library keeps a reference to the buffer returned (for example, 31 | // by converting it to a String in a way which doesn't force a copy), 32 | // the buffer size will not decrease, and we will risk wasting a large 33 | // amount of memory. 34 | // Thanks to Wolfram Esser for spotting this problem. 
35 | // fix: (mohaps) changed StringBuffer to StringBuilder 36 | current = new StringBuilder(); 37 | return result; 38 | } 39 | 40 | // current string 41 | // fix: (mohaps) changed StringBuffer to StringBuilder 42 | protected StringBuilder current; 43 | 44 | protected int cursor; 45 | protected int limit; 46 | protected int limit_backward; 47 | protected int bra; 48 | protected int ket; 49 | 50 | protected void copy_from(SnowballProgram other) { 51 | current = other.current; 52 | cursor = other.cursor; 53 | limit = other.limit; 54 | limit_backward = other.limit_backward; 55 | bra = other.bra; 56 | ket = other.ket; 57 | } 58 | 59 | protected boolean in_grouping(char[] s, int min, int max) { 60 | if (cursor >= limit) 61 | return false; 62 | char ch = current.charAt(cursor); 63 | if (ch > max || ch < min) 64 | return false; 65 | ch -= min; 66 | if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 67 | return false; 68 | cursor++; 69 | return true; 70 | } 71 | 72 | protected boolean in_grouping_b(char[] s, int min, int max) { 73 | if (cursor <= limit_backward) 74 | return false; 75 | char ch = current.charAt(cursor - 1); 76 | if (ch > max || ch < min) 77 | return false; 78 | ch -= min; 79 | if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 80 | return false; 81 | cursor--; 82 | return true; 83 | } 84 | 85 | protected boolean out_grouping(char[] s, int min, int max) { 86 | if (cursor >= limit) 87 | return false; 88 | char ch = current.charAt(cursor); 89 | if (ch > max || ch < min) { 90 | cursor++; 91 | return true; 92 | } 93 | ch -= min; 94 | if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { 95 | cursor++; 96 | return true; 97 | } 98 | return false; 99 | } 100 | 101 | protected boolean out_grouping_b(char[] s, int min, int max) { 102 | if (cursor <= limit_backward) 103 | return false; 104 | char ch = current.charAt(cursor - 1); 105 | if (ch > max || ch < min) { 106 | cursor--; 107 | return true; 108 | } 109 | ch -= min; 110 | if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { 111 | 
cursor--; 112 | return true; 113 | } 114 | return false; 115 | } 116 | 117 | protected boolean in_range(int min, int max) { 118 | if (cursor >= limit) 119 | return false; 120 | char ch = current.charAt(cursor); 121 | if (ch > max || ch < min) 122 | return false; 123 | cursor++; 124 | return true; 125 | } 126 | 127 | protected boolean in_range_b(int min, int max) { 128 | if (cursor <= limit_backward) 129 | return false; 130 | char ch = current.charAt(cursor - 1); 131 | if (ch > max || ch < min) 132 | return false; 133 | cursor--; 134 | return true; 135 | } 136 | 137 | protected boolean out_range(int min, int max) { 138 | if (cursor >= limit) 139 | return false; 140 | char ch = current.charAt(cursor); 141 | if (!(ch > max || ch < min)) 142 | return false; 143 | cursor++; 144 | return true; 145 | } 146 | 147 | protected boolean out_range_b(int min, int max) { 148 | if (cursor <= limit_backward) 149 | return false; 150 | char ch = current.charAt(cursor - 1); 151 | if (!(ch > max || ch < min)) 152 | return false; 153 | cursor--; 154 | return true; 155 | } 156 | 157 | protected boolean eq_s(int s_size, String s) { 158 | if (limit - cursor < s_size) 159 | return false; 160 | int i; 161 | for (i = 0; i != s_size; i++) { 162 | if (current.charAt(cursor + i) != s.charAt(i)) 163 | return false; 164 | } 165 | cursor += s_size; 166 | return true; 167 | } 168 | 169 | protected boolean eq_s_b(int s_size, String s) { 170 | if (cursor - limit_backward < s_size) 171 | return false; 172 | int i; 173 | for (i = 0; i != s_size; i++) { 174 | if (current.charAt(cursor - s_size + i) != s.charAt(i)) 175 | return false; 176 | } 177 | cursor -= s_size; 178 | return true; 179 | } 180 | 181 | protected boolean eq_v(CharSequence s) { 182 | return eq_s(s.length(), s.toString()); 183 | } 184 | 185 | protected boolean eq_v_b(CharSequence s) { 186 | return eq_s_b(s.length(), s.toString()); 187 | } 188 | 189 | protected int find_among(Among v[], int v_size) { 190 | int i = 0; 191 | int j = v_size; 
192 | 193 | int c = cursor; 194 | int l = limit; 195 | 196 | int common_i = 0; 197 | int common_j = 0; 198 | 199 | boolean first_key_inspected = false; 200 | 201 | while (true) { 202 | int k = i + ((j - i) >> 1); 203 | int diff = 0; 204 | int common = common_i < common_j ? common_i : common_j; // smaller 205 | Among w = v[k]; 206 | int i2; 207 | for (i2 = common; i2 < w.s_size; i2++) { 208 | if (c + common == l) { 209 | diff = -1; 210 | break; 211 | } 212 | diff = current.charAt(c + common) - w.s[i2]; 213 | if (diff != 0) 214 | break; 215 | common++; 216 | } 217 | if (diff < 0) { 218 | j = k; 219 | common_j = common; 220 | } else { 221 | i = k; 222 | common_i = common; 223 | } 224 | if (j - i <= 1) { 225 | if (i > 0) 226 | break; // v->s has been inspected 227 | if (j == i) 228 | break; // only one item in v 229 | 230 | // - but now we need to go round once more to get 231 | // v->s inspected. This looks messy, but is actually 232 | // the optimal approach. 233 | 234 | if (first_key_inspected) 235 | break; 236 | first_key_inspected = true; 237 | } 238 | } 239 | while (true) { 240 | Among w = v[i]; 241 | if (common_i >= w.s_size) { 242 | cursor = c + w.s_size; 243 | if (w.method == null) 244 | return w.result; 245 | boolean res; 246 | try { 247 | Object resobj = w.method.invoke(w.methodobject, 248 | new Object[0]); 249 | res = resobj.toString().equals("true"); 250 | } catch (InvocationTargetException e) { 251 | res = false; 252 | // FIXME - debug message 253 | } catch (IllegalAccessException e) { 254 | res = false; 255 | // FIXME - debug message 256 | } 257 | cursor = c + w.s_size; 258 | if (res) 259 | return w.result; 260 | } 261 | i = w.substring_i; 262 | if (i < 0) 263 | return 0; 264 | } 265 | } 266 | 267 | // find_among_b is for backwards processing. 
Same comments apply 268 | protected int find_among_b(Among v[], int v_size) { 269 | int i = 0; 270 | int j = v_size; 271 | 272 | int c = cursor; 273 | int lb = limit_backward; 274 | 275 | int common_i = 0; 276 | int common_j = 0; 277 | 278 | boolean first_key_inspected = false; 279 | 280 | while (true) { 281 | int k = i + ((j - i) >> 1); 282 | int diff = 0; 283 | int common = common_i < common_j ? common_i : common_j; 284 | Among w = v[k]; 285 | int i2; 286 | for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { 287 | if (c - common == lb) { 288 | diff = -1; 289 | break; 290 | } 291 | diff = current.charAt(c - 1 - common) - w.s[i2]; 292 | if (diff != 0) 293 | break; 294 | common++; 295 | } 296 | if (diff < 0) { 297 | j = k; 298 | common_j = common; 299 | } else { 300 | i = k; 301 | common_i = common; 302 | } 303 | if (j - i <= 1) { 304 | if (i > 0) 305 | break; 306 | if (j == i) 307 | break; 308 | if (first_key_inspected) 309 | break; 310 | first_key_inspected = true; 311 | } 312 | } 313 | while (true) { 314 | Among w = v[i]; 315 | if (common_i >= w.s_size) { 316 | cursor = c - w.s_size; 317 | if (w.method == null) 318 | return w.result; 319 | 320 | boolean res; 321 | try { 322 | Object resobj = w.method.invoke(w.methodobject, 323 | new Object[0]); 324 | res = resobj.toString().equals("true"); 325 | } catch (InvocationTargetException e) { 326 | res = false; 327 | // FIXME - debug message 328 | } catch (IllegalAccessException e) { 329 | res = false; 330 | // FIXME - debug message 331 | } 332 | cursor = c - w.s_size; 333 | if (res) 334 | return w.result; 335 | } 336 | i = w.substring_i; 337 | if (i < 0) 338 | return 0; 339 | } 340 | } 341 | 342 | /* 343 | * to replace chars between c_bra and c_ket in current by the chars in s. 
344 | */ 345 | protected int replace_s(int c_bra, int c_ket, String s) { 346 | int adjustment = s.length() - (c_ket - c_bra); 347 | current.replace(c_bra, c_ket, s); 348 | limit += adjustment; 349 | if (cursor >= c_ket) 350 | cursor += adjustment; 351 | else if (cursor > c_bra) 352 | cursor = c_bra; 353 | return adjustment; 354 | } 355 | 356 | protected void slice_check() { 357 | if (bra < 0 || bra > ket || ket > limit || limit > current.length()) // this 358 | // line 359 | // could 360 | // be 361 | // removed 362 | { 363 | System.err.println("faulty slice operation"); 364 | // FIXME: report error somehow. 365 | /* 366 | * fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); 367 | * exit(1); 368 | */ 369 | } 370 | } 371 | 372 | protected void slice_from(String s) { 373 | slice_check(); 374 | replace_s(bra, ket, s); 375 | } 376 | 377 | protected void slice_from(CharSequence s) { 378 | slice_from(s.toString()); 379 | } 380 | 381 | protected void slice_del() { 382 | slice_from(""); 383 | } 384 | 385 | protected void insert(int c_bra, int c_ket, String s) { 386 | int adjustment = replace_s(c_bra, c_ket, s); 387 | if (c_bra <= bra) 388 | bra += adjustment; 389 | if (c_bra <= ket) 390 | ket += adjustment; 391 | } 392 | 393 | protected void insert(int c_bra, int c_ket, CharSequence s) { 394 | insert(c_bra, c_ket, s.toString()); 395 | } 396 | 397 | /* Copy the slice into the supplied StringBuffer */ 398 | protected StringBuffer slice_to(StringBuffer s) { 399 | slice_check(); 400 | int len = ket - bra; 401 | s.replace(0, s.length(), current.substring(bra, ket)); 402 | return s; 403 | } 404 | 405 | /* Copy the slice into the supplied StringBuilder */ 406 | protected StringBuilder slice_to(StringBuilder s) { 407 | slice_check(); 408 | int len = ket - bra; 409 | s.replace(0, s.length(), current.substring(bra, ket)); 410 | return s; 411 | } 412 | 413 | protected StringBuffer assign_to(StringBuffer s) { 414 | s.replace(0, s.length(), current.substring(0, limit)); 415 
| return s; 416 | } 417 | 418 | protected StringBuilder assign_to(StringBuilder s) { 419 | s.replace(0, s.length(), current.substring(0, limit)); 420 | return s; 421 | } 422 | 423 | /* 424 | * extern void debug(struct SN_env * z, int number, int line_count) { int i; 425 | * int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '", 426 | * number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'", 427 | * number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) 428 | * printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if 429 | * (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { 430 | * int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } 431 | * printf("'\n"); } 432 | */ 433 | 434 | }; 435 | -------------------------------------------------------------------------------- /src/main/webapp/css/normalize.css: -------------------------------------------------------------------------------- 1 | /*! normalize.css v1.1.0 | MIT License | git.io/normalize */ 2 | 3 | /* ========================================================================== 4 | HTML5 display definitions 5 | ========================================================================== */ 6 | 7 | /** 8 | * Correct `block` display not defined in IE 6/7/8/9 and Firefox 3. 9 | */ 10 | 11 | article, 12 | aside, 13 | details, 14 | figcaption, 15 | figure, 16 | footer, 17 | header, 18 | hgroup, 19 | main, 20 | nav, 21 | section, 22 | summary { 23 | display: block; 24 | } 25 | 26 | /** 27 | * Correct `inline-block` display not defined in IE 6/7/8/9 and Firefox 3. 28 | */ 29 | 30 | audio, 31 | canvas, 32 | video { 33 | display: inline-block; 34 | *display: inline; 35 | *zoom: 1; 36 | } 37 | 38 | /** 39 | * Prevent modern browsers from displaying `audio` without controls. 40 | * Remove excess height in iOS 5 devices. 
41 | */ 42 | 43 | audio:not([controls]) { 44 | display: none; 45 | height: 0; 46 | } 47 | 48 | /** 49 | * Address styling not present in IE 7/8/9, Firefox 3, and Safari 4. 50 | * Known issue: no IE 6 support. 51 | */ 52 | 53 | [hidden] { 54 | display: none; 55 | } 56 | 57 | /* ========================================================================== 58 | Base 59 | ========================================================================== */ 60 | 61 | /** 62 | * 1. Correct text resizing oddly in IE 6/7 when body `font-size` is set using 63 | * `em` units. 64 | * 2. Prevent iOS text size adjust after orientation change, without disabling 65 | * user zoom. 66 | */ 67 | 68 | html { 69 | font-size: 100%; /* 1 */ 70 | -webkit-text-size-adjust: 100%; /* 2 */ 71 | -ms-text-size-adjust: 100%; /* 2 */ 72 | } 73 | 74 | /** 75 | * Address `font-family` inconsistency between `textarea` and other form 76 | * elements. 77 | */ 78 | 79 | html, 80 | button, 81 | input, 82 | select, 83 | textarea { 84 | font-family: sans-serif; 85 | } 86 | 87 | /** 88 | * Address margins handled incorrectly in IE 6/7. 89 | */ 90 | 91 | body { 92 | margin: 0; 93 | } 94 | 95 | /* ========================================================================== 96 | Links 97 | ========================================================================== */ 98 | 99 | /** 100 | * Address `outline` inconsistency between Chrome and other browsers. 101 | */ 102 | 103 | a:focus { 104 | outline: thin dotted; 105 | } 106 | 107 | /** 108 | * Improve readability when focused and also mouse hovered in all browsers. 109 | */ 110 | 111 | a:active, 112 | a:hover { 113 | outline: 0; 114 | } 115 | 116 | /* ========================================================================== 117 | Typography 118 | ========================================================================== */ 119 | 120 | /** 121 | * Address font sizes and margins set differently in IE 6/7. 
122 | * Address font sizes within `section` and `article` in Firefox 4+, Safari 5, 123 | * and Chrome. 124 | */ 125 | 126 | h1 { 127 | font-size: 2em; 128 | margin: 0.67em 0; 129 | } 130 | 131 | h2 { 132 | font-size: 1.5em; 133 | margin: 0.83em 0; 134 | } 135 | 136 | h3 { 137 | font-size: 1.17em; 138 | margin: 1em 0; 139 | } 140 | 141 | h4 { 142 | font-size: 1em; 143 | margin: 1.33em 0; 144 | } 145 | 146 | h5 { 147 | font-size: 0.83em; 148 | margin: 1.67em 0; 149 | } 150 | 151 | h6 { 152 | font-size: 0.67em; 153 | margin: 2.33em 0; 154 | } 155 | 156 | /** 157 | * Address styling not present in IE 7/8/9, Safari 5, and Chrome. 158 | */ 159 | 160 | abbr[title] { 161 | border-bottom: 1px dotted; 162 | } 163 | 164 | /** 165 | * Address style set to `bolder` in Firefox 3+, Safari 4/5, and Chrome. 166 | */ 167 | 168 | b, 169 | strong { 170 | font-weight: bold; 171 | } 172 | 173 | blockquote { 174 | margin: 1em 40px; 175 | } 176 | 177 | /** 178 | * Address styling not present in Safari 5 and Chrome. 179 | */ 180 | 181 | dfn { 182 | font-style: italic; 183 | } 184 | 185 | /** 186 | * Address differences between Firefox and other browsers. 187 | * Known issue: no IE 6/7 normalization. 188 | */ 189 | 190 | hr { 191 | -moz-box-sizing: content-box; 192 | box-sizing: content-box; 193 | height: 0; 194 | } 195 | 196 | /** 197 | * Address styling not present in IE 6/7/8/9. 198 | */ 199 | 200 | mark { 201 | background: #ff0; 202 | color: #000; 203 | } 204 | 205 | /** 206 | * Address margins set differently in IE 6/7. 207 | */ 208 | 209 | p, 210 | pre { 211 | margin: 1em 0; 212 | } 213 | 214 | /** 215 | * Correct font family set oddly in IE 6, Safari 4/5, and Chrome. 216 | */ 217 | 218 | code, 219 | kbd, 220 | pre, 221 | samp { 222 | font-family: monospace, serif; 223 | _font-family: 'courier new', monospace; 224 | font-size: 1em; 225 | } 226 | 227 | /** 228 | * Improve readability of pre-formatted text in all browsers. 
229 | */ 230 | 231 | pre { 232 | white-space: pre; 233 | white-space: pre-wrap; 234 | word-wrap: break-word; 235 | } 236 | 237 | /** 238 | * Address CSS quotes not supported in IE 6/7. 239 | */ 240 | 241 | q { 242 | quotes: none; 243 | } 244 | 245 | /** 246 | * Address `quotes` property not supported in Safari 4. 247 | */ 248 | 249 | q:before, 250 | q:after { 251 | content: ''; 252 | content: none; 253 | } 254 | 255 | /** 256 | * Address inconsistent and variable font size in all browsers. 257 | */ 258 | 259 | small { 260 | font-size: 80%; 261 | } 262 | 263 | /** 264 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 265 | */ 266 | 267 | sub, 268 | sup { 269 | font-size: 75%; 270 | line-height: 0; 271 | position: relative; 272 | vertical-align: baseline; 273 | } 274 | 275 | sup { 276 | top: -0.5em; 277 | } 278 | 279 | sub { 280 | bottom: -0.25em; 281 | } 282 | 283 | /* ========================================================================== 284 | Lists 285 | ========================================================================== */ 286 | 287 | /** 288 | * Address margins set differently in IE 6/7. 289 | */ 290 | 291 | dl, 292 | menu, 293 | ol, 294 | ul { 295 | margin: 1em 0; 296 | } 297 | 298 | dd { 299 | margin: 0 0 0 40px; 300 | } 301 | 302 | /** 303 | * Address paddings set differently in IE 6/7. 304 | */ 305 | 306 | menu, 307 | ol, 308 | ul { 309 | padding: 0 0 0 40px; 310 | } 311 | 312 | /** 313 | * Correct list images handled incorrectly in IE 7. 314 | */ 315 | 316 | nav ul, 317 | nav ol { 318 | list-style: none; 319 | list-style-image: none; 320 | } 321 | 322 | /* ========================================================================== 323 | Embedded content 324 | ========================================================================== */ 325 | 326 | /** 327 | * 1. Remove border when inside `a` element in IE 6/7/8/9 and Firefox 3. 328 | * 2. Improve image quality when scaled in IE 7. 
329 | */ 330 | 331 | img { 332 | border: 0; /* 1 */ 333 | -ms-interpolation-mode: bicubic; /* 2 */ 334 | } 335 | 336 | /** 337 | * Correct overflow displayed oddly in IE 9. 338 | */ 339 | 340 | svg:not(:root) { 341 | overflow: hidden; 342 | } 343 | 344 | /* ========================================================================== 345 | Figures 346 | ========================================================================== */ 347 | 348 | /** 349 | * Address margin not present in IE 6/7/8/9, Safari 5, and Opera 11. 350 | */ 351 | 352 | figure { 353 | margin: 0; 354 | } 355 | 356 | /* ========================================================================== 357 | Forms 358 | ========================================================================== */ 359 | 360 | /** 361 | * Correct margin displayed oddly in IE 6/7. 362 | */ 363 | 364 | form { 365 | margin: 0; 366 | } 367 | 368 | /** 369 | * Define consistent border, margin, and padding. 370 | */ 371 | 372 | fieldset { 373 | border: 1px solid #c0c0c0; 374 | margin: 0 2px; 375 | padding: 0.35em 0.625em 0.75em; 376 | } 377 | 378 | /** 379 | * 1. Correct color not being inherited in IE 6/7/8/9. 380 | * 2. Correct text not wrapping in Firefox 3. 381 | * 3. Correct alignment displayed oddly in IE 6/7. 382 | */ 383 | 384 | legend { 385 | border: 0; /* 1 */ 386 | padding: 0; 387 | white-space: normal; /* 2 */ 388 | *margin-left: -7px; /* 3 */ 389 | } 390 | 391 | /** 392 | * 1. Correct font size not being inherited in all browsers. 393 | * 2. Address margins set differently in IE 6/7, Firefox 3+, Safari 5, 394 | * and Chrome. 395 | * 3. Improve appearance and consistency in all browsers. 396 | */ 397 | 398 | button, 399 | input, 400 | select, 401 | textarea { 402 | font-size: 100%; /* 1 */ 403 | margin: 0; /* 2 */ 404 | vertical-align: baseline; /* 3 */ 405 | *vertical-align: middle; /* 3 */ 406 | } 407 | 408 | /** 409 | * Address Firefox 3+ setting `line-height` on `input` using `!important` in 410 | * the UA stylesheet. 
411 | */ 412 | 413 | button, 414 | input { 415 | line-height: normal; 416 | } 417 | 418 | /** 419 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 420 | * All other form control elements do not inherit `text-transform` values. 421 | * Correct `button` style inheritance in Chrome, Safari 5+, and IE 6+. 422 | * Correct `select` style inheritance in Firefox 4+ and Opera. 423 | */ 424 | 425 | button, 426 | select { 427 | text-transform: none; 428 | } 429 | 430 | /** 431 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 432 | * and `video` controls. 433 | * 2. Correct inability to style clickable `input` types in iOS. 434 | * 3. Improve usability and consistency of cursor style between image-type 435 | * `input` and others. 436 | * 4. Remove inner spacing in IE 7 without affecting normal text inputs. 437 | * Known issue: inner spacing remains in IE 6. 438 | */ 439 | 440 | button, 441 | html input[type="button"], /* 1 */ 442 | input[type="reset"], 443 | input[type="submit"] { 444 | -webkit-appearance: button; /* 2 */ 445 | cursor: pointer; /* 3 */ 446 | *overflow: visible; /* 4 */ 447 | } 448 | 449 | /** 450 | * Re-set default cursor for disabled elements. 451 | */ 452 | 453 | button[disabled], 454 | html input[disabled] { 455 | cursor: default; 456 | } 457 | 458 | /** 459 | * 1. Address box sizing set to content-box in IE 8/9. 460 | * 2. Remove excess padding in IE 8/9. 461 | * 3. Remove excess padding in IE 7. 462 | * Known issue: excess padding remains in IE 6. 463 | */ 464 | 465 | input[type="checkbox"], 466 | input[type="radio"] { 467 | box-sizing: border-box; /* 1 */ 468 | padding: 0; /* 2 */ 469 | *height: 13px; /* 3 */ 470 | *width: 13px; /* 3 */ 471 | } 472 | 473 | /** 474 | * 1. Address `appearance` set to `searchfield` in Safari 5 and Chrome. 475 | * 2. Address `box-sizing` set to `border-box` in Safari 5 and Chrome 476 | * (include `-moz` to future-proof). 
477 | */ 478 | 479 | input[type="search"] { 480 | -webkit-appearance: textfield; /* 1 */ 481 | -moz-box-sizing: content-box; 482 | -webkit-box-sizing: content-box; /* 2 */ 483 | box-sizing: content-box; 484 | } 485 | 486 | /** 487 | * Remove inner padding and search cancel button in Safari 5 and Chrome 488 | * on OS X. 489 | */ 490 | 491 | input[type="search"]::-webkit-search-cancel-button, 492 | input[type="search"]::-webkit-search-decoration { 493 | -webkit-appearance: none; 494 | } 495 | 496 | /** 497 | * Remove inner padding and border in Firefox 3+. 498 | */ 499 | 500 | button::-moz-focus-inner, 501 | input::-moz-focus-inner { 502 | border: 0; 503 | padding: 0; 504 | } 505 | 506 | /** 507 | * 1. Remove default vertical scrollbar in IE 6/7/8/9. 508 | * 2. Improve readability and alignment in all browsers. 509 | */ 510 | 511 | textarea { 512 | overflow: auto; /* 1 */ 513 | vertical-align: top; /* 2 */ 514 | } 515 | 516 | /* ========================================================================== 517 | Tables 518 | ========================================================================== */ 519 | 520 | /** 521 | * Remove most spacing between table cells. 
522 | */ 523 | 524 | table { 525 | border-collapse: collapse; 526 | border-spacing: 0; 527 | } 528 | -------------------------------------------------------------------------------- /src/main/webapp/js/vendor/modernizr-2.6.2.min.js: -------------------------------------------------------------------------------- 1 | /* Modernizr 2.6.2 (Custom Build) | MIT & BSD 2 | * Build: http://modernizr.com/download/#-fontface-backgroundsize-borderimage-borderradius-boxshadow-flexbox-hsla-multiplebgs-opacity-rgba-textshadow-cssanimations-csscolumns-generatedcontent-cssgradients-cssreflections-csstransforms-csstransforms3d-csstransitions-applicationcache-canvas-canvastext-draganddrop-hashchange-history-audio-video-indexeddb-input-inputtypes-localstorage-postmessage-sessionstorage-websockets-websqldatabase-webworkers-geolocation-inlinesvg-smil-svg-svgclippaths-touch-webgl-shiv-mq-cssclasses-addtest-prefixed-teststyles-testprop-testallprops-hasevent-prefixes-domprefixes-load 3 | */ 4 | ;window.Modernizr=function(a,b,c){function D(a){j.cssText=a}function E(a,b){return D(n.join(a+";")+(b||""))}function F(a,b){return typeof a===b}function G(a,b){return!!~(""+a).indexOf(b)}function H(a,b){for(var d in a){var e=a[d];if(!G(e,"-")&&j[e]!==c)return b=="pfx"?e:!0}return!1}function I(a,b,d){for(var e in a){var f=b[a[e]];if(f!==c)return d===!1?a[e]:F(f,"function")?f.bind(d||b):f}return!1}function J(a,b,c){var d=a.charAt(0).toUpperCase()+a.slice(1),e=(a+" "+p.join(d+" ")+d).split(" ");return F(b,"string")||F(b,"undefined")?H(e,b):(e=(a+" "+q.join(d+" ")+d).split(" "),I(e,b,c))}function K(){e.input=function(c){for(var d=0,e=c.length;d',a,""].join(""),l.id=h,(m?l:n).innerHTML+=f,n.appendChild(l),m||(n.style.background="",n.style.overflow="hidden",k=g.style.overflow,g.style.overflow="hidden",g.appendChild(n)),i=c(l,a),m?l.parentNode.removeChild(l):(n.parentNode.removeChild(n),g.style.overflow=k),!!i},z=function(b){var c=a.matchMedia||a.msMatchMedia;if(c)return c(b).matches;var d;return y("@media 
"+b+" { #"+h+" { position: absolute; } }",function(b){d=(a.getComputedStyle?getComputedStyle(b,null):b.currentStyle)["position"]=="absolute"}),d},A=function(){function d(d,e){e=e||b.createElement(a[d]||"div"),d="on"+d;var f=d in e;return f||(e.setAttribute||(e=b.createElement("div")),e.setAttribute&&e.removeAttribute&&(e.setAttribute(d,""),f=F(e[d],"function"),F(e[d],"undefined")||(e[d]=c),e.removeAttribute(d))),e=null,f}var a={select:"input",change:"input",submit:"form",reset:"form",error:"img",load:"img",abort:"img"};return d}(),B={}.hasOwnProperty,C;!F(B,"undefined")&&!F(B.call,"undefined")?C=function(a,b){return B.call(a,b)}:C=function(a,b){return b in a&&F(a.constructor.prototype[b],"undefined")},Function.prototype.bind||(Function.prototype.bind=function(b){var c=this;if(typeof c!="function")throw new TypeError;var d=w.call(arguments,1),e=function(){if(this instanceof e){var a=function(){};a.prototype=c.prototype;var f=new a,g=c.apply(f,d.concat(w.call(arguments)));return Object(g)===g?g:f}return c.apply(b,d.concat(w.call(arguments)))};return e}),s.flexbox=function(){return J("flexWrap")},s.canvas=function(){var a=b.createElement("canvas");return!!a.getContext&&!!a.getContext("2d")},s.canvastext=function(){return!!e.canvas&&!!F(b.createElement("canvas").getContext("2d").fillText,"function")},s.webgl=function(){return!!a.WebGLRenderingContext},s.touch=function(){var c;return"ontouchstart"in a||a.DocumentTouch&&b instanceof DocumentTouch?c=!0:y(["@media (",n.join("touch-enabled),("),h,")","{#modernizr{top:9px;position:absolute}}"].join(""),function(a){c=a.offsetTop===9}),c},s.geolocation=function(){return"geolocation"in navigator},s.postmessage=function(){return!!a.postMessage},s.websqldatabase=function(){return!!a.openDatabase},s.indexedDB=function(){return!!J("indexedDB",a)},s.hashchange=function(){return A("hashchange",a)&&(b.documentMode===c||b.documentMode>7)},s.history=function(){return!!a.history&&!!history.pushState},s.draganddrop=function(){var 
a=b.createElement("div");return"draggable"in a||"ondragstart"in a&&"ondrop"in a},s.websockets=function(){return"WebSocket"in a||"MozWebSocket"in a},s.rgba=function(){return D("background-color:rgba(150,255,150,.5)"),G(j.backgroundColor,"rgba")},s.hsla=function(){return D("background-color:hsla(120,40%,100%,.5)"),G(j.backgroundColor,"rgba")||G(j.backgroundColor,"hsla")},s.multiplebgs=function(){return D("background:url(https://),url(https://),red url(https://)"),/(url\s*\(.*?){3}/.test(j.background)},s.backgroundsize=function(){return J("backgroundSize")},s.borderimage=function(){return J("borderImage")},s.borderradius=function(){return J("borderRadius")},s.boxshadow=function(){return J("boxShadow")},s.textshadow=function(){return b.createElement("div").style.textShadow===""},s.opacity=function(){return E("opacity:.55"),/^0.55$/.test(j.opacity)},s.cssanimations=function(){return J("animationName")},s.csscolumns=function(){return J("columnCount")},s.cssgradients=function(){var a="background-image:",b="gradient(linear,left top,right bottom,from(#9f9),to(white));",c="linear-gradient(left top,#9f9, white);";return D((a+"-webkit- ".split(" ").join(b+a)+n.join(c+a)).slice(0,-a.length)),G(j.backgroundImage,"gradient")},s.cssreflections=function(){return J("boxReflect")},s.csstransforms=function(){return!!J("transform")},s.csstransforms3d=function(){var a=!!J("perspective");return a&&"webkitPerspective"in g.style&&y("@media (transform-3d),(-webkit-transform-3d){#modernizr{left:9px;position:absolute;height:3px;}}",function(b,c){a=b.offsetLeft===9&&b.offsetHeight===3}),a},s.csstransitions=function(){return J("transition")},s.fontface=function(){var a;return y('@font-face {font-family:"font";src:url("https://")}',function(c,d){var e=b.getElementById("smodernizr"),f=e.sheet||e.styleSheet,g=f?f.cssRules&&f.cssRules[0]?f.cssRules[0].cssText:f.cssText||"":"";a=/src/i.test(g)&&g.indexOf(d.split(" ")[0])===0}),a},s.generatedcontent=function(){var a;return y(["#",h,"{font:0/0 
a}#",h,':after{content:"',l,'";visibility:hidden;font:3px/1 a}'].join(""),function(b){a=b.offsetHeight>=3}),a},s.video=function(){var a=b.createElement("video"),c=!1;try{if(c=!!a.canPlayType)c=new Boolean(c),c.ogg=a.canPlayType('video/ogg; codecs="theora"').replace(/^no$/,""),c.h264=a.canPlayType('video/mp4; codecs="avc1.42E01E"').replace(/^no$/,""),c.webm=a.canPlayType('video/webm; codecs="vp8, vorbis"').replace(/^no$/,"")}catch(d){}return c},s.audio=function(){var a=b.createElement("audio"),c=!1;try{if(c=!!a.canPlayType)c=new Boolean(c),c.ogg=a.canPlayType('audio/ogg; codecs="vorbis"').replace(/^no$/,""),c.mp3=a.canPlayType("audio/mpeg;").replace(/^no$/,""),c.wav=a.canPlayType('audio/wav; codecs="1"').replace(/^no$/,""),c.m4a=(a.canPlayType("audio/x-m4a;")||a.canPlayType("audio/aac;")).replace(/^no$/,"")}catch(d){}return c},s.localstorage=function(){try{return localStorage.setItem(h,h),localStorage.removeItem(h),!0}catch(a){return!1}},s.sessionstorage=function(){try{return sessionStorage.setItem(h,h),sessionStorage.removeItem(h),!0}catch(a){return!1}},s.webworkers=function(){return!!a.Worker},s.applicationcache=function(){return!!a.applicationCache},s.svg=function(){return!!b.createElementNS&&!!b.createElementNS(r.svg,"svg").createSVGRect},s.inlinesvg=function(){var a=b.createElement("div");return a.innerHTML="",(a.firstChild&&a.firstChild.namespaceURI)==r.svg},s.smil=function(){return!!b.createElementNS&&/SVGAnimate/.test(m.call(b.createElementNS(r.svg,"animate")))},s.svgclippaths=function(){return!!b.createElementNS&&/SVGClipPath/.test(m.call(b.createElementNS(r.svg,"clipPath")))};for(var L in s)C(s,L)&&(x=L.toLowerCase(),e[x]=s[L](),v.push((e[x]?"":"no-")+x));return e.input||K(),e.addTest=function(a,b){if(typeof a=="object")for(var d in a)C(a,d)&&e.addTest(d,a[d]);else{a=a.toLowerCase();if(e[a]!==c)return e;b=typeof b=="function"?b():b,typeof f!="undefined"&&f&&(g.className+=" "+(b?"":"no-")+a),e[a]=b}return e},D(""),i=k=null,function(a,b){function k(a,b){var 
c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function l(){var a=r.elements;return typeof a=="string"?a.split(" "):a}function m(a){var b=i[a[g]];return b||(b={},h++,a[g]=h,i[h]=b),b}function n(a,c,f){c||(c=b);if(j)return c.createElement(a);f||(f=m(c));var g;return f.cache[a]?g=f.cache[a].cloneNode():e.test(a)?g=(f.cache[a]=f.createElem(a)).cloneNode():g=f.createElem(a),g.canHaveChildren&&!d.test(a)?f.frag.appendChild(g):g}function o(a,c){a||(a=b);if(j)return a.createDocumentFragment();c=c||m(a);var d=c.frag.cloneNode(),e=0,f=l(),g=f.length;for(;e",f="hidden"in a,j=a.childNodes.length==1||function(){b.createElement("a");var a=b.createDocumentFragment();return typeof a.cloneNode=="undefined"||typeof a.createDocumentFragment=="undefined"||typeof a.createElement=="undefined"}()}catch(c){f=!0,j=!0}})();var r={elements:c.elements||"abbr article aside audio bdi canvas data datalist details figcaption figure footer header hgroup mark meter nav output progress section summary time video",shivCSS:c.shivCSS!==!1,supportsUnknownElements:j,shivMethods:c.shivMethods!==!1,type:"default",shivDocument:q,createElement:n,createDocumentFragment:o};a.html5=r,q(b)}(this,b),e._version=d,e._prefixes=n,e._domPrefixes=q,e._cssomPrefixes=p,e.mq=z,e.hasEvent=A,e.testProp=function(a){return H([a])},e.testAllProps=J,e.testStyles=y,e.prefixed=function(a,b,c){return b?J(a,b,c):J(a,"pfx")},g.className=g.className.replace(/(^|\s)no-js(\s|$)/,"$1$2")+(f?" 
js "+v.join(" "):""),e}(this,this.document),function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f= limit) { 108 | break lab1; 109 | } 110 | cursor++; 111 | } while (false); 112 | continue replab0; 113 | } while (false); 114 | cursor = v_2; 115 | break replab0; 116 | } 117 | cursor = v_1; 118 | // repeat, line 41 119 | replab4: while (true) { 120 | v_4 = cursor; 121 | lab5: do { 122 | // goto, line 41 123 | golab6: while (true) { 124 | v_5 = cursor; 
125 | lab7: do { 126 | // (, line 41 127 | if (!(in_grouping(g_v, 97, 252))) { 128 | break lab7; 129 | } 130 | // [, line 42 131 | bra = cursor; 132 | // or, line 42 133 | lab8: do { 134 | v_6 = cursor; 135 | lab9: do { 136 | // (, line 42 137 | // literal, line 42 138 | if (!(eq_s(1, "u"))) { 139 | break lab9; 140 | } 141 | // ], line 42 142 | ket = cursor; 143 | if (!(in_grouping(g_v, 97, 252))) { 144 | break lab9; 145 | } 146 | // <-, line 42 147 | slice_from("U"); 148 | break lab8; 149 | } while (false); 150 | cursor = v_6; 151 | // (, line 43 152 | // literal, line 43 153 | if (!(eq_s(1, "y"))) { 154 | break lab7; 155 | } 156 | // ], line 43 157 | ket = cursor; 158 | if (!(in_grouping(g_v, 97, 252))) { 159 | break lab7; 160 | } 161 | // <-, line 43 162 | slice_from("Y"); 163 | } while (false); 164 | cursor = v_5; 165 | break golab6; 166 | } while (false); 167 | cursor = v_5; 168 | if (cursor >= limit) { 169 | break lab5; 170 | } 171 | cursor++; 172 | } 173 | continue replab4; 174 | } while (false); 175 | cursor = v_4; 176 | break replab4; 177 | } 178 | return true; 179 | } 180 | 181 | private boolean r_mark_regions() { 182 | int v_1; 183 | // (, line 47 184 | I_p1 = limit; 185 | I_p2 = limit; 186 | // test, line 52 187 | v_1 = cursor; 188 | // (, line 52 189 | // hop, line 52 190 | { 191 | int c = cursor + 3; 192 | if (0 > c || c > limit) { 193 | return false; 194 | } 195 | cursor = c; 196 | } 197 | // setmark x, line 52 198 | I_x = cursor; 199 | cursor = v_1; 200 | // gopast, line 54 201 | golab0: while (true) { 202 | lab1: do { 203 | if (!(in_grouping(g_v, 97, 252))) { 204 | break lab1; 205 | } 206 | break golab0; 207 | } while (false); 208 | if (cursor >= limit) { 209 | return false; 210 | } 211 | cursor++; 212 | } 213 | // gopast, line 54 214 | golab2: while (true) { 215 | lab3: do { 216 | if (!(out_grouping(g_v, 97, 252))) { 217 | break lab3; 218 | } 219 | break golab2; 220 | } while (false); 221 | if (cursor >= limit) { 222 | return false; 223 | } 224 | 
cursor++; 225 | } 226 | // setmark p1, line 54 227 | I_p1 = cursor; 228 | // try, line 55 229 | lab4: do { 230 | // (, line 55 231 | if (!(I_p1 < I_x)) { 232 | break lab4; 233 | } 234 | I_p1 = I_x; 235 | } while (false); 236 | // gopast, line 56 237 | golab5: while (true) { 238 | lab6: do { 239 | if (!(in_grouping(g_v, 97, 252))) { 240 | break lab6; 241 | } 242 | break golab5; 243 | } while (false); 244 | if (cursor >= limit) { 245 | return false; 246 | } 247 | cursor++; 248 | } 249 | // gopast, line 56 250 | golab7: while (true) { 251 | lab8: do { 252 | if (!(out_grouping(g_v, 97, 252))) { 253 | break lab8; 254 | } 255 | break golab7; 256 | } while (false); 257 | if (cursor >= limit) { 258 | return false; 259 | } 260 | cursor++; 261 | } 262 | // setmark p2, line 56 263 | I_p2 = cursor; 264 | return true; 265 | } 266 | 267 | private boolean r_postlude() { 268 | int among_var; 269 | int v_1; 270 | // repeat, line 60 271 | replab0: while (true) { 272 | v_1 = cursor; 273 | lab1: do { 274 | // (, line 60 275 | // [, line 62 276 | bra = cursor; 277 | // substring, line 62 278 | among_var = find_among(a_0, 6); 279 | if (among_var == 0) { 280 | break lab1; 281 | } 282 | // ], line 62 283 | ket = cursor; 284 | switch (among_var) { 285 | case 0: 286 | break lab1; 287 | case 1: 288 | // (, line 63 289 | // <-, line 63 290 | slice_from("y"); 291 | break; 292 | case 2: 293 | // (, line 64 294 | // <-, line 64 295 | slice_from("u"); 296 | break; 297 | case 3: 298 | // (, line 65 299 | // <-, line 65 300 | slice_from("a"); 301 | break; 302 | case 4: 303 | // (, line 66 304 | // <-, line 66 305 | slice_from("o"); 306 | break; 307 | case 5: 308 | // (, line 67 309 | // <-, line 67 310 | slice_from("u"); 311 | break; 312 | case 6: 313 | // (, line 68 314 | // next, line 68 315 | if (cursor >= limit) { 316 | break lab1; 317 | } 318 | cursor++; 319 | break; 320 | } 321 | continue replab0; 322 | } while (false); 323 | cursor = v_1; 324 | break replab0; 325 | } 326 | return true; 327 | 
} 328 | 329 | private boolean r_R1() { 330 | if (!(I_p1 <= cursor)) { 331 | return false; 332 | } 333 | return true; 334 | } 335 | 336 | private boolean r_R2() { 337 | if (!(I_p2 <= cursor)) { 338 | return false; 339 | } 340 | return true; 341 | } 342 | 343 | private boolean r_standard_suffix() { 344 | int among_var; 345 | int v_1; 346 | int v_2; 347 | int v_3; 348 | int v_4; 349 | int v_5; 350 | int v_6; 351 | int v_7; 352 | int v_8; 353 | int v_9; 354 | int v_10; 355 | // (, line 78 356 | // do, line 79 357 | v_1 = limit - cursor; 358 | lab0: do { 359 | // (, line 79 360 | // [, line 80 361 | ket = cursor; 362 | // substring, line 80 363 | among_var = find_among_b(a_1, 7); 364 | if (among_var == 0) { 365 | break lab0; 366 | } 367 | // ], line 80 368 | bra = cursor; 369 | // call R1, line 80 370 | if (!r_R1()) { 371 | break lab0; 372 | } 373 | switch (among_var) { 374 | case 0: 375 | break lab0; 376 | case 1: 377 | // (, line 82 378 | // delete, line 82 379 | slice_del(); 380 | break; 381 | case 2: 382 | // (, line 85 383 | // delete, line 85 384 | slice_del(); 385 | // try, line 86 386 | v_2 = limit - cursor; 387 | lab1: do { 388 | // (, line 86 389 | // [, line 86 390 | ket = cursor; 391 | // literal, line 86 392 | if (!(eq_s_b(1, "s"))) { 393 | cursor = limit - v_2; 394 | break lab1; 395 | } 396 | // ], line 86 397 | bra = cursor; 398 | // literal, line 86 399 | if (!(eq_s_b(3, "nis"))) { 400 | cursor = limit - v_2; 401 | break lab1; 402 | } 403 | // delete, line 86 404 | slice_del(); 405 | } while (false); 406 | break; 407 | case 3: 408 | // (, line 89 409 | if (!(in_grouping_b(g_s_ending, 98, 116))) { 410 | break lab0; 411 | } 412 | // delete, line 89 413 | slice_del(); 414 | break; 415 | } 416 | } while (false); 417 | cursor = limit - v_1; 418 | // do, line 93 419 | v_3 = limit - cursor; 420 | lab2: do { 421 | // (, line 93 422 | // [, line 94 423 | ket = cursor; 424 | // substring, line 94 425 | among_var = find_among_b(a_2, 4); 426 | if (among_var == 0) { 
427 | break lab2; 428 | } 429 | // ], line 94 430 | bra = cursor; 431 | // call R1, line 94 432 | if (!r_R1()) { 433 | break lab2; 434 | } 435 | switch (among_var) { 436 | case 0: 437 | break lab2; 438 | case 1: 439 | // (, line 96 440 | // delete, line 96 441 | slice_del(); 442 | break; 443 | case 2: 444 | // (, line 99 445 | if (!(in_grouping_b(g_st_ending, 98, 116))) { 446 | break lab2; 447 | } 448 | // hop, line 99 449 | { 450 | int c = cursor - 3; 451 | if (limit_backward > c || c > limit) { 452 | break lab2; 453 | } 454 | cursor = c; 455 | } 456 | // delete, line 99 457 | slice_del(); 458 | break; 459 | } 460 | } while (false); 461 | cursor = limit - v_3; 462 | // do, line 103 463 | v_4 = limit - cursor; 464 | lab3: do { 465 | // (, line 103 466 | // [, line 104 467 | ket = cursor; 468 | // substring, line 104 469 | among_var = find_among_b(a_4, 8); 470 | if (among_var == 0) { 471 | break lab3; 472 | } 473 | // ], line 104 474 | bra = cursor; 475 | // call R2, line 104 476 | if (!r_R2()) { 477 | break lab3; 478 | } 479 | switch (among_var) { 480 | case 0: 481 | break lab3; 482 | case 1: 483 | // (, line 106 484 | // delete, line 106 485 | slice_del(); 486 | // try, line 107 487 | v_5 = limit - cursor; 488 | lab4: do { 489 | // (, line 107 490 | // [, line 107 491 | ket = cursor; 492 | // literal, line 107 493 | if (!(eq_s_b(2, "ig"))) { 494 | cursor = limit - v_5; 495 | break lab4; 496 | } 497 | // ], line 107 498 | bra = cursor; 499 | // not, line 107 500 | { 501 | v_6 = limit - cursor; 502 | lab5: do { 503 | // literal, line 107 504 | if (!(eq_s_b(1, "e"))) { 505 | break lab5; 506 | } 507 | cursor = limit - v_5; 508 | break lab4; 509 | } while (false); 510 | cursor = limit - v_6; 511 | } 512 | // call R2, line 107 513 | if (!r_R2()) { 514 | cursor = limit - v_5; 515 | break lab4; 516 | } 517 | // delete, line 107 518 | slice_del(); 519 | } while (false); 520 | break; 521 | case 2: 522 | // (, line 110 523 | // not, line 110 524 | { 525 | v_7 = limit - 
cursor; 526 | lab6: do { 527 | // literal, line 110 528 | if (!(eq_s_b(1, "e"))) { 529 | break lab6; 530 | } 531 | break lab3; 532 | } while (false); 533 | cursor = limit - v_7; 534 | } 535 | // delete, line 110 536 | slice_del(); 537 | break; 538 | case 3: 539 | // (, line 113 540 | // delete, line 113 541 | slice_del(); 542 | // try, line 114 543 | v_8 = limit - cursor; 544 | lab7: do { 545 | // (, line 114 546 | // [, line 115 547 | ket = cursor; 548 | // or, line 115 549 | lab8: do { 550 | v_9 = limit - cursor; 551 | lab9: do { 552 | // literal, line 115 553 | if (!(eq_s_b(2, "er"))) { 554 | break lab9; 555 | } 556 | break lab8; 557 | } while (false); 558 | cursor = limit - v_9; 559 | // literal, line 115 560 | if (!(eq_s_b(2, "en"))) { 561 | cursor = limit - v_8; 562 | break lab7; 563 | } 564 | } while (false); 565 | // ], line 115 566 | bra = cursor; 567 | // call R1, line 115 568 | if (!r_R1()) { 569 | cursor = limit - v_8; 570 | break lab7; 571 | } 572 | // delete, line 115 573 | slice_del(); 574 | } while (false); 575 | break; 576 | case 4: 577 | // (, line 119 578 | // delete, line 119 579 | slice_del(); 580 | // try, line 120 581 | v_10 = limit - cursor; 582 | lab10: do { 583 | // (, line 120 584 | // [, line 121 585 | ket = cursor; 586 | // substring, line 121 587 | among_var = find_among_b(a_3, 2); 588 | if (among_var == 0) { 589 | cursor = limit - v_10; 590 | break lab10; 591 | } 592 | // ], line 121 593 | bra = cursor; 594 | // call R2, line 121 595 | if (!r_R2()) { 596 | cursor = limit - v_10; 597 | break lab10; 598 | } 599 | switch (among_var) { 600 | case 0: 601 | cursor = limit - v_10; 602 | break lab10; 603 | case 1: 604 | // (, line 123 605 | // delete, line 123 606 | slice_del(); 607 | break; 608 | } 609 | } while (false); 610 | break; 611 | } 612 | } while (false); 613 | cursor = limit - v_4; 614 | return true; 615 | } 616 | 617 | public boolean stem() { 618 | int v_1; 619 | int v_2; 620 | int v_3; 621 | int v_4; 622 | // (, line 133 623 | // 
do, line 134 624 | v_1 = cursor; 625 | lab0: do { 626 | // call prelude, line 134 627 | if (!r_prelude()) { 628 | break lab0; 629 | } 630 | } while (false); 631 | cursor = v_1; 632 | // do, line 135 633 | v_2 = cursor; 634 | lab1: do { 635 | // call mark_regions, line 135 636 | if (!r_mark_regions()) { 637 | break lab1; 638 | } 639 | } while (false); 640 | cursor = v_2; 641 | // backwards, line 136 642 | limit_backward = cursor; 643 | cursor = limit; 644 | // do, line 137 645 | v_3 = limit - cursor; 646 | lab2: do { 647 | // call standard_suffix, line 137 648 | if (!r_standard_suffix()) { 649 | break lab2; 650 | } 651 | } while (false); 652 | cursor = limit - v_3; 653 | cursor = limit_backward; // do, line 138 654 | v_4 = cursor; 655 | lab3: do { 656 | // call postlude, line 138 657 | if (!r_postlude()) { 658 | break lab3; 659 | } 660 | } while (false); 661 | cursor = v_4; 662 | return true; 663 | } 664 | 665 | public boolean equals(Object o) { 666 | return o instanceof germanStemmer; 667 | } 668 | 669 | public int hashCode() { 670 | return germanStemmer.class.getName().hashCode(); 671 | } 672 | 673 | } 674 | -------------------------------------------------------------------------------- /src/main/java/org/tartarus/snowball/ext/dutchStemmer.java: -------------------------------------------------------------------------------- 1 | // This file was generated automatically by the Snowball to Java compiler 2 | 3 | package org.tartarus.snowball.ext; 4 | 5 | import org.tartarus.snowball.Among; 6 | 7 | /** 8 | * This class was automatically generated by a Snowball to Java compiler It 9 | * implements the stemming algorithm defined by a snowball script. 
10 | */ 11 | 12 | public class dutchStemmer extends org.tartarus.snowball.SnowballStemmer { 13 | 14 | private static final long serialVersionUID = 1L; 15 | 16 | private final static dutchStemmer methodObject = new dutchStemmer(); 17 | 18 | private final static Among a_0[] = { 19 | new Among("", -1, 6, "", methodObject), 20 | new Among("\u00E1", 0, 1, "", methodObject), 21 | new Among("\u00E4", 0, 1, "", methodObject), 22 | new Among("\u00E9", 0, 2, "", methodObject), 23 | new Among("\u00EB", 0, 2, "", methodObject), 24 | new Among("\u00ED", 0, 3, "", methodObject), 25 | new Among("\u00EF", 0, 3, "", methodObject), 26 | new Among("\u00F3", 0, 4, "", methodObject), 27 | new Among("\u00F6", 0, 4, "", methodObject), 28 | new Among("\u00FA", 0, 5, "", methodObject), 29 | new Among("\u00FC", 0, 5, "", methodObject) }; 30 | 31 | private final static Among a_1[] = { 32 | new Among("", -1, 3, "", methodObject), 33 | new Among("I", 0, 2, "", methodObject), 34 | new Among("Y", 0, 1, "", methodObject) }; 35 | 36 | private final static Among a_2[] = { 37 | new Among("dd", -1, -1, "", methodObject), 38 | new Among("kk", -1, -1, "", methodObject), 39 | new Among("tt", -1, -1, "", methodObject) }; 40 | 41 | private final static Among a_3[] = { 42 | new Among("ene", -1, 2, "", methodObject), 43 | new Among("se", -1, 3, "", methodObject), 44 | new Among("en", -1, 2, "", methodObject), 45 | new Among("heden", 2, 1, "", methodObject), 46 | new Among("s", -1, 3, "", methodObject) }; 47 | 48 | private final static Among a_4[] = { 49 | new Among("end", -1, 1, "", methodObject), 50 | new Among("ig", -1, 2, "", methodObject), 51 | new Among("ing", -1, 1, "", methodObject), 52 | new Among("lijk", -1, 3, "", methodObject), 53 | new Among("baar", -1, 4, "", methodObject), 54 | new Among("bar", -1, 5, "", methodObject) }; 55 | 56 | private final static Among a_5[] = { 57 | new Among("aa", -1, -1, "", methodObject), 58 | new Among("ee", -1, -1, "", methodObject), 59 | new Among("oo", -1, -1, 
"", methodObject), 60 | new Among("uu", -1, -1, "", methodObject) }; 61 | 62 | private static final char g_v[] = { 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 63 | 0, 0, 0, 0, 128 }; 64 | 65 | private static final char g_v_I[] = { 1, 0, 0, 17, 65, 16, 1, 0, 0, 0, 0, 66 | 0, 0, 0, 0, 0, 0, 0, 0, 128 }; 67 | 68 | private static final char g_v_j[] = { 17, 67, 16, 1, 0, 0, 0, 0, 0, 0, 0, 69 | 0, 0, 0, 0, 0, 128 }; 70 | 71 | private int I_p2; 72 | private int I_p1; 73 | private boolean B_e_found; 74 | 75 | private void copy_from(dutchStemmer other) { 76 | I_p2 = other.I_p2; 77 | I_p1 = other.I_p1; 78 | B_e_found = other.B_e_found; 79 | super.copy_from(other); 80 | } 81 | 82 | private boolean r_prelude() { 83 | int among_var; 84 | int v_1; 85 | int v_2; 86 | int v_3; 87 | int v_4; 88 | int v_5; 89 | int v_6; 90 | // (, line 41 91 | // test, line 42 92 | v_1 = cursor; 93 | // repeat, line 42 94 | replab0: while (true) { 95 | v_2 = cursor; 96 | lab1: do { 97 | // (, line 42 98 | // [, line 43 99 | bra = cursor; 100 | // substring, line 43 101 | among_var = find_among(a_0, 11); 102 | if (among_var == 0) { 103 | break lab1; 104 | } 105 | // ], line 43 106 | ket = cursor; 107 | switch (among_var) { 108 | case 0: 109 | break lab1; 110 | case 1: 111 | // (, line 45 112 | // <-, line 45 113 | slice_from("a"); 114 | break; 115 | case 2: 116 | // (, line 47 117 | // <-, line 47 118 | slice_from("e"); 119 | break; 120 | case 3: 121 | // (, line 49 122 | // <-, line 49 123 | slice_from("i"); 124 | break; 125 | case 4: 126 | // (, line 51 127 | // <-, line 51 128 | slice_from("o"); 129 | break; 130 | case 5: 131 | // (, line 53 132 | // <-, line 53 133 | slice_from("u"); 134 | break; 135 | case 6: 136 | // (, line 54 137 | // next, line 54 138 | if (cursor >= limit) { 139 | break lab1; 140 | } 141 | cursor++; 142 | break; 143 | } 144 | continue replab0; 145 | } while (false); 146 | cursor = v_2; 147 | break replab0; 148 | } 149 | cursor = v_1; 150 | // try, line 57 151 | v_3 = cursor; 152 
| lab2: do { 153 | // (, line 57 154 | // [, line 57 155 | bra = cursor; 156 | // literal, line 57 157 | if (!(eq_s(1, "y"))) { 158 | cursor = v_3; 159 | break lab2; 160 | } 161 | // ], line 57 162 | ket = cursor; 163 | // <-, line 57 164 | slice_from("Y"); 165 | } while (false); 166 | // repeat, line 58 167 | replab3: while (true) { 168 | v_4 = cursor; 169 | lab4: do { 170 | // goto, line 58 171 | golab5: while (true) { 172 | v_5 = cursor; 173 | lab6: do { 174 | // (, line 58 175 | if (!(in_grouping(g_v, 97, 232))) { 176 | break lab6; 177 | } 178 | // [, line 59 179 | bra = cursor; 180 | // or, line 59 181 | lab7: do { 182 | v_6 = cursor; 183 | lab8: do { 184 | // (, line 59 185 | // literal, line 59 186 | if (!(eq_s(1, "i"))) { 187 | break lab8; 188 | } 189 | // ], line 59 190 | ket = cursor; 191 | if (!(in_grouping(g_v, 97, 232))) { 192 | break lab8; 193 | } 194 | // <-, line 59 195 | slice_from("I"); 196 | break lab7; 197 | } while (false); 198 | cursor = v_6; 199 | // (, line 60 200 | // literal, line 60 201 | if (!(eq_s(1, "y"))) { 202 | break lab6; 203 | } 204 | // ], line 60 205 | ket = cursor; 206 | // <-, line 60 207 | slice_from("Y"); 208 | } while (false); 209 | cursor = v_5; 210 | break golab5; 211 | } while (false); 212 | cursor = v_5; 213 | if (cursor >= limit) { 214 | break lab4; 215 | } 216 | cursor++; 217 | } 218 | continue replab3; 219 | } while (false); 220 | cursor = v_4; 221 | break replab3; 222 | } 223 | return true; 224 | } 225 | 226 | private boolean r_mark_regions() { 227 | // (, line 64 228 | I_p1 = limit; 229 | I_p2 = limit; 230 | // gopast, line 69 231 | golab0: while (true) { 232 | lab1: do { 233 | if (!(in_grouping(g_v, 97, 232))) { 234 | break lab1; 235 | } 236 | break golab0; 237 | } while (false); 238 | if (cursor >= limit) { 239 | return false; 240 | } 241 | cursor++; 242 | } 243 | // gopast, line 69 244 | golab2: while (true) { 245 | lab3: do { 246 | if (!(out_grouping(g_v, 97, 232))) { 247 | break lab3; 248 | } 249 | break 
golab2; 250 | } while (false); 251 | if (cursor >= limit) { 252 | return false; 253 | } 254 | cursor++; 255 | } 256 | // setmark p1, line 69 257 | I_p1 = cursor; 258 | // try, line 70 259 | lab4: do { 260 | // (, line 70 261 | if (!(I_p1 < 3)) { 262 | break lab4; 263 | } 264 | I_p1 = 3; 265 | } while (false); 266 | // gopast, line 71 267 | golab5: while (true) { 268 | lab6: do { 269 | if (!(in_grouping(g_v, 97, 232))) { 270 | break lab6; 271 | } 272 | break golab5; 273 | } while (false); 274 | if (cursor >= limit) { 275 | return false; 276 | } 277 | cursor++; 278 | } 279 | // gopast, line 71 280 | golab7: while (true) { 281 | lab8: do { 282 | if (!(out_grouping(g_v, 97, 232))) { 283 | break lab8; 284 | } 285 | break golab7; 286 | } while (false); 287 | if (cursor >= limit) { 288 | return false; 289 | } 290 | cursor++; 291 | } 292 | // setmark p2, line 71 293 | I_p2 = cursor; 294 | return true; 295 | } 296 | 297 | private boolean r_postlude() { 298 | int among_var; 299 | int v_1; 300 | // repeat, line 75 301 | replab0: while (true) { 302 | v_1 = cursor; 303 | lab1: do { 304 | // (, line 75 305 | // [, line 77 306 | bra = cursor; 307 | // substring, line 77 308 | among_var = find_among(a_1, 3); 309 | if (among_var == 0) { 310 | break lab1; 311 | } 312 | // ], line 77 313 | ket = cursor; 314 | switch (among_var) { 315 | case 0: 316 | break lab1; 317 | case 1: 318 | // (, line 78 319 | // <-, line 78 320 | slice_from("y"); 321 | break; 322 | case 2: 323 | // (, line 79 324 | // <-, line 79 325 | slice_from("i"); 326 | break; 327 | case 3: 328 | // (, line 80 329 | // next, line 80 330 | if (cursor >= limit) { 331 | break lab1; 332 | } 333 | cursor++; 334 | break; 335 | } 336 | continue replab0; 337 | } while (false); 338 | cursor = v_1; 339 | break replab0; 340 | } 341 | return true; 342 | } 343 | 344 | private boolean r_R1() { 345 | if (!(I_p1 <= cursor)) { 346 | return false; 347 | } 348 | return true; 349 | } 350 | 351 | private boolean r_R2() { 352 | if (!(I_p2 <= 
cursor)) { 353 | return false; 354 | } 355 | return true; 356 | } 357 | 358 | private boolean r_undouble() { 359 | int v_1; 360 | // (, line 90 361 | // test, line 91 362 | v_1 = limit - cursor; 363 | // among, line 91 364 | if (find_among_b(a_2, 3) == 0) { 365 | return false; 366 | } 367 | cursor = limit - v_1; 368 | // [, line 91 369 | ket = cursor; 370 | // next, line 91 371 | if (cursor <= limit_backward) { 372 | return false; 373 | } 374 | cursor--; 375 | // ], line 91 376 | bra = cursor; 377 | // delete, line 91 378 | slice_del(); 379 | return true; 380 | } 381 | 382 | private boolean r_e_ending() { 383 | int v_1; 384 | // (, line 94 385 | // unset e_found, line 95 386 | B_e_found = false; 387 | // [, line 96 388 | ket = cursor; 389 | // literal, line 96 390 | if (!(eq_s_b(1, "e"))) { 391 | return false; 392 | } 393 | // ], line 96 394 | bra = cursor; 395 | // call R1, line 96 396 | if (!r_R1()) { 397 | return false; 398 | } 399 | // test, line 96 400 | v_1 = limit - cursor; 401 | if (!(out_grouping_b(g_v, 97, 232))) { 402 | return false; 403 | } 404 | cursor = limit - v_1; 405 | // delete, line 96 406 | slice_del(); 407 | // set e_found, line 97 408 | B_e_found = true; 409 | // call undouble, line 98 410 | if (!r_undouble()) { 411 | return false; 412 | } 413 | return true; 414 | } 415 | 416 | private boolean r_en_ending() { 417 | int v_1; 418 | int v_2; 419 | // (, line 101 420 | // call R1, line 102 421 | if (!r_R1()) { 422 | return false; 423 | } 424 | // and, line 102 425 | v_1 = limit - cursor; 426 | if (!(out_grouping_b(g_v, 97, 232))) { 427 | return false; 428 | } 429 | cursor = limit - v_1; 430 | // not, line 102 431 | { 432 | v_2 = limit - cursor; 433 | lab0: do { 434 | // literal, line 102 435 | if (!(eq_s_b(3, "gem"))) { 436 | break lab0; 437 | } 438 | return false; 439 | } while (false); 440 | cursor = limit - v_2; 441 | } 442 | // delete, line 102 443 | slice_del(); 444 | // call undouble, line 103 445 | if (!r_undouble()) { 446 | return false; 
447 | } 448 | return true; 449 | } 450 | 451 | private boolean r_standard_suffix() { 452 | int among_var; 453 | int v_1; 454 | int v_2; 455 | int v_3; 456 | int v_4; 457 | int v_5; 458 | int v_6; 459 | int v_7; 460 | int v_8; 461 | int v_9; 462 | int v_10; 463 | // (, line 106 464 | // do, line 107 465 | v_1 = limit - cursor; 466 | lab0: do { 467 | // (, line 107 468 | // [, line 108 469 | ket = cursor; 470 | // substring, line 108 471 | among_var = find_among_b(a_3, 5); 472 | if (among_var == 0) { 473 | break lab0; 474 | } 475 | // ], line 108 476 | bra = cursor; 477 | switch (among_var) { 478 | case 0: 479 | break lab0; 480 | case 1: 481 | // (, line 110 482 | // call R1, line 110 483 | if (!r_R1()) { 484 | break lab0; 485 | } 486 | // <-, line 110 487 | slice_from("heid"); 488 | break; 489 | case 2: 490 | // (, line 113 491 | // call en_ending, line 113 492 | if (!r_en_ending()) { 493 | break lab0; 494 | } 495 | break; 496 | case 3: 497 | // (, line 116 498 | // call R1, line 116 499 | if (!r_R1()) { 500 | break lab0; 501 | } 502 | if (!(out_grouping_b(g_v_j, 97, 232))) { 503 | break lab0; 504 | } 505 | // delete, line 116 506 | slice_del(); 507 | break; 508 | } 509 | } while (false); 510 | cursor = limit - v_1; 511 | // do, line 120 512 | v_2 = limit - cursor; 513 | lab1: do { 514 | // call e_ending, line 120 515 | if (!r_e_ending()) { 516 | break lab1; 517 | } 518 | } while (false); 519 | cursor = limit - v_2; 520 | // do, line 122 521 | v_3 = limit - cursor; 522 | lab2: do { 523 | // (, line 122 524 | // [, line 122 525 | ket = cursor; 526 | // literal, line 122 527 | if (!(eq_s_b(4, "heid"))) { 528 | break lab2; 529 | } 530 | // ], line 122 531 | bra = cursor; 532 | // call R2, line 122 533 | if (!r_R2()) { 534 | break lab2; 535 | } 536 | // not, line 122 537 | { 538 | v_4 = limit - cursor; 539 | lab3: do { 540 | // literal, line 122 541 | if (!(eq_s_b(1, "c"))) { 542 | break lab3; 543 | } 544 | break lab2; 545 | } while (false); 546 | cursor = limit - v_4; 
547 | } 548 | // delete, line 122 549 | slice_del(); 550 | // [, line 123 551 | ket = cursor; 552 | // literal, line 123 553 | if (!(eq_s_b(2, "en"))) { 554 | break lab2; 555 | } 556 | // ], line 123 557 | bra = cursor; 558 | // call en_ending, line 123 559 | if (!r_en_ending()) { 560 | break lab2; 561 | } 562 | } while (false); 563 | cursor = limit - v_3; 564 | // do, line 126 565 | v_5 = limit - cursor; 566 | lab4: do { 567 | // (, line 126 568 | // [, line 127 569 | ket = cursor; 570 | // substring, line 127 571 | among_var = find_among_b(a_4, 6); 572 | if (among_var == 0) { 573 | break lab4; 574 | } 575 | // ], line 127 576 | bra = cursor; 577 | switch (among_var) { 578 | case 0: 579 | break lab4; 580 | case 1: 581 | // (, line 129 582 | // call R2, line 129 583 | if (!r_R2()) { 584 | break lab4; 585 | } 586 | // delete, line 129 587 | slice_del(); 588 | // or, line 130 589 | lab5: do { 590 | v_6 = limit - cursor; 591 | lab6: do { 592 | // (, line 130 593 | // [, line 130 594 | ket = cursor; 595 | // literal, line 130 596 | if (!(eq_s_b(2, "ig"))) { 597 | break lab6; 598 | } 599 | // ], line 130 600 | bra = cursor; 601 | // call R2, line 130 602 | if (!r_R2()) { 603 | break lab6; 604 | } 605 | // not, line 130 606 | { 607 | v_7 = limit - cursor; 608 | lab7: do { 609 | // literal, line 130 610 | if (!(eq_s_b(1, "e"))) { 611 | break lab7; 612 | } 613 | break lab6; 614 | } while (false); 615 | cursor = limit - v_7; 616 | } 617 | // delete, line 130 618 | slice_del(); 619 | break lab5; 620 | } while (false); 621 | cursor = limit - v_6; 622 | // call undouble, line 130 623 | if (!r_undouble()) { 624 | break lab4; 625 | } 626 | } while (false); 627 | break; 628 | case 2: 629 | // (, line 133 630 | // call R2, line 133 631 | if (!r_R2()) { 632 | break lab4; 633 | } 634 | // not, line 133 635 | { 636 | v_8 = limit - cursor; 637 | lab8: do { 638 | // literal, line 133 639 | if (!(eq_s_b(1, "e"))) { 640 | break lab8; 641 | } 642 | break lab4; 643 | } while (false); 644 | 
cursor = limit - v_8; 645 | } 646 | // delete, line 133 647 | slice_del(); 648 | break; 649 | case 3: 650 | // (, line 136 651 | // call R2, line 136 652 | if (!r_R2()) { 653 | break lab4; 654 | } 655 | // delete, line 136 656 | slice_del(); 657 | // call e_ending, line 136 658 | if (!r_e_ending()) { 659 | break lab4; 660 | } 661 | break; 662 | case 4: 663 | // (, line 139 664 | // call R2, line 139 665 | if (!r_R2()) { 666 | break lab4; 667 | } 668 | // delete, line 139 669 | slice_del(); 670 | break; 671 | case 5: 672 | // (, line 142 673 | // call R2, line 142 674 | if (!r_R2()) { 675 | break lab4; 676 | } 677 | // Boolean test e_found, line 142 678 | if (!(B_e_found)) { 679 | break lab4; 680 | } 681 | // delete, line 142 682 | slice_del(); 683 | break; 684 | } 685 | } while (false); 686 | cursor = limit - v_5; 687 | // do, line 146 688 | v_9 = limit - cursor; 689 | lab9: do { 690 | // (, line 146 691 | if (!(out_grouping_b(g_v_I, 73, 232))) { 692 | break lab9; 693 | } 694 | // test, line 148 695 | v_10 = limit - cursor; 696 | // (, line 148 697 | // among, line 149 698 | if (find_among_b(a_5, 4) == 0) { 699 | break lab9; 700 | } 701 | if (!(out_grouping_b(g_v, 97, 232))) { 702 | break lab9; 703 | } 704 | cursor = limit - v_10; 705 | // [, line 152 706 | ket = cursor; 707 | // next, line 152 708 | if (cursor <= limit_backward) { 709 | break lab9; 710 | } 711 | cursor--; 712 | // ], line 152 713 | bra = cursor; 714 | // delete, line 152 715 | slice_del(); 716 | } while (false); 717 | cursor = limit - v_9; 718 | return true; 719 | } 720 | 721 | public boolean stem() { 722 | int v_1; 723 | int v_2; 724 | int v_3; 725 | int v_4; 726 | // (, line 157 727 | // do, line 159 728 | v_1 = cursor; 729 | lab0: do { 730 | // call prelude, line 159 731 | if (!r_prelude()) { 732 | break lab0; 733 | } 734 | } while (false); 735 | cursor = v_1; 736 | // do, line 160 737 | v_2 = cursor; 738 | lab1: do { 739 | // call mark_regions, line 160 740 | if (!r_mark_regions()) { 741 | 
break lab1; 742 | } 743 | } while (false); 744 | cursor = v_2; 745 | // backwards, line 161 746 | limit_backward = cursor; 747 | cursor = limit; 748 | // do, line 162 749 | v_3 = limit - cursor; 750 | lab2: do { 751 | // call standard_suffix, line 162 752 | if (!r_standard_suffix()) { 753 | break lab2; 754 | } 755 | } while (false); 756 | cursor = limit - v_3; 757 | cursor = limit_backward; // do, line 163 758 | v_4 = cursor; 759 | lab3: do { 760 | // call postlude, line 163 761 | if (!r_postlude()) { 762 | break lab3; 763 | } 764 | } while (false); 765 | cursor = v_4; 766 | return true; 767 | } 768 | 769 | public boolean equals(Object o) { 770 | return o instanceof dutchStemmer; 771 | } 772 | 773 | public int hashCode() { 774 | return dutchStemmer.class.getName().hashCode(); 775 | } 776 | 777 | } 778 | --------------------------------------------------------------------------------