├── project ├── build.properties └── plugins.sbt ├── corpus ├── corpusEN.bin ├── corpusES.bin ├── corpusEN2.bin └── README.md ├── src ├── test │ └── scala │ │ └── com │ │ └── textteaser │ │ └── summarizer │ │ ├── SummarizerSuite.scala │ │ ├── SummarySuite.scala │ │ └── ParserSuite.scala └── main │ └── scala │ └── com │ └── textteaser │ └── summarizer │ ├── Article.scala │ ├── Config.scala │ ├── Summary.scala │ ├── models │ ├── Keyword.scala │ └── Summary.scala │ ├── KeywordService.scala │ ├── MongoKeywordService.scala │ ├── SimpleREPL.scala │ ├── GuiceModule.scala │ ├── Parser.scala │ ├── Main.scala │ ├── StopWords.scala │ └── Summarizer.scala ├── .gitignore ├── README.md └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /corpus/corpusEN.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusEN.bin -------------------------------------------------------------------------------- /corpus/corpusES.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusES.bin -------------------------------------------------------------------------------- /corpus/corpusEN2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusEN2.bin -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/SummarizerSuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | class SummarizerSuite { 4 | 5 | } 6 | 
-------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Article.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | case class Article(id: String, 4 | title: String, 5 | article: String, 6 | url: String = "", 7 | blog: String = "", 8 | category: String = "") -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 2 | 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 4 | 5 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 6 | 7 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.class 3 | *.log 4 | 5 | # sbt specific 6 | dist/* 7 | target/ 8 | lib_managed/ 9 | src_managed/ 10 | project/boot/ 11 | project/plugins/project/ 12 | project/plugins/project/ 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | .settings 17 | .cache 18 | .classpath 19 | .project 20 | 21 | # Idea-specific 22 | .idea 23 | .idea_modules 24 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Config.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | class Config { 4 | 5 | def lang = "EN" 6 | 7 | object words { 8 | def ideal = 20 9 | } 10 | 11 | object db { 12 | def host = "localhost" 13 | def port = 27017 14 | def name = "tt_db" 15 | def username = "" 16 | def password = "" 17 | } 18 | } 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TextTeaser 2 | ========== 3 | 4 | TextTeaser is an automatic summarization algorithm that combines the power of natural language processing and machine learning to produce good results. 5 | 6 | [TextTeaser is ported in Python](https://github.com/IndigoResearch/textteaser) 7 | 8 | ### Requirements 9 | 10 | [SBT](http://www.scala-sbt.org/) is needed for TextTeaser to run. 11 | Use of [Scala IDE](http://scala-ide.org/) is recommended 12 | 13 | ### Setup 14 | 15 | ```bash 16 | $ git clone https://github.com/MojoJolo/textteaser.git 17 | $ sbt compile 18 | $ sbt eclipse // If using Eclipse. 19 | $ sbt run 20 | ``` 21 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Summary.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import scala.collection.immutable.IndexedSeq 4 | 5 | case class Summary(results: IndexedSeq[Sentence]) extends Traversable[String] { 6 | lazy val charCount = if (results.isEmpty) 0 else results.map(_.sentence.size).sum 7 | 8 | def foreach[U](f: String => U) { 9 | results.foreach( s => f(s.sentence) ) 10 | } 11 | 12 | def takeChars(limitCharCount: Int): Summary = { 13 | var count = 0 14 | 15 | val newSentences = results.takeWhile { sentence => 16 | count += sentence.sentence.size 17 | limitCharCount >= count 18 | } 19 | Summary(newSentences) 20 | } 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/models/Keyword.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer.models 2 | 3 | import net.liftweb.mongodb.record.field._ 4 | import net.liftweb.record.field._ 5 | import 
net.liftweb.mongodb.record._ 6 | 7 | class Keyword extends MongoRecord[Keyword] with ObjectIdPk[Keyword] { 8 | def meta = Keyword 9 | 10 | object word extends StringField(this, "") 11 | object score extends LongField(this, 0) 12 | object date extends DateField(this) 13 | object summaryId extends StringField(this, 10) 14 | object blog extends StringField(this, "Undefined") 15 | object category extends StringField(this, "Undefined") 16 | } 17 | 18 | object Keyword extends Keyword with MongoMetaRecord[Keyword] -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/models/Summary.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer.models 2 | 3 | import net.liftweb.mongodb.record.field._ 4 | import net.liftweb.record.field._ 5 | import net.liftweb.mongodb.record._ 6 | 7 | class Summary extends MongoRecord[Summary] with ObjectIdPk[Summary] { 8 | def meta = Summary 9 | 10 | object summaryId extends StringField(this, 10) 11 | object title extends StringField(this, "") 12 | object summary extends StringField(this, "") 13 | object status extends StringField(this, "Pending") 14 | object url extends StringField(this, "") 15 | object blog extends StringField(this, "Undefined") 16 | object category extends StringField(this, "Undefined") 17 | } 18 | 19 | object Summary extends Summary with MongoMetaRecord[Summary] -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/KeywordService.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | trait KeywordService { 4 | def getBlogCount(blog: String): Long 5 | def getCategoryCount(cat: String): Long 6 | def getBlogScore(word: String, blog: String): Long 7 | def getCategoryScore(word: String, cat: String): Long 8 | def add(word: String, count: Long, 
summaryId: String, blog: String, cat: String): Unit 9 | } 10 | 11 | class DummyKeywordService extends KeywordService { 12 | def getBlogCount(blog: String): Long = 1 13 | def getCategoryCount(cat: String): Long = 1 14 | def getBlogScore(word: String, blog: String): Long = 1 15 | def getCategoryScore(word: String, cat: String): Long = 1 16 | def add(word: String, count: Long, summaryId: String, blog: String, cat: String) {} 17 | } -------------------------------------------------------------------------------- /corpus/README.md: -------------------------------------------------------------------------------- 1 | Training new models 2 | =================== 3 | 4 | This folder contains Maximum Entropy models for sentence splitting, as needed by OpenNLP 1.5. 5 | 6 | See http://opennlp.apache.org/documentation/1.5.3/manual/opennlp.html#tools.sentdetect.training 7 | 8 | For more models, see: http://opennlp.sourceforge.net/models-1.5/ 9 | 10 | Migrating from NLTK 11 | ------------------- 12 | 13 |
14 | $ pip install nltk
15 | $ python
16 | >>> import nltk
17 | >>> nltk.download()
18 | 
19 | 20 | _install punkt_ 21 | 22 |
23 | >>> import nltk.data
24 | >>> sent_detector = nltk.data.load('tokenizers/punkt/spanish.pickle')
25 | >>> import codecs
26 | >>> text=codecs.open("/path/to/corpus-utf8.txt","r","utf-8").read()
27 | >>> sents=sent_detector.tokenize(text)
28 | >>> w=codecs.open("train.txt","w","utf-8")
29 | >>> c = 0
29 | >>> for s in sents:
30 | ...   w.write(s + '\n')
31 | ...   c += 1
32 | ...   if c == 10:
33 | ...     w.write('\n')
34 | ...     c = 0
35 | >>> w.close()
36 | >>> 
37 | $ opennlp SentenceDetectorTrainer -model corpusES.bin -lang es -data train.txt -encoding UTF-8
38 | 
39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Jolo Balbin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/MongoKeywordService.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import com.google.inject.Inject 4 | import com.textteaser.summarizer.models.Keyword 5 | import com.foursquare.rogue.LiftRogue._ 6 | import org.joda.time.DateTime 7 | 8 | class MongoKeywordService extends KeywordService { 9 | val controller = "keywords" 10 | 11 | def getBlogCount(blog: String): Long = Keyword.where(_.blog eqs blog).count 12 | 13 | def getCategoryCount(cat: String): Long = Keyword.where(_.category eqs cat).count 14 | 15 | def getBlogScore(word: String, blog: String): Long = Keyword.where(_.word eqs word).and(_.blog eqs blog) 16 | .fetch.map(_.score._1) 17 | .reduceLeftOption(_ + _).getOrElse(0) 18 | 19 | def getCategoryScore(word: String, cat: String): Long = Keyword.where(_.word eqs word).and(_.category eqs cat) 20 | .fetch.map(_.score._1) 21 | .reduceLeftOption(_ + _).getOrElse(0) 22 | 23 | def add(word: String, count: Long, summaryId: String, blog: String, cat: String) = Keyword.createRecord 24 | .word(word) 25 | .score(count) 26 | .summaryId(summaryId) 27 | .blog(blog) 28 | .category(cat) 29 | .date(new DateTime().toDate) 30 | .save 31 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/SimpleREPL.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import com.google.inject.Guice 5 | import com.textteaser.summarizer.models.Keyword 6 | import org.slf4j._ 7 | import scala.io.StdIn 8 | 9 | object SimpleREPL extends App { 10 | 11 | val config = new Config { override def lang = "ES" } 12 | val guice = new ScalaInjector(Guice.createInjector(new 
GuiceModule(config, true))) 13 | 14 | val summarizer = guice.instance[Summarizer] 15 | val log = guice.instance[Logger] 16 | 17 | while(true) { 18 | println("Ready for summarizing:") 19 | println("Provide the article title:") 20 | val title = StdIn.readLine() 21 | println("Provide the article text (with now newlines \\n):") 22 | val text = StdIn.readLine().replaceAll("\\\\n", "\n") 23 | println(text) 24 | val article = Article("not_important", title, text) 25 | val summary = summarizer.summarize(article.article, article.title, article.id, article.blog, article.category) 26 | 27 | println("---- Summary ----") 28 | summary.foreach(println) 29 | println("-----------------") 30 | 31 | log.info("Summarization completed.") 32 | } 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/GuiceModule.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import net.codingwell.scalaguice.ScalaModule 5 | import com.google.inject._ 6 | import javax.inject.Named 7 | import opennlp.tools.sentdetect._ 8 | import java.io.FileInputStream 9 | import com.mongodb._ 10 | import org.slf4j.LoggerFactory 11 | 12 | class GuiceModule(config: Config, dummyKeywordService: Boolean = false) extends AbstractModule with ScalaModule { 13 | def configure { 14 | bind[Config].toInstance(config) 15 | bind[Parser].in[Singleton] 16 | bind[Summarizer].in[Singleton] 17 | if (dummyKeywordService) 18 | bind[KeywordService].to[DummyKeywordService].in[Singleton] 19 | else 20 | bind[KeywordService].to[MongoKeywordService].in[Singleton] 21 | } 22 | 23 | @Provides 24 | def mongo = { 25 | val server = new ServerAddress(config.db.host, config.db.port) 26 | new Mongo(server) 27 | } 28 | 29 | @Provides 30 | @Singleton 31 | def sentenceDetector(@Named("lang")lang: String) = { 32 | val model = new 
SentenceModel(new FileInputStream("corpus/corpus" + lang + ".bin")) 33 | new SentenceDetectorME(model) 34 | } 35 | 36 | @Provides 37 | @Singleton 38 | @Named("lang") 39 | def lang(config: Config): String = config.lang 40 | 41 | @Provides 42 | @Singleton 43 | def log = LoggerFactory.getLogger("") 44 | 45 | @Provides 46 | @Singleton 47 | def stopWords(@Named("lang")lang: String) = StopWords.forLang(lang) 48 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Parser.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import opennlp.tools.sentdetect._ 4 | import com.google.inject.Inject 5 | import com.google.common.base.{CharMatcher, Splitter} 6 | import scala.collection.JavaConverters 7 | 8 | class Parser @Inject() (sentenceDetector: SentenceDetectorME, stopWordList: StopWords, config: Config) { 9 | 10 | val ideal = config.words.ideal 11 | lazy val stopWords = stopWordList.stopWords 12 | 13 | /* 14 | * Sentence Length: Computed using this formula 15 | * (ideal - Math.abs(ideal - words.size)) / ideal 16 | */ 17 | def sentenceLength(sentence: Array[String]) = 1 - (Math.abs(ideal - sentence.size) / ideal.toDouble) 18 | 19 | /* 20 | * Split Words: Split words via white space and new lines. Then remove whites space in the resulting array. 
21 | */ 22 | def splitWords(source: String) = JavaConverters.iterableAsScalaIterableConverter( 23 | Splitter.on("""[^\w]""".r.pattern) 24 | .trimResults().omitEmptyStrings() 25 | .split(source)).asScala.toArray 26 | 27 | def titleScore(titleWords: Array[String], sentence: Array[String]) = 28 | sentence.count(w => !stopWords.contains(w) && titleWords.contains(w)) / titleWords.size.toDouble 29 | 30 | def getKeywords(text: String): KeywordList = { 31 | val keyWords = splitWords(text) 32 | val sizeWithRepeatingWords = keyWords.length 33 | KeywordList( 34 | keyWords.filterNot(w => stopWords.contains(w)) 35 | .groupBy(w => w) 36 | .map(w => ArticleKeyword(w._1, w._2.length)) 37 | .toList.sortBy(-_.count), 38 | sizeWithRepeatingWords) 39 | } 40 | 41 | def splitSentences(source: String) = sentenceDetector.sentDetect(source) 42 | 43 | def sentencePosition(ctr: Int, sentenceCount: Double) = { 44 | val normalized = ctr / sentenceCount 45 | 46 | if(normalized > 1.0) 47 | 0d 48 | else if (normalized > 0.9) 49 | 0.15 50 | else if (normalized > 0.8) 51 | 0.04 52 | else if (normalized > 0.7) 53 | 0.04 54 | else if (normalized > 0.6) 55 | 0.06 56 | else if (normalized > 0.5) 57 | 0.04 58 | else if (normalized > 0.4) 59 | 0.05 60 | else if (normalized > 0.3) 61 | 0.08 62 | else if (normalized > 0.2) 63 | 0.14 64 | else if (normalized > 0.1) 65 | 0.23 66 | else if (normalized > 0) 67 | 0.17 68 | 0d 69 | } 70 | } 71 | 72 | case class ArticleKeyword(word: String, count: Int) 73 | case class KeywordList(keywords: List[ArticleKeyword], wordCount: Int) 74 | -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/SummarySuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import org.scalatest.{BeforeAndAfter, FunSuite} 4 | import scala.collection.immutable.{VectorBuilder, IndexedSeq} 5 | 6 | class SummarySuite extends FunSuite with 
BeforeAndAfter { 7 | 8 | var singleNonEmptySentenceSeq: IndexedSeq[Sentence] = _ 9 | var twoSentenceSeq: IndexedSeq[Sentence] = _ 10 | var summaryOnEmptySentence: Summary = _ 11 | var summaryOnSingleSentenceSeq: Summary = _ 12 | var summaryOnTwoSentenceSeq: Summary = _ 13 | var targetSummaryForForeach: Summary = _ 14 | var targetVectorForForeach: VectorBuilder[String] = _ 15 | 16 | before { 17 | singleNonEmptySentenceSeq = Vector(Sentence("Hello world", 1d, 1)) 18 | twoSentenceSeq = Vector(Sentence("Hello world", 1d, 1), Sentence("Hello world", 1d, 1)) 19 | summaryOnEmptySentence = Summary(Vector.empty[Sentence]) 20 | summaryOnSingleSentenceSeq = Summary(singleNonEmptySentenceSeq) 21 | summaryOnTwoSentenceSeq = Summary(twoSentenceSeq) 22 | targetSummaryForForeach = Summary(Vector(Sentence("a", 1d, 1), Sentence("b", 1d, 1), Sentence("c", 1d, 1))) 23 | targetVectorForForeach = new VectorBuilder[String] 24 | } 25 | 26 | test("charCount on an empty sequence") { 27 | assert(summaryOnEmptySentence.charCount === 0) 28 | } 29 | 30 | test("""charCount on "Hello world" is 11""") { 31 | assert(summaryOnSingleSentenceSeq.charCount === 11) 32 | } 33 | 34 | test("""charCount on 2 x "Hello world" is 22""") { 35 | assert(summaryOnTwoSentenceSeq.charCount === 22) 36 | } 37 | 38 | test("""foreach on an non-empty-sentence summary""") { 39 | targetSummaryForForeach foreach { case s => targetVectorForForeach += s } 40 | assert(targetVectorForForeach.result === Vector("a", "b", "c")) 41 | } 42 | 43 | test("""foreach on an empty-sentence summary""") { 44 | summaryOnEmptySentence foreach { case s => targetVectorForForeach += s } 45 | assert(targetVectorForForeach.result === Vector()) 46 | } 47 | 48 | test("""takeChars on an empty-sentence summary""") { 49 | assert(summaryOnEmptySentence.takeChars(100) === summaryOnEmptySentence) 50 | } 51 | 52 | test("""takeChars on a non-empty-sentence summary""") { 53 | assert(summaryOnTwoSentenceSeq.takeChars(11) === summaryOnSingleSentenceSeq) 54 | } 
55 | 56 | test("""takeChars on a non-empty-sentence summary with 0 to take""") { 57 | assert(summaryOnTwoSentenceSeq.takeChars(0) === summaryOnEmptySentence) 58 | } 59 | 60 | test("""Summary constructor itself""") { 61 | assert(Summary(Vector(Sentence("Hello world", 1d, 1))) === summaryOnSingleSentenceSeq) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Main.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import com.google.inject.Guice 5 | import com.mongodb._ 6 | import net.liftweb.mongodb._ 7 | import com.textteaser.summarizer.models.Keyword 8 | import com.foursquare.rogue.LiftRogue._ 9 | import org.json4s._ 10 | import org.json4s.native.JsonMethods._ 11 | import org.slf4j._ 12 | 13 | object Main extends App { 14 | 15 | implicit val formats = DefaultFormats 16 | val config = new Config 17 | val guice = new ScalaInjector(Guice.createInjector(new GuiceModule(config))) 18 | 19 | val summarizer = guice.instance[Summarizer] 20 | val log = guice.instance[Logger] 21 | 22 | log.info("Starting...") 23 | 24 | MongoDB.defineDb(DefaultMongoIdentifier, guice.instance[Mongo], config.db.name) 25 | 26 | log.info("App is now runnning.") 27 | 28 | val id = "anythingyoulikehere" 29 | val title = "Astronomic news: the universe may not be expanding after all" 30 | val text = "Now that conventional thinking has been turned on its head in a paper by Prof Christof Wetterich at the University of Heidelberg in Germany. He points out that the tell-tale light emitted by atoms is also governed by the masses of their constituent particles, notably their electrons. The way these absorb and emit light would shift towards the blue part of the spectrum if atoms were to grow in mass, and to the red if they lost it. 
Because the frequency or ÒpitchÓ of light increases with mass, Prof Wetterich argues that masses could have been lower long ago. If they had been constantly increasing, the colours of old galaxies would look red-shifted Ð and the degree of red shift would depend on how far away they were from Earth. ÒNone of my colleagues has so far found any fault [with this],Ó he says. Although his research has yet to be published in a peer-reviewed publication, Nature reports that the idea that the universe is not expanding at all Ð or even contracting Ð is being taken seriously by some experts, such as Dr HongSheng Zhao, a cosmologist at the University of St Andrews who has worked on an alternative theory of gravity. ÒI see no fault in [Prof WetterichÕs] mathematical treatment,Ó he says. ÒThere were rudimentary versions of this idea two decades ago, and I think it is fascinating to explore this alternative representation of the cosmic expansion, where the evolution of the universe is like a piano keyboard played out from low to high pitch.Ó Prof Wetterich takes the detached, even playful, view that his work marks a change in perspective, with two different views of reality: either the distances between galaxies grow, as in the traditional balloon picture, or the size of atoms shrinks, increasing their mass. Or itÕs a complex blend of the two. One benefit of this idea is that he is able to rid physics of the singularity at the start of time, a nasty infinity where the laws of physics break down. Instead, the Big Bang is smeared over the distant past: the first note of the ''cosmic pianoÕÕ was long and low-pitched. Harry Cliff, a physicist working at CERN who is the Science MuseumÕs fellow of modern science, thinks it striking that a universe where particles are getting heavier could look identical to one where space/time is expanding. ÒFinding two different ways of thinking about the same problem often leads to new insights,Ó he says. 
ÒString theory, for instance, is full of 'dualitiesÕ like this, which allow theorists to pick whichever view makes their calculations simpler.Ó If this idea turns out to be right Ð and that is a very big if Ð it could pave the way for new ways to think about our universe. If we are lucky, they might even be as revolutionary as Edwin HubbleÕs, almost a century ago. Roger Highfield is director of external affairs at the Science Museum" 31 | 32 | val article = Article(id, title, text) 33 | val summary = summarizer.summarize(article.article, article.title, article.id, article.blog, article.category) 34 | 35 | println(summarizer.toJSON(summary)) 36 | 37 | log.info("Summarization completed.") 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/StopWords.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import scala.collection.JavaConverters 4 | 5 | trait StopWords { 6 | val stopWords: Set[String] 7 | } 8 | 9 | object StopWords { 10 | def forLang(lang: String): StopWords = if (lang.equalsIgnoreCase("ES")) new StopWordsES() else new StopWordsEN(); 11 | } 12 | 13 | class StopWordsEN extends StopWords { 14 | lazy val stopWords = Set("-", " ", ",", ".", "a", "e", "i", "o", "u", "t", "about", "above", "above", "across", 15 | "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", 16 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", 17 | "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", 18 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", 19 | "beyond", "both", "bottom", "but", "by", "call", "can", "cannot", "can't", "co", "con", "could", "couldn't", "de", 20 | "describe", 
"detail", "did", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", 21 | "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", 22 | "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", 23 | "found", "four", "from", "front", "full", "further", "get", "give", "go", "got", "had", "has", "hasnt", "have", 24 | "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", 25 | "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "into", "is", "it", "its", "it's", 26 | "itself", "just", "keep", "last", "latter", "latterly", "least", "less", "like", "ltd", "made", "make", "many", 27 | "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", 28 | "my", "myself", "name", "namely", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", 29 | "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", 30 | "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "people", 31 | "per", "perhaps", "please", "put", "rather", "re", "said", "same", "see", "seem", "seemed", "seeming", "seems", 32 | "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 33 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", "ten", "than", "that", "the", "their", 34 | "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", 35 | "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", 36 | "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", 37 | "up", "upon", 
"us", "use", "very", "via", "want", "was", "we", "well", "were", "what", "whatever", "when", 38 | "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", 39 | "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 40 | "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "reuters", "news", "monday", 41 | "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "mon", "tue", "wed", "thu", "fri", "sat", 42 | "sun", "rappler", "rapplercom", "inquirer", "yahoo", "home", "sports", "1", "10", "2012", "sa", "says", "tweet", 43 | "pm", "home", "homepage", "sports", "section", "newsinfo", "stories", "story", "photo", "2013", "na", "ng", "ang", 44 | "year", "years", "percent", "ko", "ako", "yung", "yun", "2", "3", "4", "5", "6", "7", "8", "9", "0", "time", 45 | "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", 46 | "december", "philippine", "government", "police", "manila") 47 | } 48 | 49 | class StopWordsES extends StopWords { 50 | lazy val stopWords = Set("un", "una", "unas", "unos", "uno", "sobre", "todo", "también", "tras", "otro", "algún", 51 | "alguno", "alguna", "algunos", "algunas", "ser", "es", "soy", "eres", "somos", "sois", "estoy", "esta", "estamos", 52 | "estais", "estan", "como", "en", "para", "atras", "porque", "por", "qué", "estado", "estaba", "ante", "antes", 53 | "siendo", "ambos", "pero", "por", "poder", "puede", "puedo", "podemos", "podeis", "pueden", "fui", "fue", "fuimos", 54 | "fueron", "hacer", "hago", "hace", "hacemos", "haceis", "hacen", "cada", "fin", "incluso", "primero", "desde", 55 | "conseguir", "consigo", "consigue", "consigues", "conseguimos", "consiguen", "ir", "voy", "va", "vamos", "vais", 56 | "van", "vaya", "gueno", "ha", "tener", "tengo", "tiene", "tenemos", "teneis", "tienen", "el", "la", "lo", "las", 57 | "los", "su", "aqui", 
"mio", "tuyo", "ellos", "ellas", "nos", "nosotros", "vosotros", "vosotras", "si", "dentro", 58 | "solo", "solamente", "saber", "sabes", "sabe", "sabemos", "sabeis", "saben", "ultimo", "largo", "bastante", "haces", 59 | "muchos", "aquellos", "aquellas", "sus", "entonces", "tiempo", "verdad", "verdadero", "verdadera", "cierto", "ciertos", 60 | "cierta", "ciertas", "intentar", "intento", "intenta", "intentas", "intentamos", "intentais", "intentan", "dos", 61 | "bajo", "arriba", "encima", "usar", "uso", "usas", "usa", "usamos", "usais", "usan", "emplear", "empleo", "empleas", 62 | "emplean", "ampleamos", "empleais", "valor", "muy", "era", "eras", "eramos", "eran", "modo", "bien", "cual", "cuando", 63 | "donde", "mientras", "quien", "con", "entre", "sin", "trabajo", "trabajar", "trabajas", "trabaja", "trabajamos", 64 | "trabajais", "trabajan", "podria", "podrias", "podriamos", "podrian", "podriais", "yo", "aquel") 65 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Summarizer.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import com.google.inject.Inject 4 | import org.json4s.native.JsonMethods._ 5 | import org.json4s.JsonDSL._ 6 | import scala.collection.mutable.ListBuffer 7 | 8 | class Summarizer @Inject() (parser: Parser, keywordService: KeywordService) { 9 | 10 | private var _summarySize: Int = 5 11 | private var _keywordsSize: Int = 10 12 | 13 | def summarySize = _summarySize 14 | def summarySize_=(newSize: Int) = { 15 | _summarySize = newSize 16 | } 17 | 18 | def keywordsSize = _keywordsSize 19 | def keywordsSize_=(newSize: Int) = { 20 | _keywordsSize = newSize 21 | } 22 | 23 | private def ensureSizeDoesNotExceedLimit(size: Int, limit: Int): Int = { 24 | size.min(limit) 25 | } 26 | 27 | def summarize(text: String, title: String, link: String, blog: String, category: String) = { 28 | val sentences = 
parser.splitSentences(text) 29 | def titleWords = parser.splitWords(title) 30 | val resKeywords = parser.getKeywords(text) 31 | val keywords = resKeywords.keywords 32 | keywordsSize = ensureSizeDoesNotExceedLimit(keywordsSize, keywords.size) 33 | val topKeywords = getTopKeywords(keywords.take(keywordsSize), resKeywords.wordCount, link, blog, category) 34 | val result = computeScore(sentences, titleWords, topKeywords) 35 | summarySize = ensureSizeDoesNotExceedLimit(summarySize, result.size) 36 | Summary(result.sortBy(-_.score).take(summarySize).sortBy(_.order).toIndexedSeq) 37 | } 38 | 39 | def toJSON(summary: Summary) = compact(render("sentences" -> summary.toList)) 40 | 41 | def getTopKeywords(keywords: List[ArticleKeyword], 42 | articleCount: Int, link: String, 43 | blog: String, category: String): List[TopKeyword] = 44 | keywords.map { k => 45 | val blogCount = keywordService.getBlogCount(blog) + 1.0 46 | val categoryCount = keywordService.getCategoryCount(category) + 1.0 47 | 48 | keywordService.add(k.word, k.count, link, blog, category) 49 | 50 | val articleScore = k.count / articleCount 51 | val blogScore = keywordService.getBlogScore(k.word, blog) / blogCount 52 | val categoryScore = keywordService.getCategoryScore(k.word, category) / categoryCount 53 | val totalScore = articleScore * 1.5 + blogScore + categoryScore 54 | 55 | TopKeyword(k.word, totalScore) 56 | } 57 | 58 | def computeScore(sentences: Array[String], titleWords: Array[String], topKeywords: List[TopKeyword]) = 59 | Array.tabulate(sentences.size) { i => 60 | val sentence = parser.splitWords(sentences(i)) 61 | val titleFeature = parser.titleScore(titleWords, sentence) 62 | val sentenceLength = parser.sentenceLength(sentence) 63 | val sentencePosition = parser.sentencePosition(i, sentences.size) 64 | val sbsFeature = sbs(sentence, topKeywords) 65 | val dbsFeature = dbs(sentence, topKeywords) 66 | val keywordFrequency = (sbsFeature + dbsFeature) / 2.0 * 10.0 67 | val totalScore = (titleFeature * 
1.5 + keywordFrequency * 2.0 + sentenceLength * 0.5 + sentencePosition * 1.0) / 4.0 68 | 69 | Sentence(sentences(i), totalScore, i) 70 | } 71 | 72 | def sbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 73 | if (words.size == 0) 74 | 0 75 | else { 76 | val summ = words.map { word => 77 | topKeywords.find(_.word == word) match { 78 | case None => 0 79 | case Some(x) => x.score 80 | } 81 | }.sum 82 | 83 | 1.0 / Math.abs(words.size) * summ 84 | } 85 | } 86 | 87 | def dbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 88 | if (words.size == 0) 89 | 0 90 | else { 91 | val res = words.map { word => 92 | topKeywords.find(_.word == word) match { 93 | case None => 0 94 | case Some(x) => x.score 95 | } 96 | }.zipWithIndex.filter(_._1 > 0) 97 | 98 | 99 | val summ = res.zip(res.slice(1, res.size)).map { r => 100 | (r._1._1 * r._2._1) / Math.pow(r._1._2 - r._2._2, 2) 101 | }.sum 102 | 103 | val k = words.intersect(topKeywords.map(_.word)).size + 1 104 | 105 | (1.0 / (k * (k + 1.0))) * summ 106 | } 107 | } 108 | 109 | def canonical_dbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 110 | if (words.size == 0) 111 | 0 112 | else { 113 | val res = words.map { word => 114 | topKeywords.find(_.word == word) match { 115 | case None => 0 116 | case Some(x) => x.score 117 | } 118 | }.zipWithIndex.filter(_._1 > 0) 119 | 120 | 121 | val summ = res.zip(res.slice(1, res.size)).map { r => 122 | (r._1._1 * r._2._1) / Math.pow(r._1._2 - r._2._2, 2) 123 | }.sum 124 | 125 | val k = words.intersect(topKeywords.map(_.word)).size + 1 126 | 127 | (1.0 / (k * (k + 1.0))) * summ 128 | } 129 | } 130 | } 131 | 132 | case class TopKeyword(word: String, score: Double) 133 | case class Sentence(sentence: String, score: Double, order: Int) 134 | 135 | /* 136 | * The Density Based Selection (DBS) above is so fucking abstracted. 
137 | * USE THIS FOR REFERENCE: 138 | * 139 | * def dbs(sentence, topKeywords) { 140 | def words = parserService.splitWords sentence 141 | words.removeAll(" ") 142 | words = words*.toLowerCase() 143 | 144 | if(words.size == 0) 145 | return 0 146 | 147 | def k = words.intersect(topKeywords.word).size() + 1 148 | def summ = 0 149 | def firstWord = [] 150 | def secondWord = [] 151 | 152 | for(def i = 0; i < words.size(); i++) { 153 | def index = topKeywords.word.indexOf(words[i]) 154 | 155 | if(index > -1) { 156 | def score = topKeywords[index].totalScore 157 | 158 | if(firstWord == []) { 159 | firstWord = [i: i, score: score] 160 | } 161 | else { 162 | secondWord = firstWord 163 | firstWord = [i: i, score: score] 164 | 165 | summ += (firstWord.score * secondWord.score) / Math.pow((firstWord.i - secondWord.i), 2) 166 | } 167 | 168 | } 169 | } 170 | 171 | def formula = ((1 / k * (k + 1)) * summ) as double 172 | 173 | return formula 174 | } 175 | 176 | Just for backup, this is for Summation Based Selection (SBS): 177 | 178 | def sbs(sentence, topKeywords) { 179 | def words = parserService.splitWords sentence 180 | words.removeAll(" ") 181 | 182 | if(words.size == 0) 183 | return 0 184 | 185 | def summ = 0 186 | 187 | words.each { word -> 188 | word = word.toLowerCase() 189 | def index = topKeywords.word.indexOf(word) 190 | def score = index == -1 ? 
0 : topKeywords[index].totalScore 191 | summ += score 192 | } 193 | 194 | def formula = (1 / Math.abs(words.size) * summ) as double 195 | 196 | return formula 197 | } 198 | 199 | */ -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/ParserSuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import org.scalatest.{BeforeAndAfter, FunSuite} 4 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 5 | import com.google.inject.Guice 6 | 7 | class ParserSuite extends FunSuite with BeforeAndAfter { 8 | 9 | val guice = new ScalaInjector(Guice.createInjector(new GuiceModule(new Config))) 10 | val parser = guice.instance[Parser] 11 | 12 | val sentenceWithFiveWords: Array[String] = Array("1", "2", "3", "4", "5") 13 | val emptySentence: Array[String] = Array() 14 | val sentenceWithTwentyWords: Array[String] = (1 to 20).map(_.toString).toArray 15 | 16 | 17 | val textBuilder = StringBuilder.newBuilder 18 | val longTextBuilder = StringBuilder.newBuilder 19 | val stopWordsSentence = Array("hereafter", "hereby", "herein") 20 | val noStopWordsSentence = Array("Accommodation", "globalization", "emancipation") 21 | val title = Array("Accommodation", "globalization", "emancipation") 22 | val textForKeywords = "oneone twotwo twotwo threethree threethree threethree" 23 | 24 | before { 25 | longTextBuilder ++= "1914 translation by H. 
Rackham\n\n" 26 | longTextBuilder ++= "On the other hand, we denounce with righteous indignation and dislike men " 27 | longTextBuilder ++= "who are so beguiled and demoralized by the charms " 28 | longTextBuilder ++= "of pleasure of the moment, so blinded by desire, that they cannot foresee the pain and trouble " 29 | longTextBuilder ++= "that are bound to ensue; and equal blame belongs to those who fail in their duty " 30 | longTextBuilder ++= "through weakness of will, which is the same as saying through shrinking from toil and pain. " 31 | longTextBuilder ++= "These cases are perfectly simple and easy to distinguish. In a free hour, " 32 | longTextBuilder ++= "when our power of choice is untrammelled and when nothing prevents our being able to do " 33 | longTextBuilder ++= "what we like best, every pleasure is to be welcomed and every pain avoided. " 34 | longTextBuilder ++= "But in certain circumstances and owing to the claims of duty or the obligations of business " 35 | longTextBuilder ++= "it will frequently occur that pleasures have to be repudiated and annoyances accepted. " 36 | longTextBuilder ++= "The wise man therefore always holds in these matters to this principle of selection: " 37 | longTextBuilder ++= "he rejects pleasures to secure other greater pleasures, or else he endures pains to avoid worse pains." 38 | 39 | textBuilder ++= "Now that conventional thinking has been turned on its head in a paper by " 40 | textBuilder ++= "Prof Christof Wetterich at the University of Heidelberg in Germany. " 41 | textBuilder ++= "He points out that the tell-tale light emitted by atoms is also governed by the masses " 42 | textBuilder ++= "of their constituent particles, notably their electrons. The way these absorb and emit " 43 | textBuilder ++= "light would shift towards the blue part of the spectrum if atoms were to grow in mass, " 44 | textBuilder ++= "and to the red if they lost it. 
Because the frequency or ÒpitchÓ of light increases with mass, " 45 | textBuilder ++= "Prof Wetterich argues that masses could have been lower long ago. " 46 | textBuilder ++= "If they had been constantly increasing, the colours of old galaxies would look red-shifted Ð" 47 | textBuilder ++= "and the degree of red shift would depend on how far away they were from Earth. " 48 | textBuilder ++= "ÒNone of my colleagues has so far found any fault [with this],Ó he says. " 49 | textBuilder ++= "Although his research has yet to be published in a peer-reviewed publication, Nature reports " 50 | textBuilder ++= "that the idea that the universe is not expanding at all Ð or even contracting Ð is being taken " 51 | textBuilder ++= "seriously by some experts, such as Dr HongSheng Zhao, a cosmologist at the University of " 52 | textBuilder ++= "St Andrews who has worked on an alternative theory of gravity. ÒI see no fault in [Prof WetterichÕs] " 53 | textBuilder ++= "mathematical treatment,Ó he says. ÒThere were rudimentary versions of this idea two decades ago, and " 54 | textBuilder ++= "I think it is fascinating to explore this alternative representation of the cosmic expansion, where the evolution" 55 | textBuilder ++= "of the universe is like a piano keyboard played out from low to high pitch.Ó Prof Wetterich takes the detached," 56 | textBuilder ++= " even playful, view that his work marks a change in perspective, with two different views of reality: " 57 | textBuilder ++= "either the distances between galaxies grow, as in the traditional balloon picture, or the size of atoms " 58 | textBuilder ++= "shrinks, increasing their mass. Or itÕs a complex blend of the two. One benefit of this idea" 59 | textBuilder ++= "is that he is able to rid physics of the singularity at the start of time, a nasty infinity where " 60 | textBuilder ++= "the laws of physics break down. 
Instead, the Big Bang is smeared over the distant past : " 61 | textBuilder ++= "the first note of the ''cosmic pianoÕÕ was long and low-pitched. Harry Cliff, a physicist working at CERN" 62 | textBuilder ++= "who is the Science MuseumÕs fellow of modern science, thinks it striking that a universe where particles are " 63 | textBuilder ++= "getting heavier could look identical to one where space/time is expanding. ÒFinding two different " 64 | textBuilder ++= "ways of thinking about the same problem often leads to new insights,Ó he says. ÒString theory, " 65 | textBuilder ++= " for instance, is full of 'dualitiesÕ like this, which allow theorists to pick whichever view " 66 | textBuilder ++= "makes their calculations simpler.Ó If this idea turns out to be right Ð and that is a very big " 67 | textBuilder ++= "if Ð it could pave the way for new ways to think about our universe. If we are lucky, they might " 68 | textBuilder ++= "even be as revolutionary as Edwin HubbleÕs, almost a century ago. 
Roger Highfield is director " 69 | textBuilder ++= "of external affairs at the Science Museum" 70 | } 71 | 72 | test("Sentence length on empty sentence returns 0") { 73 | assert(parser.sentenceLength(emptySentence) === 0.0) 74 | } 75 | 76 | test("Sentence length on non-empty sentence returns it's length according to formula") { 77 | assert(parser.sentenceLength(sentenceWithFiveWords) === 0.25) 78 | } 79 | 80 | test("When `ideal` is equal to `sentence` array length, sentence length should be 1") { 81 | assert(parser.sentenceLength(sentenceWithTwentyWords) === 1.0) 82 | } 83 | 84 | test("Splitting string into words should return no empty strings") { 85 | assert(!parser.splitWords(longTextBuilder.toString()).contains("")) 86 | assert(!parser.splitWords(textBuilder.toString()).contains("")) 87 | } 88 | 89 | test("Splitting string into words should not produce whitespaces in output") { 90 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => """\s+""".r.findFirstIn(s) == None)) 91 | assert(parser.splitWords(textBuilder.toString()).forall(s => """\s+""".r.findFirstIn(s) == None)) 92 | } 93 | 94 | test("Splitting string into words should not produce newlines in output") { 95 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => """\r?\n+""".r.findFirstIn(s) == None)) 96 | assert(parser.splitWords(textBuilder.toString()).forall(s => """\r?\n+""".r.findFirstIn(s) == None)) 97 | } 98 | 99 | test("Splitting string into words should let digits and letters pass") { 100 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => s matches """\w+""")) 101 | assert(parser.splitWords(textBuilder.toString()).forall(s => s matches """\w+""")) 102 | } 103 | 104 | test("Title score of sentence consisting solely of stop words should be 0") { 105 | assert(parser.titleScore(title, stopWordsSentence) === 0.0) 106 | } 107 | 108 | test("Title score of sentence that hasn't stop words should be 1") { 109 | assert(parser.titleScore(title, noStopWordsSentence) 
=== 1.0) 110 | } 111 | 112 | test("Keywords are sorted in descending order") { 113 | assert(parser.getKeywords(textForKeywords) === 114 | KeywordList(List(ArticleKeyword("threethree", 3), ArticleKeyword("twotwo", 2), ArticleKeyword("oneone", 1)), 6)) 115 | } 116 | 117 | test("Keywords are unique") { 118 | assert(parser.getKeywords(textForKeywords).keywords.toSet === 119 | Set(ArticleKeyword("threethree", 3), ArticleKeyword("twotwo", 2), ArticleKeyword("oneone", 1))) 120 | } 121 | 122 | test("Any keyword isn't present in stopWords list") { 123 | assert(parser.getKeywords(textForKeywords).keywords.forall(aw => !parser.stopWords.contains(aw.word))) 124 | } 125 | 126 | 127 | } 128 | --------------------------------------------------------------------------------