├── project ├── build.properties └── plugins.sbt ├── corpus ├── corpusEN.bin ├── corpusES.bin ├── corpusEN2.bin └── README.md ├── src ├── test │ └── scala │ │ └── com │ │ └── textteaser │ │ └── summarizer │ │ ├── SummarizerSuite.scala │ │ ├── SummarySuite.scala │ │ └── ParserSuite.scala └── main │ └── scala │ └── com │ └── textteaser │ └── summarizer │ ├── Article.scala │ ├── Config.scala │ ├── Summary.scala │ ├── models │ ├── Keyword.scala │ └── Summary.scala │ ├── KeywordService.scala │ ├── MongoKeywordService.scala │ ├── SimpleREPL.scala │ ├── GuiceModule.scala │ ├── Parser.scala │ ├── Main.scala │ ├── StopWords.scala │ └── Summarizer.scala ├── .gitignore ├── README.md └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.6 2 | -------------------------------------------------------------------------------- /corpus/corpusEN.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusEN.bin -------------------------------------------------------------------------------- /corpus/corpusES.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusES.bin -------------------------------------------------------------------------------- /corpus/corpusEN2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MojoJolo/textteaser/HEAD/corpus/corpusEN2.bin -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/SummarizerSuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | class SummarizerSuite { 4 | 5 | } 6 | 
-------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Article.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | case class Article(id: String, 4 | title: String, 5 | article: String, 6 | url: String = "", 7 | blog: String = "", 8 | category: String = "") -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 2 | 3 | addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") 4 | 5 | resolvers += "Sonatype snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 6 | 7 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.class 3 | *.log 4 | 5 | # sbt specific 6 | dist/* 7 | target/ 8 | lib_managed/ 9 | src_managed/ 10 | project/boot/ 11 | project/plugins/project/ 12 | project/plugins/project/ 13 | 14 | # Scala-IDE specific 15 | .scala_dependencies 16 | .settings 17 | .cache 18 | .classpath 19 | .project 20 | 21 | # Idea-specific 22 | .idea 23 | .idea_modules 24 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Config.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | class Config { 4 | 5 | def lang = "EN" 6 | 7 | object words { 8 | def ideal = 20 9 | } 10 | 11 | object db { 12 | def host = "localhost" 13 | def port = 27017 14 | def name = "tt_db" 15 | def username = "" 16 | def password = "" 17 | } 18 | } 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TextTeaser 2 | ========== 3 | 4 | TextTeaser is an automatic summarization algorithm that combines the power of natural language processing and machine learning to produce good results. 5 | 6 | [TextTeaser is ported in Python](https://github.com/IndigoResearch/textteaser) 7 | 8 | ### Requirements 9 | 10 | [SBT](http://www.scala-sbt.org/) is needed for TextTeaser to run. 11 | Use of [Scala IDE](http://scala-ide.org/) is recommended 12 | 13 | ### Setup 14 | 15 | ```bash 16 | $ git clone https://github.com/MojoJolo/textteaser.git 17 | $ sbt compile 18 | $ sbt eclipse // If using Eclipse. 19 | $ sbt run 20 | ``` 21 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Summary.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import scala.collection.immutable.IndexedSeq 4 | 5 | case class Summary(results: IndexedSeq[Sentence]) extends Traversable[String] { 6 | lazy val charCount = if (results.isEmpty) 0 else results.map(_.sentence.size).sum 7 | 8 | def foreach[U](f: String => U) { 9 | results.foreach( s => f(s.sentence) ) 10 | } 11 | 12 | def takeChars(limitCharCount: Int): Summary = { 13 | var count = 0 14 | 15 | val newSentences = results.takeWhile { sentence => 16 | count += sentence.sentence.size 17 | limitCharCount >= count 18 | } 19 | Summary(newSentences) 20 | } 21 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/models/Keyword.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer.models 2 | 3 | import net.liftweb.mongodb.record.field._ 4 | import net.liftweb.record.field._ 5 | import 
net.liftweb.mongodb.record._ 6 | 7 | class Keyword extends MongoRecord[Keyword] with ObjectIdPk[Keyword] { 8 | def meta = Keyword 9 | 10 | object word extends StringField(this, "") 11 | object score extends LongField(this, 0) 12 | object date extends DateField(this) 13 | object summaryId extends StringField(this, 10) 14 | object blog extends StringField(this, "Undefined") 15 | object category extends StringField(this, "Undefined") 16 | } 17 | 18 | object Keyword extends Keyword with MongoMetaRecord[Keyword] -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/models/Summary.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer.models 2 | 3 | import net.liftweb.mongodb.record.field._ 4 | import net.liftweb.record.field._ 5 | import net.liftweb.mongodb.record._ 6 | 7 | class Summary extends MongoRecord[Summary] with ObjectIdPk[Summary] { 8 | def meta = Summary 9 | 10 | object summaryId extends StringField(this, 10) 11 | object title extends StringField(this, "") 12 | object summary extends StringField(this, "") 13 | object status extends StringField(this, "Pending") 14 | object url extends StringField(this, "") 15 | object blog extends StringField(this, "Undefined") 16 | object category extends StringField(this, "Undefined") 17 | } 18 | 19 | object Summary extends Summary with MongoMetaRecord[Summary] -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/KeywordService.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | trait KeywordService { 4 | def getBlogCount(blog: String): Long 5 | def getCategoryCount(cat: String): Long 6 | def getBlogScore(word: String, blog: String): Long 7 | def getCategoryScore(word: String, cat: String): Long 8 | def add(word: String, count: Long, 
summaryId: String, blog: String, cat: String): Unit 9 | } 10 | 11 | class DummyKeywordService extends KeywordService { 12 | def getBlogCount(blog: String): Long = 1 13 | def getCategoryCount(cat: String): Long = 1 14 | def getBlogScore(word: String, blog: String): Long = 1 15 | def getCategoryScore(word: String, cat: String): Long = 1 16 | def add(word: String, count: Long, summaryId: String, blog: String, cat: String) {} 17 | } -------------------------------------------------------------------------------- /corpus/README.md: -------------------------------------------------------------------------------- 1 | Training new models 2 | =================== 3 | 4 | This folder contains Maximum Entropy models for sentence splitting, as needed by OpenNLP 1.5. 5 | 6 | See http://opennlp.apache.org/documentation/1.5.3/manual/opennlp.html#tools.sentdetect.training 7 | 8 | For more models, see: http://opennlp.sourceforge.net/models-1.5/ 9 | 10 | Migrating from NLTK 11 | ------------------- 12 | 13 |
14 | $ pip install nltk
15 | $ python
16 | >>> import nltk
17 | >>> nltk.download()
18 | 
19 | 20 | _install punkt_ 21 | 22 |
23 | >>> import nltk.data
24 | >>> sent_detector = nltk.data.load('tokenizers/punkt/spanish.pickle')
25 | >>> import codecs
26 | >>> text=codecs.open("/path/to/corpus-utf8.txt","r","utf-8").read()
27 | >>> sents=sent_detector.tokenize(text)
28 | >>> w=codecs.open("train.txt","w","utf-8")
29 | >>> c = 0
29 | >>> for s in sents:
30 | ...   w.write(s + '\n')
31 | ...   c += 1
32 | ...   if c == 10:
33 | ...     w.write('\n')
34 | ...     c = 0
35 | >>> w.close()
36 | >>> 
37 | $ opennlp SentenceDetectorTrainer -model corpusES.bin -lang es -data train.txt -encoding UTF-8
38 | 
39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Jolo Balbin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/MongoKeywordService.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import com.google.inject.Inject 4 | import com.textteaser.summarizer.models.Keyword 5 | import com.foursquare.rogue.LiftRogue._ 6 | import org.joda.time.DateTime 7 | 8 | class MongoKeywordService extends KeywordService { 9 | val controller = "keywords" 10 | 11 | def getBlogCount(blog: String): Long = Keyword.where(_.blog eqs blog).count 12 | 13 | def getCategoryCount(cat: String): Long = Keyword.where(_.category eqs cat).count 14 | 15 | def getBlogScore(word: String, blog: String): Long = Keyword.where(_.word eqs word).and(_.blog eqs blog) 16 | .fetch.map(_.score._1) 17 | .reduceLeftOption(_ + _).getOrElse(0) 18 | 19 | def getCategoryScore(word: String, cat: String): Long = Keyword.where(_.word eqs word).and(_.category eqs cat) 20 | .fetch.map(_.score._1) 21 | .reduceLeftOption(_ + _).getOrElse(0) 22 | 23 | def add(word: String, count: Long, summaryId: String, blog: String, cat: String) = Keyword.createRecord 24 | .word(word) 25 | .score(count) 26 | .summaryId(summaryId) 27 | .blog(blog) 28 | .category(cat) 29 | .date(new DateTime().toDate) 30 | .save 31 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/SimpleREPL.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import com.google.inject.Guice 5 | import com.textteaser.summarizer.models.Keyword 6 | import org.slf4j._ 7 | import scala.io.StdIn 8 | 9 | object SimpleREPL extends App { 10 | 11 | val config = new Config { override def lang = "ES" } 12 | val guice = new ScalaInjector(Guice.createInjector(new 
GuiceModule(config, true))) 13 | 14 | val summarizer = guice.instance[Summarizer] 15 | val log = guice.instance[Logger] 16 | 17 | while(true) { 18 | println("Ready for summarizing:") 19 | println("Provide the article title:") 20 | val title = StdIn.readLine() 21 | println("Provide the article text (with now newlines \\n):") 22 | val text = StdIn.readLine().replaceAll("\\\\n", "\n") 23 | println(text) 24 | val article = Article("not_important", title, text) 25 | val summary = summarizer.summarize(article.article, article.title, article.id, article.blog, article.category) 26 | 27 | println("---- Summary ----") 28 | summary.foreach(println) 29 | println("-----------------") 30 | 31 | log.info("Summarization completed.") 32 | } 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/GuiceModule.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import net.codingwell.scalaguice.ScalaModule 5 | import com.google.inject._ 6 | import javax.inject.Named 7 | import opennlp.tools.sentdetect._ 8 | import java.io.FileInputStream 9 | import com.mongodb._ 10 | import org.slf4j.LoggerFactory 11 | 12 | class GuiceModule(config: Config, dummyKeywordService: Boolean = false) extends AbstractModule with ScalaModule { 13 | def configure { 14 | bind[Config].toInstance(config) 15 | bind[Parser].in[Singleton] 16 | bind[Summarizer].in[Singleton] 17 | if (dummyKeywordService) 18 | bind[KeywordService].to[DummyKeywordService].in[Singleton] 19 | else 20 | bind[KeywordService].to[MongoKeywordService].in[Singleton] 21 | } 22 | 23 | @Provides 24 | def mongo = { 25 | val server = new ServerAddress(config.db.host, config.db.port) 26 | new Mongo(server) 27 | } 28 | 29 | @Provides 30 | @Singleton 31 | def sentenceDetector(@Named("lang")lang: String) = { 32 | val model = new 
SentenceModel(new FileInputStream("corpus/corpus" + lang + ".bin")) 33 | new SentenceDetectorME(model) 34 | } 35 | 36 | @Provides 37 | @Singleton 38 | @Named("lang") 39 | def lang(config: Config): String = config.lang 40 | 41 | @Provides 42 | @Singleton 43 | def log = LoggerFactory.getLogger("") 44 | 45 | @Provides 46 | @Singleton 47 | def stopWords(@Named("lang")lang: String) = StopWords.forLang(lang) 48 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Parser.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import opennlp.tools.sentdetect._ 4 | import com.google.inject.Inject 5 | import com.google.common.base.{CharMatcher, Splitter} 6 | import scala.collection.JavaConverters 7 | 8 | class Parser @Inject() (sentenceDetector: SentenceDetectorME, stopWordList: StopWords, config: Config) { 9 | 10 | val ideal = config.words.ideal 11 | lazy val stopWords = stopWordList.stopWords 12 | 13 | /* 14 | * Sentence Length: Computed using this formula 15 | * (ideal - Math.abs(ideal - words.size)) / ideal 16 | */ 17 | def sentenceLength(sentence: Array[String]) = 1 - (Math.abs(ideal - sentence.size) / ideal.toDouble) 18 | 19 | /* 20 | * Split Words: Split words via white space and new lines. Then remove whites space in the resulting array. 
21 | */ 22 | def splitWords(source: String) = JavaConverters.iterableAsScalaIterableConverter( 23 | Splitter.on("""[^\w]""".r.pattern) 24 | .trimResults().omitEmptyStrings() 25 | .split(source)).asScala.toArray 26 | 27 | def titleScore(titleWords: Array[String], sentence: Array[String]) = 28 | sentence.count(w => !stopWords.contains(w) && titleWords.contains(w)) / titleWords.size.toDouble 29 | 30 | def getKeywords(text: String): KeywordList = { 31 | val keyWords = splitWords(text) 32 | val sizeWithRepeatingWords = keyWords.length 33 | KeywordList( 34 | keyWords.filterNot(w => stopWords.contains(w)) 35 | .groupBy(w => w) 36 | .map(w => ArticleKeyword(w._1, w._2.length)) 37 | .toList.sortBy(-_.count), 38 | sizeWithRepeatingWords) 39 | } 40 | 41 | def splitSentences(source: String) = sentenceDetector.sentDetect(source) 42 | 43 | def sentencePosition(ctr: Int, sentenceCount: Double) = { 44 | val normalized = ctr / sentenceCount 45 | 46 | if(normalized > 1.0) 47 | 0d 48 | else if (normalized > 0.9) 49 | 0.15 50 | else if (normalized > 0.8) 51 | 0.04 52 | else if (normalized > 0.7) 53 | 0.04 54 | else if (normalized > 0.6) 55 | 0.06 56 | else if (normalized > 0.5) 57 | 0.04 58 | else if (normalized > 0.4) 59 | 0.05 60 | else if (normalized > 0.3) 61 | 0.08 62 | else if (normalized > 0.2) 63 | 0.14 64 | else if (normalized > 0.1) 65 | 0.23 66 | else if (normalized > 0) 67 | 0.17 68 | 0d 69 | } 70 | } 71 | 72 | case class ArticleKeyword(word: String, count: Int) 73 | case class KeywordList(keywords: List[ArticleKeyword], wordCount: Int) 74 | -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/SummarySuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import org.scalatest.{BeforeAndAfter, FunSuite} 4 | import scala.collection.immutable.{VectorBuilder, IndexedSeq} 5 | 6 | class SummarySuite extends FunSuite with 
BeforeAndAfter { 7 | 8 | var singleNonEmptySentenceSeq: IndexedSeq[Sentence] = _ 9 | var twoSentenceSeq: IndexedSeq[Sentence] = _ 10 | var summaryOnEmptySentence: Summary = _ 11 | var summaryOnSingleSentenceSeq: Summary = _ 12 | var summaryOnTwoSentenceSeq: Summary = _ 13 | var targetSummaryForForeach: Summary = _ 14 | var targetVectorForForeach: VectorBuilder[String] = _ 15 | 16 | before { 17 | singleNonEmptySentenceSeq = Vector(Sentence("Hello world", 1d, 1)) 18 | twoSentenceSeq = Vector(Sentence("Hello world", 1d, 1), Sentence("Hello world", 1d, 1)) 19 | summaryOnEmptySentence = Summary(Vector.empty[Sentence]) 20 | summaryOnSingleSentenceSeq = Summary(singleNonEmptySentenceSeq) 21 | summaryOnTwoSentenceSeq = Summary(twoSentenceSeq) 22 | targetSummaryForForeach = Summary(Vector(Sentence("a", 1d, 1), Sentence("b", 1d, 1), Sentence("c", 1d, 1))) 23 | targetVectorForForeach = new VectorBuilder[String] 24 | } 25 | 26 | test("charCount on an empty sequence") { 27 | assert(summaryOnEmptySentence.charCount === 0) 28 | } 29 | 30 | test("""charCount on "Hello world" is 11""") { 31 | assert(summaryOnSingleSentenceSeq.charCount === 11) 32 | } 33 | 34 | test("""charCount on 2 x "Hello world" is 22""") { 35 | assert(summaryOnTwoSentenceSeq.charCount === 22) 36 | } 37 | 38 | test("""foreach on an non-empty-sentence summary""") { 39 | targetSummaryForForeach foreach { case s => targetVectorForForeach += s } 40 | assert(targetVectorForForeach.result === Vector("a", "b", "c")) 41 | } 42 | 43 | test("""foreach on an empty-sentence summary""") { 44 | summaryOnEmptySentence foreach { case s => targetVectorForForeach += s } 45 | assert(targetVectorForForeach.result === Vector()) 46 | } 47 | 48 | test("""takeChars on an empty-sentence summary""") { 49 | assert(summaryOnEmptySentence.takeChars(100) === summaryOnEmptySentence) 50 | } 51 | 52 | test("""takeChars on a non-empty-sentence summary""") { 53 | assert(summaryOnTwoSentenceSeq.takeChars(11) === summaryOnSingleSentenceSeq) 54 | } 
55 | 56 | test("""takeChars on a non-empty-sentence summary with 0 to take""") { 57 | assert(summaryOnTwoSentenceSeq.takeChars(0) === summaryOnEmptySentence) 58 | } 59 | 60 | test("""Summary constructor itself""") { 61 | assert(Summary(Vector(Sentence("Hello world", 1d, 1))) === summaryOnSingleSentenceSeq) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Main.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 4 | import com.google.inject.Guice 5 | import com.mongodb._ 6 | import net.liftweb.mongodb._ 7 | import com.textteaser.summarizer.models.Keyword 8 | import com.foursquare.rogue.LiftRogue._ 9 | import org.json4s._ 10 | import org.json4s.native.JsonMethods._ 11 | import org.slf4j._ 12 | 13 | object Main extends App { 14 | 15 | implicit val formats = DefaultFormats 16 | val config = new Config 17 | val guice = new ScalaInjector(Guice.createInjector(new GuiceModule(config))) 18 | 19 | val summarizer = guice.instance[Summarizer] 20 | val log = guice.instance[Logger] 21 | 22 | log.info("Starting...") 23 | 24 | MongoDB.defineDb(DefaultMongoIdentifier, guice.instance[Mongo], config.db.name) 25 | 26 | log.info("App is now runnning.") 27 | 28 | val id = "anythingyoulikehere" 29 | val title = "Astronomic news: the universe may not be expanding after all" 30 | val text = "Now that conventional thinking has been turned on its head in a paper by Prof Christof Wetterich at the University of Heidelberg in Germany. He points out that the tell-tale light emitted by atoms is also governed by the masses of their constituent particles, notably their electrons. The way these absorb and emit light would shift towards the blue part of the spectrum if atoms were to grow in mass, and to the red if they lost it. 
Because the frequency or ÒpitchÓ of light increases with mass, Prof Wetterich argues that masses could have been lower long ago. If they had been constantly increasing, the colours of old galaxies would look red-shifted Ð and the degree of red shift would depend on how far away they were from Earth. ÒNone of my colleagues has so far found any fault [with this],Ó he says. Although his research has yet to be published in a peer-reviewed publication, Nature reports that the idea that the universe is not expanding at all Ð or even contracting Ð is being taken seriously by some experts, such as Dr HongSheng Zhao, a cosmologist at the University of St Andrews who has worked on an alternative theory of gravity. ÒI see no fault in [Prof WetterichÕs] mathematical treatment,Ó he says. ÒThere were rudimentary versions of this idea two decades ago, and I think it is fascinating to explore this alternative representation of the cosmic expansion, where the evolution of the universe is like a piano keyboard played out from low to high pitch.Ó Prof Wetterich takes the detached, even playful, view that his work marks a change in perspective, with two different views of reality: either the distances between galaxies grow, as in the traditional balloon picture, or the size of atoms shrinks, increasing their mass. Or itÕs a complex blend of the two. One benefit of this idea is that he is able to rid physics of the singularity at the start of time, a nasty infinity where the laws of physics break down. Instead, the Big Bang is smeared over the distant past: the first note of the ''cosmic pianoÕÕ was long and low-pitched. Harry Cliff, a physicist working at CERN who is the Science MuseumÕs fellow of modern science, thinks it striking that a universe where particles are getting heavier could look identical to one where space/time is expanding. ÒFinding two different ways of thinking about the same problem often leads to new insights,Ó he says. 
ÒString theory, for instance, is full of 'dualitiesÕ like this, which allow theorists to pick whichever view makes their calculations simpler.Ó If this idea turns out to be right Ð and that is a very big if Ð it could pave the way for new ways to think about our universe. If we are lucky, they might even be as revolutionary as Edwin HubbleÕs, almost a century ago. Roger Highfield is director of external affairs at the Science Museum" 31 | 32 | val article = Article(id, title, text) 33 | val summary = summarizer.summarize(article.article, article.title, article.id, article.blog, article.category) 34 | 35 | println(summarizer.toJSON(summary)) 36 | 37 | log.info("Summarization completed.") 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/StopWords.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import scala.collection.JavaConverters 4 | 5 | trait StopWords { 6 | val stopWords: Set[String] 7 | } 8 | 9 | object StopWords { 10 | def forLang(lang: String): StopWords = if (lang.equalsIgnoreCase("ES")) new StopWordsES() else new StopWordsEN(); 11 | } 12 | 13 | class StopWordsEN extends StopWords { 14 | lazy val stopWords = Set("-", " ", ",", ".", "a", "e", "i", "o", "u", "t", "about", "above", "above", "across", 15 | "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", 16 | "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", 17 | "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", 18 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", 19 | "beyond", "both", "bottom", "but", "by", "call", "can", "cannot", "can't", "co", "con", "could", "couldn't", "de", 20 | "describe", 
"detail", "did", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", 21 | "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", 22 | "except", "few", "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", 23 | "found", "four", "from", "front", "full", "further", "get", "give", "go", "got", "had", "has", "hasnt", "have", 24 | "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", 25 | "his", "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", "into", "is", "it", "its", "it's", 26 | "itself", "just", "keep", "last", "latter", "latterly", "least", "less", "like", "ltd", "made", "make", "many", 27 | "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", 28 | "my", "myself", "name", "namely", "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody", 29 | "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", 30 | "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "people", 31 | "per", "perhaps", "please", "put", "rather", "re", "said", "same", "see", "seem", "seemed", "seeming", "seems", 32 | "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 33 | "something", "sometime", "sometimes", "somewhere", "still", "such", "take", "ten", "than", "that", "the", "their", 34 | "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", 35 | "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", 36 | "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", 37 | "up", "upon", 
"us", "use", "very", "via", "want", "was", "we", "well", "were", "what", "whatever", "when", 38 | "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", 39 | "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", 40 | "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "reuters", "news", "monday", 41 | "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "mon", "tue", "wed", "thu", "fri", "sat", 42 | "sun", "rappler", "rapplercom", "inquirer", "yahoo", "home", "sports", "1", "10", "2012", "sa", "says", "tweet", 43 | "pm", "home", "homepage", "sports", "section", "newsinfo", "stories", "story", "photo", "2013", "na", "ng", "ang", 44 | "year", "years", "percent", "ko", "ako", "yung", "yun", "2", "3", "4", "5", "6", "7", "8", "9", "0", "time", 45 | "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", 46 | "december", "philippine", "government", "police", "manila") 47 | } 48 | 49 | class StopWordsES extends StopWords { 50 | lazy val stopWords = Set("un", "una", "unas", "unos", "uno", "sobre", "todo", "también", "tras", "otro", "algún", 51 | "alguno", "alguna", "algunos", "algunas", "ser", "es", "soy", "eres", "somos", "sois", "estoy", "esta", "estamos", 52 | "estais", "estan", "como", "en", "para", "atras", "porque", "por", "qué", "estado", "estaba", "ante", "antes", 53 | "siendo", "ambos", "pero", "por", "poder", "puede", "puedo", "podemos", "podeis", "pueden", "fui", "fue", "fuimos", 54 | "fueron", "hacer", "hago", "hace", "hacemos", "haceis", "hacen", "cada", "fin", "incluso", "primero", "desde", 55 | "conseguir", "consigo", "consigue", "consigues", "conseguimos", "consiguen", "ir", "voy", "va", "vamos", "vais", 56 | "van", "vaya", "gueno", "ha", "tener", "tengo", "tiene", "tenemos", "teneis", "tienen", "el", "la", "lo", "las", 57 | "los", "su", "aqui", 
"mio", "tuyo", "ellos", "ellas", "nos", "nosotros", "vosotros", "vosotras", "si", "dentro", 58 | "solo", "solamente", "saber", "sabes", "sabe", "sabemos", "sabeis", "saben", "ultimo", "largo", "bastante", "haces", 59 | "muchos", "aquellos", "aquellas", "sus", "entonces", "tiempo", "verdad", "verdadero", "verdadera", "cierto", "ciertos", 60 | "cierta", "ciertas", "intentar", "intento", "intenta", "intentas", "intentamos", "intentais", "intentan", "dos", 61 | "bajo", "arriba", "encima", "usar", "uso", "usas", "usa", "usamos", "usais", "usan", "emplear", "empleo", "empleas", 62 | "emplean", "ampleamos", "empleais", "valor", "muy", "era", "eras", "eramos", "eran", "modo", "bien", "cual", "cuando", 63 | "donde", "mientras", "quien", "con", "entre", "sin", "trabajo", "trabajar", "trabajas", "trabaja", "trabajamos", 64 | "trabajais", "trabajan", "podria", "podrias", "podriamos", "podrian", "podriais", "yo", "aquel") 65 | } -------------------------------------------------------------------------------- /src/main/scala/com/textteaser/summarizer/Summarizer.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import com.google.inject.Inject 4 | import org.json4s.native.JsonMethods._ 5 | import org.json4s.JsonDSL._ 6 | import scala.collection.mutable.ListBuffer 7 | 8 | class Summarizer @Inject() (parser: Parser, keywordService: KeywordService) { 9 | 10 | private var _summarySize: Int = 5 11 | private var _keywordsSize: Int = 10 12 | 13 | def summarySize = _summarySize 14 | def summarySize_=(newSize: Int) = { 15 | _summarySize = newSize 16 | } 17 | 18 | def keywordsSize = _keywordsSize 19 | def keywordsSize_=(newSize: Int) = { 20 | _keywordsSize = newSize 21 | } 22 | 23 | private def ensureSizeDoesNotExceedLimit(size: Int, limit: Int): Int = { 24 | size.min(limit) 25 | } 26 | 27 | def summarize(text: String, title: String, link: String, blog: String, category: String) = { 28 | val sentences = 
parser.splitSentences(text) 29 | def titleWords = parser.splitWords(title) 30 | val resKeywords = parser.getKeywords(text) 31 | val keywords = resKeywords.keywords 32 | keywordsSize = ensureSizeDoesNotExceedLimit(keywordsSize, keywords.size) 33 | val topKeywords = getTopKeywords(keywords.take(keywordsSize), resKeywords.wordCount, link, blog, category) 34 | val result = computeScore(sentences, titleWords, topKeywords) 35 | summarySize = ensureSizeDoesNotExceedLimit(summarySize, result.size) 36 | Summary(result.sortBy(-_.score).take(summarySize).sortBy(_.order).toIndexedSeq) 37 | } 38 | 39 | def toJSON(summary: Summary) = compact(render("sentences" -> summary.toList)) 40 | 41 | def getTopKeywords(keywords: List[ArticleKeyword], 42 | articleCount: Int, link: String, 43 | blog: String, category: String): List[TopKeyword] = 44 | keywords.map { k => 45 | val blogCount = keywordService.getBlogCount(blog) + 1.0 46 | val categoryCount = keywordService.getCategoryCount(category) + 1.0 47 | 48 | keywordService.add(k.word, k.count, link, blog, category) 49 | 50 | val articleScore = k.count / articleCount 51 | val blogScore = keywordService.getBlogScore(k.word, blog) / blogCount 52 | val categoryScore = keywordService.getCategoryScore(k.word, category) / categoryCount 53 | val totalScore = articleScore * 1.5 + blogScore + categoryScore 54 | 55 | TopKeyword(k.word, totalScore) 56 | } 57 | 58 | def computeScore(sentences: Array[String], titleWords: Array[String], topKeywords: List[TopKeyword]) = 59 | Array.tabulate(sentences.size) { i => 60 | val sentence = parser.splitWords(sentences(i)) 61 | val titleFeature = parser.titleScore(titleWords, sentence) 62 | val sentenceLength = parser.sentenceLength(sentence) 63 | val sentencePosition = parser.sentencePosition(i, sentences.size) 64 | val sbsFeature = sbs(sentence, topKeywords) 65 | val dbsFeature = dbs(sentence, topKeywords) 66 | val keywordFrequency = (sbsFeature + dbsFeature) / 2.0 * 10.0 67 | val totalScore = (titleFeature * 
1.5 + keywordFrequency * 2.0 + sentenceLength * 0.5 + sentencePosition * 1.0) / 4.0 68 | 69 | Sentence(sentences(i), totalScore, i) 70 | } 71 | 72 | def sbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 73 | if (words.size == 0) 74 | 0 75 | else { 76 | val summ = words.map { word => 77 | topKeywords.find(_.word == word) match { 78 | case None => 0 79 | case Some(x) => x.score 80 | } 81 | }.sum 82 | 83 | 1.0 / Math.abs(words.size) * summ 84 | } 85 | } 86 | 87 | def dbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 88 | if (words.size == 0) 89 | 0 90 | else { 91 | val res = words.map { word => 92 | topKeywords.find(_.word == word) match { 93 | case None => 0 94 | case Some(x) => x.score 95 | } 96 | }.zipWithIndex.filter(_._1 > 0) 97 | 98 | 99 | val summ = res.zip(res.slice(1, res.size)).map { r => 100 | (r._1._1 * r._2._1) / Math.pow(r._1._2 - r._2._2, 2) 101 | }.sum 102 | 103 | val k = words.intersect(topKeywords.map(_.word)).size + 1 104 | 105 | (1.0 / (k * (k + 1.0))) * summ 106 | } 107 | } 108 | 109 | def canonical_dbs(words: Array[String], topKeywords: List[TopKeyword]): Double = { 110 | if (words.size == 0) 111 | 0 112 | else { 113 | val res = words.map { word => 114 | topKeywords.find(_.word == word) match { 115 | case None => 0 116 | case Some(x) => x.score 117 | } 118 | }.zipWithIndex.filter(_._1 > 0) 119 | 120 | 121 | val summ = res.zip(res.slice(1, res.size)).map { r => 122 | (r._1._1 * r._2._1) / Math.pow(r._1._2 - r._2._2, 2) 123 | }.sum 124 | 125 | val k = words.intersect(topKeywords.map(_.word)).size + 1 126 | 127 | (1.0 / (k * (k + 1.0))) * summ 128 | } 129 | } 130 | } 131 | 132 | case class TopKeyword(word: String, score: Double) 133 | case class Sentence(sentence: String, score: Double, order: Int) 134 | 135 | /* 136 | * The Density Based Selection (DBS) above is so fucking abstracted. 
137 | * USE THIS FOR REFERENCE: 138 | * 139 | * def dbs(sentence, topKeywords) { 140 | def words = parserService.splitWords sentence 141 | words.removeAll(" ") 142 | words = words*.toLowerCase() 143 | 144 | if(words.size == 0) 145 | return 0 146 | 147 | def k = words.intersect(topKeywords.word).size() + 1 148 | def summ = 0 149 | def firstWord = [] 150 | def secondWord = [] 151 | 152 | for(def i = 0; i < words.size(); i++) { 153 | def index = topKeywords.word.indexOf(words[i]) 154 | 155 | if(index > -1) { 156 | def score = topKeywords[index].totalScore 157 | 158 | if(firstWord == []) { 159 | firstWord = [i: i, score: score] 160 | } 161 | else { 162 | secondWord = firstWord 163 | firstWord = [i: i, score: score] 164 | 165 | summ += (firstWord.score * secondWord.score) / Math.pow((firstWord.i - secondWord.i), 2) 166 | } 167 | 168 | } 169 | } 170 | 171 | def formula = ((1 / k * (k + 1)) * summ) as double 172 | 173 | return formula 174 | } 175 | 176 | Just for backup, this is for Summation Based Selection (SBS): 177 | 178 | def sbs(sentence, topKeywords) { 179 | def words = parserService.splitWords sentence 180 | words.removeAll(" ") 181 | 182 | if(words.size == 0) 183 | return 0 184 | 185 | def summ = 0 186 | 187 | words.each { word -> 188 | word = word.toLowerCase() 189 | def index = topKeywords.word.indexOf(word) 190 | def score = index == -1 ? 
0 : topKeywords[index].totalScore 191 | summ += score 192 | } 193 | 194 | def formula = (1 / Math.abs(words.size) * summ) as double 195 | 196 | return formula 197 | } 198 | 199 | */ -------------------------------------------------------------------------------- /src/test/scala/com/textteaser/summarizer/ParserSuite.scala: -------------------------------------------------------------------------------- 1 | package com.textteaser.summarizer 2 | 3 | import org.scalatest.{BeforeAndAfter, FunSuite} 4 | import net.codingwell.scalaguice.InjectorExtensions.ScalaInjector 5 | import com.google.inject.Guice 6 | 7 | class ParserSuite extends FunSuite with BeforeAndAfter { 8 | 9 | val guice = new ScalaInjector(Guice.createInjector(new GuiceModule(new Config))) 10 | val parser = guice.instance[Parser] 11 | 12 | val sentenceWithFiveWords: Array[String] = Array("1", "2", "3", "4", "5") 13 | val emptySentence: Array[String] = Array() 14 | val sentenceWithTwentyWords: Array[String] = (1 to 20).map(_.toString).toArray 15 | 16 | 17 | val textBuilder = StringBuilder.newBuilder 18 | val longTextBuilder = StringBuilder.newBuilder 19 | val stopWordsSentence = Array("hereafter", "hereby", "herein") 20 | val noStopWordsSentence = Array("Accommodation", "globalization", "emancipation") 21 | val title = Array("Accommodation", "globalization", "emancipation") 22 | val textForKeywords = "oneone twotwo twotwo threethree threethree threethree" 23 | 24 | before { 25 | longTextBuilder ++= "1914 translation by H. 
Rackham\n\n" 26 | longTextBuilder ++= "On the other hand, we denounce with righteous indignation and dislike men " 27 | longTextBuilder ++= "who are so beguiled and demoralized by the charms " 28 | longTextBuilder ++= "of pleasure of the moment, so blinded by desire, that they cannot foresee the pain and trouble " 29 | longTextBuilder ++= "that are bound to ensue; and equal blame belongs to those who fail in their duty " 30 | longTextBuilder ++= "through weakness of will, which is the same as saying through shrinking from toil and pain. " 31 | longTextBuilder ++= "These cases are perfectly simple and easy to distinguish. In a free hour, " 32 | longTextBuilder ++= "when our power of choice is untrammelled and when nothing prevents our being able to do " 33 | longTextBuilder ++= "what we like best, every pleasure is to be welcomed and every pain avoided. " 34 | longTextBuilder ++= "But in certain circumstances and owing to the claims of duty or the obligations of business " 35 | longTextBuilder ++= "it will frequently occur that pleasures have to be repudiated and annoyances accepted. " 36 | longTextBuilder ++= "The wise man therefore always holds in these matters to this principle of selection: " 37 | longTextBuilder ++= "he rejects pleasures to secure other greater pleasures, or else he endures pains to avoid worse pains." 38 | 39 | textBuilder ++= "Now that conventional thinking has been turned on its head in a paper by " 40 | textBuilder ++= "Prof Christof Wetterich at the University of Heidelberg in Germany. " 41 | textBuilder ++= "He points out that the tell-tale light emitted by atoms is also governed by the masses " 42 | textBuilder ++= "of their constituent particles, notably their electrons. The way these absorb and emit " 43 | textBuilder ++= "light would shift towards the blue part of the spectrum if atoms were to grow in mass, " 44 | textBuilder ++= "and to the red if they lost it. 
Because the frequency or ÒpitchÓ of light increases with mass, " 45 | textBuilder ++= "Prof Wetterich argues that masses could have been lower long ago. " 46 | textBuilder ++= "If they had been constantly increasing, the colours of old galaxies would look red-shifted Ð" 47 | textBuilder ++= "and the degree of red shift would depend on how far away they were from Earth. " 48 | textBuilder ++= "ÒNone of my colleagues has so far found any fault [with this],Ó he says. " 49 | textBuilder ++= "Although his research has yet to be published in a peer-reviewed publication, Nature reports " 50 | textBuilder ++= "that the idea that the universe is not expanding at all Ð or even contracting Ð is being taken " 51 | textBuilder ++= "seriously by some experts, such as Dr HongSheng Zhao, a cosmologist at the University of " 52 | textBuilder ++= "St Andrews who has worked on an alternative theory of gravity. ÒI see no fault in [Prof WetterichÕs] " 53 | textBuilder ++= "mathematical treatment,Ó he says. ÒThere were rudimentary versions of this idea two decades ago, and " 54 | textBuilder ++= "I think it is fascinating to explore this alternative representation of the cosmic expansion, where the evolution" 55 | textBuilder ++= "of the universe is like a piano keyboard played out from low to high pitch.Ó Prof Wetterich takes the detached," 56 | textBuilder ++= " even playful, view that his work marks a change in perspective, with two different views of reality: " 57 | textBuilder ++= "either the distances between galaxies grow, as in the traditional balloon picture, or the size of atoms " 58 | textBuilder ++= "shrinks, increasing their mass. Or itÕs a complex blend of the two. One benefit of this idea" 59 | textBuilder ++= "is that he is able to rid physics of the singularity at the start of time, a nasty infinity where " 60 | textBuilder ++= "the laws of physics break down. 
Instead, the Big Bang is smeared over the distant past : " 61 | textBuilder ++= "the first note of the ''cosmic pianoÕÕ was long and low-pitched. Harry Cliff, a physicist working at CERN" 62 | textBuilder ++= "who is the Science MuseumÕs fellow of modern science, thinks it striking that a universe where particles are " 63 | textBuilder ++= "getting heavier could look identical to one where space/time is expanding. ÒFinding two different " 64 | textBuilder ++= "ways of thinking about the same problem often leads to new insights,Ó he says. ÒString theory, " 65 | textBuilder ++= " for instance, is full of 'dualitiesÕ like this, which allow theorists to pick whichever view " 66 | textBuilder ++= "makes their calculations simpler.Ó If this idea turns out to be right Ð and that is a very big " 67 | textBuilder ++= "if Ð it could pave the way for new ways to think about our universe. If we are lucky, they might " 68 | textBuilder ++= "even be as revolutionary as Edwin HubbleÕs, almost a century ago. 
Roger Highfield is director " 69 | textBuilder ++= "of external affairs at the Science Museum" 70 | } 71 | 72 | test("Sentence length on empty sentence returns 0") { 73 | assert(parser.sentenceLength(emptySentence) === 0.0) 74 | } 75 | 76 | test("Sentence length on non-empty sentence returns it's length according to formula") { 77 | assert(parser.sentenceLength(sentenceWithFiveWords) === 0.25) 78 | } 79 | 80 | test("When `ideal` is equal to `sentence` array length, sentence length should be 1") { 81 | assert(parser.sentenceLength(sentenceWithTwentyWords) === 1.0) 82 | } 83 | 84 | test("Splitting string into words should return no empty strings") { 85 | assert(!parser.splitWords(longTextBuilder.toString()).contains("")) 86 | assert(!parser.splitWords(textBuilder.toString()).contains("")) 87 | } 88 | 89 | test("Splitting string into words should not produce whitespaces in output") { 90 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => """\s+""".r.findFirstIn(s) == None)) 91 | assert(parser.splitWords(textBuilder.toString()).forall(s => """\s+""".r.findFirstIn(s) == None)) 92 | } 93 | 94 | test("Splitting string into words should not produce newlines in output") { 95 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => """\r?\n+""".r.findFirstIn(s) == None)) 96 | assert(parser.splitWords(textBuilder.toString()).forall(s => """\r?\n+""".r.findFirstIn(s) == None)) 97 | } 98 | 99 | test("Splitting string into words should let digits and letters pass") { 100 | assert(parser.splitWords(longTextBuilder.toString()).forall(s => s matches """\w+""")) 101 | assert(parser.splitWords(textBuilder.toString()).forall(s => s matches """\w+""")) 102 | } 103 | 104 | test("Title score of sentence consisting solely of stop words should be 0") { 105 | assert(parser.titleScore(title, stopWordsSentence) === 0.0) 106 | } 107 | 108 | test("Title score of sentence that hasn't stop words should be 1") { 109 | assert(parser.titleScore(title, noStopWordsSentence) 
=== 1.0) 110 | } 111 | 112 | test("Keywords are sorted in descending order") { 113 | assert(parser.getKeywords(textForKeywords) === 114 | KeywordList(List(ArticleKeyword("threethree", 3), ArticleKeyword("twotwo", 2), ArticleKeyword("oneone", 1)), 6)) 115 | } 116 | 117 | test("Keywords are unique") { 118 | assert(parser.getKeywords(textForKeywords).keywords.toSet === 119 | Set(ArticleKeyword("threethree", 3), ArticleKeyword("twotwo", 2), ArticleKeyword("oneone", 1))) 120 | } 121 | 122 | test("Any keyword isn't present in stopWords list") { 123 | assert(parser.getKeywords(textForKeywords).keywords.forall(aw => !parser.stopWords.contains(aw.word))) 124 | } 125 | 126 | 127 | } 128 | --------------------------------------------------------------------------------