├── images
    ├── gdelt.png
    └── article.jpeg
├── .gitignore
├── .travis.yml
├── src
    ├── main
    │   ├── resources
    │   │   ├── com
    │   │   │   ├── gravity
    │   │   │   │   └── goose
    │   │   │   │   │   ├── images
    │   │   │   │   │       └── known-image-css.txt
    │   │   │   │   │   ├── statichtml
    │   │   │   │   │       ├── msn1_result.txt
    │   │   │   │   │       ├── issue_24_result.txt
    │   │   │   │   │       ├── issue_24.txt
    │   │   │   │   │       └── guardian1_result.txt
    │   │   │   │   │   └── text
    │   │   │   │   │       └── stopwords-en.txt
    │   │   │   └── aamend
    │   │   │   │   └── spark
    │   │   │   │       └── gdelt
    │   │   │   │           └── reference
    │   │   │   │               ├── cameoReligion.txt
    │   │   │   │               ├── cameoType.txt
    │   │   │   │               ├── cameoGroup.txt
    │   │   │   │               ├── cameoCountry.txt
    │   │   │   │               ├── cameoEthnic.txt
    │   │   │   │               └── cameoEvent.txt
    │   │   └── log4j.properties
    │   └── scala
    │   │   └── com
    │   │       ├── gravity
    │   │           └── goose
    │   │           │   ├── network
    │   │           │       ├── MaxBytesException.scala
    │   │           │       ├── AbstractHtmlFetcher.scala
    │   │           │       ├── NotHtmlException.scala
    │   │           │       ├── HttpExceptions.scala
    │   │           │       └── HtmlFetcher.scala
    │   │           │   ├── images
    │   │           │       ├── DepthTraversal.scala
    │   │           │       ├── SecretGifException.scala
    │   │           │       ├── Image.scala
    │   │           │       ├── ImageDetails.scala
    │   │           │       ├── ImageExtractor.scala
    │   │           │       ├── ImageSaver.scala
    │   │           │       └── ImageUtils.scala
    │   │           │   ├── extractors
    │   │           │       ├── TagsEvaluator.scala
    │   │           │       ├── StandardContentExtractor.scala
    │   │           │       ├── AdditionalDataExtractor.scala
    │   │           │       ├── Extractor.scala
    │   │           │       └── PublishDateExtractor.scala
    │   │           │   ├── cleaners
    │   │           │       └── StandardDocumentCleaner.scala
    │   │           │   ├── outputformatters
    │   │           │       ├── StandardOutputFormatter.scala
    │   │           │       └── OutputFormatter.scala
    │   │           │   ├── text
    │   │           │       ├── HashUtils.scala
    │   │           │       ├── StringSplitter.scala
    │   │           │       ├── string.scala
    │   │           │       ├── StringReplacement.scala
    │   │           │       ├── WordStats.scala
    │   │           │       ├── StopWords.scala
    │   │           │       └── ReplaceSequence.scala
    │   │           │   ├── utils
    │   │           │       ├── FileHelper.scala
    │   │           │       ├── URLHelper.scala
    │   │           │       └── Logging.scala
    │   │           │   ├── spark
    │   │           │       ├── package.scala
    │   │           │       └── GooseFetcher.scala
    │   │           │   ├── Goose.scala
    │   │           │   ├── Article.scala
    │   │           │   ├── Configuration.scala
    │   │           │   └── Crawler.scala
    │   │       └── aamend
    │   │           └── spark
    │   │               └── gdelt
    │   │                   ├── reference
    │   │                       ├── CountryCodes.scala
    │   │                       ├── GcamCodes.scala
    │   │                       └── CameoCodes.scala
    │   │                   └── ContentFetcher.scala
    └── test
    │   ├── scala
    │       └── com
    │       │   └── aamend
    │       │       └── spark
    │       │           └── gdelt
    │       │               ├── SparkSpec.scala
    │       │               ├── ContentFetcherTest.scala
    │       │               ├── TTest.scala
    │       │               └── GdeltParserTest.scala
    │   └── resources
    │       └── com
    │           └── aamend
    │               └── spark
    │                   └── gdelt
    │                       ├── normDaily.csv
    │                       └── normDailyByCountry.csv
├── LICENSE
└── pom.xml


/images/gdelt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/gdelt.png


--------------------------------------------------------------------------------
/images/article.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/article.jpeg


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 | derby.log
5 | spark-warehouse
6 | metastore_db
7 | .DS_Store
8 | movejar.sh


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | install: mvn --quiet install -Dmaven.javadoc.skip=true -Dgpg.skip=true
3 | script: mvn test -Dmaven.javadoc.skip=true -DargLine="-Xmx2G"


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/images/known-image-css.txt:
--------------------------------------------------------------------------------
1 | latimes.com^thumbnail
2 | cnn.com^storytext|cnn_strycntntlft
3 | foxnews.com^entry-content
4 | msn.com^articleText
5 | go.com^mediaimage
6 | buzznet.com^itembody
7 | time.com^entry-content


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/MaxBytesException.scala:
--------------------------------------------------------------------------------
 1 | package com.gravity.goose.network
 2 | 
 3 | /**
 4 |  * Created by Jim Plush
 5 |  * User: jim
 6 |  * Date: 8/14/11
 7 |  */
 8 | 
 9 | // Marker exception with no message or cause. Presumably signals a response larger than a byte limit — confirm at the throw sites.
10 | class MaxBytesException extends Exception


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/DepthTraversal.scala:
--------------------------------------------------------------------------------
 1 | package com.gravity.goose.images
 2 | 
 3 | import org.jsoup.nodes.Element
 4 | 
 5 | /**
 6 | * Created by Jim Plush
 7 | * User: jim
 8 | * Date: 8/18/11
 9 | */
10 | 
11 | // Immutable record pairing a jsoup element with two integer depth counters (parent and sibling).
12 | case class DepthTraversal(
13 |   node: Element,
14 |   parentDepth: Int,
15 |   siblingDepth: Int
16 | )


--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | # Direct log messages to stdout
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
5 | log4j.appender.stdout.Target=System.out
6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
7 | log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %40.40c:%4L - %m%n


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/TagsEvaluator.scala:
--------------------------------------------------------------------------------
 1 | package org.jsoup.select
 2 | 
 3 | import org.jsoup.nodes.Element
 4 | 
 5 | /**
 6 |  * Created by IntelliJ IDEA.
 7 |  * Author: Robbie Coleman
 8 |  * Date: 6/12/12
 9 |  * Time: 12:04 PM
10 |  */
11 | 
12 | /**
13 |  * jsoup `Evaluator` that accepts an element when its tag name is a member of `tags`.
14 |  *
15 |  * @param tags tag names to accept; compared against `element.tagName()` as-is
16 |  */
17 | class TagsEvaluator(tags: scala.collection.Set[String]) extends Evaluator {
18 | 
19 |   /** True when the tag name of `element` is in the configured set; `root` is not consulted. */
20 |   def matches(root: Element, element: Element): Boolean = {
21 |     val tagName = element.tagName()
22 |     tags(tagName)
23 |   }
24 | }
25 | 
26 | object TagsEvaluator {
27 | 
28 |   /** Convenience factory: builds the evaluator from a varargs list of tag names. */
29 |   def apply(tags: String*): TagsEvaluator = {
30 |     val tagSet: Set[String] = tags.toSet
31 |     new TagsEvaluator(tagSet)
32 |   }
33 | }


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoReligion.txt:
--------------------------------------------------------------------------------
 1 | CODE	LABEL
 2 | ADR	African Diasporic Religion
 3 | ALE	Alewi
 4 | ATH	Agnostic
 5 | BAH	Bahai Faith
 6 | BUD	Buddhism
 7 | CHR	Christianity
 8 | CON	Confucianism
 9 | CPT	Coptic
10 | CTH	Catholic
11 | DOX	Orthodox
12 | DRZ	Druze
13 | HIN	Hinduism
14 | HSD	Hasidic
15 | ITR	Indigenous Tribal Religion
16 | JAN	Jainism
17 | JEW	Judaism
18 | JHW	Jehovah's Witness
19 | LDS	Latter Day Saints
20 | MOS	Muslim
21 | MRN	Maronite
22 | NRM	New Religious Movement
23 | PAG	Pagan
24 | PRO	Protestant
25 | SFI	Sufi
26 | SHI	Shia
27 | SHN	Old Shinto School
28 | SIK	Sikh
29 | SUN	Sunni
30 | TAO	Taoist
31 | UDX	Ultra-Orthodox
32 | ZRO	Zoroastrianism


--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/SparkSpec.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt
 2 | 
 3 | import org.apache.log4j.{Level, Logger}
 4 | import org.apache.spark.sql.SparkSession
 5 | import org.scalatest.FunSuite
 6 | 
 7 | /**
 8 |  * Test mixin that runs a test body against a local SparkSession and stops the session afterwards.
 9 |  */
10 | trait SparkSpec extends FunSuite {
11 | 
12 |   // Turn off log output for the "org" and "akka" logger hierarchies.
13 |   for (loggerName <- Seq("org", "akka")) {
14 |     Logger.getLogger(loggerName).setLevel(Level.OFF)
15 |   }
16 | 
17 |   /**
18 |    * Registers a test called `name` whose body `f` receives a SparkSession.
19 |    * The session uses master "local", takes `name` as its application name, and is
20 |    * stopped in a `finally` so it is released even when `f` throws.
21 |    *
22 |    * @param name test name, also used as the Spark application name
23 |    * @param f    test body
24 |    */
25 |   def sparkTest(name: String)(f: SparkSession => Unit): Unit = {
26 | 
27 |     def newSession(): SparkSession =
28 |       SparkSession
29 |         .builder()
30 |         .appName(name)
31 |         .master("local")
32 |         .config("spark.default.parallelism", "1")
33 |         .getOrCreate()
34 | 
35 |     test(name) {
36 |       val session = newSession()
37 |       try f(session)
38 |       finally session.stop()
39 |     }
40 |   }
41 | }


--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/CountryCodes.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt.reference
 2 | 
 3 | import com.aamend.spark.gdelt.CountryCode
 4 | import com.aamend.spark.gdelt.T
 5 | import org.apache.spark.sql.{Dataset, SparkSession}
 6 | 
 7 | import scala.io.Source
 8 | 
 9 | object CountryCodes {
10 | 
11 |   /**
12 |    * Reads the tab-separated `countryInfo.txt` resource (resolved relative to this class)
13 |    * and returns one [[CountryCode]] per line after the header line.
14 |    *
15 |    * Every column is wrapped in `T(...)`; the country name is lower-cased.
16 |    *
17 |    * @param spark session whose implicits supply the encoder for the resulting Dataset
18 |    * @return Dataset built from the parsed rows
19 |    * @throws NullPointerException if the resource cannot be found on the classpath
20 |    */
21 |   def load(spark: SparkSession): Dataset[CountryCode] = {
22 |     import spark.implicits._
23 | 
24 |     val resourceName = "countryInfo.txt"
25 |     // getResourceAsStream returns null for a missing resource; fail with a message that names the file.
26 |     val stream = java.util.Objects.requireNonNull(
27 |       this.getClass.getResourceAsStream(resourceName),
28 |       s"Resource not found on classpath: $resourceName"
29 |     )
30 |     val source = Source.fromInputStream(stream, "UTF-8")
31 | 
32 |     // Materialise every row before closing: getLines() is lazy and must not outlive the stream.
33 |     val rows = try {
34 |       source.getLines().drop(1).map { line =>
35 |         val tokens = line.split("\t")
36 |         CountryCode(
37 |           iso = T(() => tokens(0)),
38 |           iso3 = T(() => tokens(1)),
39 |           isoNumeric = T(() => tokens(2)),
40 |           fips = T(() => tokens(3)),
41 |           country = T(() => tokens(4).toLowerCase())
42 |         )
43 |       }.toList
44 |     } finally {
45 |       source.close()
46 |     }
47 | 
48 |     rows.toDS()
49 |   }
50 | 
51 | }


--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/ContentFetcherTest.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt
 2 | 
 3 | import org.apache.spark.ml.Pipeline
 4 | import org.scalatest.Matchers
 5 | 
 6 | /**
 7 |  * Smoke test that pushes a single article URL through ContentFetcher and prints the result.
 8 |  * It makes no assertions, and the ImageMagick executable paths below are hard-coded.
 9 |  */
10 | class ContentFetcherTest extends SparkSpec with Matchers {
11 | 
12 |   sparkTest("testing E2E pipeline") { spark =>
13 |     import spark.implicits._
14 | 
15 |     val articleUrl = "https://www.theguardian.com/world/2018/jun/01/mariano-rajoy-ousted-as-spain-prime-minister"
16 |     val input = Seq(articleUrl).toDF("sourceUrl")
17 | 
18 |     val fetcher = new ContentFetcher()
19 |       .setInputCol("sourceUrl")
20 |       .setOutputImageUrlCol("imageUrl")
21 |       .setOutputImageBase64Col("imageBase64")
22 |       .setImagemagickConvert("/usr/local/bin/convert")
23 |       .setImagemagickIdentify("/usr/local/bin/identify")
24 | 
25 |     fetcher.transform(input).show(false)
26 |   }
27 | 
28 | }


--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/TTest.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt
 2 | 
 3 | import org.scalatest.{FlatSpec, Matchers}
 4 | 
 5 | /**
 6 |  * Spec for the `T` helper as pinned by the cases below: a thunk that yields null, throws,
 7 |  * or yields a blank string gives None; otherwise the value (trimmed, for strings) is wrapped in Some.
 8 |  */
 9 | class TTest extends FlatSpec with Matchers {
10 | 
11 |   "null" should "return None" in {
12 |     T(() => null) should be (None)
13 |     // Calling a method on null throws inside the thunk.
14 |     T(() => null.toString) should be (None)
15 |   }
16 | 
17 |   "Integer" should "return Int" in {
18 |     T(() => "1".toInt) should be (Some(1))
19 |     // Unparseable input throws inside the thunk.
20 |     T(() => "a".toInt) should be (None)
21 |   }
22 | 
23 |   "Long" should "return Long" in {
24 |     T(() => "1".toLong) should be (Some(1L))
25 |     T(() => "a".toLong) should be (None)
26 |   }
27 | 
28 |   "Float" should "return Float" in {
29 |     // The expected value is written as a Double literal, exactly as in the original spec.
30 |     T(() => "1.0".toFloat) should be (Some(1.0))
31 |     T(() => "a".toFloat) should be (None)
32 |   }
33 | 
34 |   "String" should "return String" in {
35 |     T(() => "1") should be (Some("1"))
36 |     // Surrounding whitespace is expected to be stripped.
37 |     T(() => " 1 ") should be (Some("1"))
38 |     // Blank and empty strings are expected to collapse to None.
39 |     T(() => " ") should be (None)
40 |     T(() => "") should be (None)
41 |   }
42 | 
43 | }


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoType.txt:
--------------------------------------------------------------------------------
 1 | CODE	LABEL
 2 | COP	Police forces
 3 | GOV	Government
 4 | INS	Insurgents
 5 | JUD	Judiciary
 6 | MIL	Military
 7 | OPP	Political Opposition
 8 | REB	Rebels
 9 | SEP	Separatist Rebels
10 | SPY	State Intelligence
11 | UAF	Unaligned Armed Forces
12 | AGR	Agriculture
13 | BUS	Business
14 | CRM	Criminal
15 | CVL	Civilian
16 | DEV	Development
17 | EDU	Education
18 | ELI	Elites
19 | ENV	Environmental
20 | HLH	Health
21 | HRI	Human Rights
22 | LAB	Labor
23 | LEG	Legislature
24 | MED	Media
25 | REF	Refugees
26 | MOD	Moderate
27 | RAD	Radical
28 | AMN	Amnesty International
29 | IRC	Red Cross
30 | GRP	Greenpeace
31 | UNO	United Nations
32 | PKO	Peacekeepers
33 | UIS	Unidentified State Actor
34 | IGO	Inter-Governmental Organization
35 | IMG	International Militarized Group
36 | INT	International/Transnational Generic
37 | MNC	Multinational Corporation
38 | NGM	Non-Governmental Movement
39 | NGO	Non-Governmental Organization
40 | UIS	Unidentified State Actor
41 | SET	Settler


--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/GcamCodes.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt.reference
 2 | 
 3 | import com.aamend.spark.gdelt.GcamCode
 4 | import com.aamend.spark.gdelt.T
 5 | import org.apache.spark.sql.{Dataset, SparkSession}
 6 | 
 7 | import scala.io.Source
 8 | 
 9 | object GcamCodes {
10 | 
11 |   /**
12 |    * Reads the tab-separated `gcam.txt` resource (resolved relative to this class)
13 |    * and returns one [[GcamCode]] per line after the header line.
14 |    *
15 |    * Every column is wrapped in `T(...)`, in the column order listed below.
16 |    *
17 |    * @param spark session whose implicits supply the encoder for the resulting Dataset
18 |    * @return Dataset built from the parsed rows
19 |    * @throws NullPointerException if the resource cannot be found on the classpath
20 |    */
21 |   def load(spark: SparkSession): Dataset[GcamCode] = {
22 |     import spark.implicits._
23 | 
24 |     val resourceName = "gcam.txt"
25 |     // getResourceAsStream returns null for a missing resource; fail with a message that names the file.
26 |     val stream = java.util.Objects.requireNonNull(
27 |       this.getClass.getResourceAsStream(resourceName),
28 |       s"Resource not found on classpath: $resourceName"
29 |     )
30 |     val source = Source.fromInputStream(stream, "UTF-8")
31 | 
32 |     // Materialise every row before closing: getLines() is lazy and must not outlive the stream.
33 |     val rows = try {
34 |       source.getLines().drop(1).map { line =>
35 |         val tokens = line.split("\t")
36 |         GcamCode(
37 |           gcamCode = T(() => tokens(0)),
38 |           dictionaryId = T(() => tokens(1)),
39 |           dimensionId = T(() => tokens(2)),
40 |           dictionaryType = T(() => tokens(3)),
41 |           languageCode = T(() => tokens(4)),
42 |           dictionaryHumanName = T(() => tokens(5)),
43 |           dimensionHumanName = T(() => tokens(6)),
44 |           dictionaryCitation = T(() => tokens(7))
45 |         )
46 |       }.toList
47 |     } finally {
48 |       source.close()
49 |     }
50 | 
51 |     rows.toDS()
52 |   }
53 | 
54 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/AbstractHtmlFetcher.scala:
--------------------------------------------------------------------------------
 1 | package com.gravity.goose.network
 2 | 
 3 | import com.gravity.goose.Configuration
 4 | import org.apache.http.client.HttpClient
 5 | 
 6 | /**
 7 |  * Created by IntelliJ IDEA.
 8 |  * Author: Robbie Coleman
 9 |  * Date: 10/13/12
10 |  * Time: 1:02 AM
11 |  *
12 |  * The workhorse of goose. Override the {@link com.gravity.goose.network.HtmlFetcher} within your configuration for complete control.
13 |  */
14 | trait AbstractHtmlFetcher {
15 |   /**
16 |    * Access the `url` over the internet and retrieve the HTML from it
17 |    * @param config overrides and tweaks
18 |    * @param url the address to access and retrieve content from
19 |    * @return `Some` `String` of the response from the specified `url` or `None` if failed to retrieve HTML.
20 |    */
21 |   def getHtml(config: Configuration, url: String): Option[String]
22 | 
23 |   /**
24 |    * A shared accessor for making image calls
25 |    * @return a fully configured and initialized instance for shared use
26 |    */
27 |   def getHttpClient: HttpClient
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/SecretGifException.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.images
19 | 
20 | /**
21 |  * Created by Jim Plush
22 |  * User: jim
23 |  * Date: 8/18/11
24 |  */
25 | // Marker exception carrying no message or cause of its own.
26 | class SecretGifException extends Exception
27 | 
28 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/cleaners/StandardDocumentCleaner.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.cleaners
19 | 
20 | /**
21 |  * Created by Jim Plush
22 |  * User: jim
23 |  * Date: 8/16/11
24 |  */
25 | 
26 | // Concrete cleaner that adds nothing of its own; all behaviour is inherited from DocumentCleaner.
27 | class StandardDocumentCleaner extends DocumentCleaner


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/Image.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.images
19 | 
20 | /**
21 |  * Created by Jim Plush
22 |  * User: jim
23 |  * Date: 8/18/11
24 |  */
25 | 
26 | /** Mutable holder with two string fields, both initialised to the empty string. */
27 | class Image {
28 | 
29 |   // Presumably the image's source URL or path — confirm against the code that sets it.
30 |   var imageSrc = ""
31 | 
32 |   // Presumably the image content encoded as Base64 text — confirm against the code that sets it.
33 |   var imageBase64 = ""
34 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/StandardContentExtractor.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.extractors
19 | 
20 | import com.gravity.goose.utils.Logging
21 | 
22 | 
23 | /**
24 |  * Created by Jim Plush
25 |  * User: jim
26 |  * Date: 8/15/11
27 |  */
28 | 
29 | object StandardContentExtractor extends ContentExtractor // singleton that defines no members of its own; everything comes from ContentExtractor


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/NotHtmlException.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.network
20 | 
21 | /**
22 |  * Created by Jim Plush
23 |  * User: jim
24 |  * Date: 8/14/11
25 |  */
26 | 
27 | // Exception whose message names the url for which no HTML was returned.
28 | class NotHtmlException(url: String) extends Exception {
29 |   override val getMessage: String = s"No HTML returned for url: $url"
30 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/outputformatters/StandardOutputFormatter.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.outputformatters
20 | 
21 | import com.gravity.goose.utils.Logging
22 | 
23 | /**
24 |  * Created by Jim Plush
25 |  * User: jim
26 |  * Date: 8/17/11
27 |  */
28 | 
29 | object StandardOutputFormatter extends OutputFormatter with Logging // singleton that defines no members of its own; mixes OutputFormatter with Logging


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/AdditionalDataExtractor.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.extractors
19 | 
20 | import org.jsoup.nodes.Element
21 | 
22 | /**
23 | * Default extractor for data not already held by the {@link com.gravity.goose.Article} class. This base implementation returns an empty map; subclass it and override `extract` to supply values.
24 | */
25 | class AdditionalDataExtractor extends Extractor[Map[String, String]] {
26 |   def extract(rootElement: Element): Map[String, String] = { // rootElement is not inspected by this default
27 |     Map.empty
28 |   }
29 | }
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/msn1_result.txt:
--------------------------------------------------------------------------------
1 | "Head to the supermarket an hour before closing time. Some stores mark down prepared foods and bakery items then because they can't sell them the following day. You could get a rotisserie chicken or freshly baked cookies for 50 percent off, or nab two sushi meals for the price of one. If you're planning to host a party or some other gathering, it's worth your time to ask the deli or bakery manager for a 5 to 10 percent discount off your catering order. Also, keep an eye out for online coupons: Some grocery stores accept coupons printed out from sites like TheGroceryGame.com, ShopAtHome.com, and CouponMom.com, even though they rarely publicize the fact. (Find out your store's policy at the customer-service counter.) It also pays to check the market's own website. You could find weekly deals there that it doesn't advertise anywhere else, including its in-store flyers.
2 | 
3 | "And even though it's convenient to do all your shopping in one place, avoid going to a grocery store for kitchen supplies, like measuring cups and cookie sheets, or seasonal items, like holiday decorations and gift bags. These products will have inflated prices. Buy them at a big-box chain, like Target or Walmart, instead."
4 | 
5 | More from Bing and MSN Lifestyle Site Search: Get additional content on saving on your grocery bill


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/HashUtils.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | import java.security.MessageDigest
22 | 
23 | /**
24 | * Created by Jim Plush
25 | * User: jim
26 | * Date: 8/14/11
27 | */
28 | 
29 | object HashUtils {
30 | 
31 |   /**
32 |    * Computes the MD5 digest of `s` and renders it as a 32-character lowercase hex string.
33 |    *
34 |    * The string is encoded as UTF-8 before hashing, so the result does not depend on the
35 |    * JVM's default charset (plain `getBytes` would give different hashes for non-ASCII
36 |    * text on differently configured machines).
37 |    *
38 |    * @param s text to hash
39 |    * @return lowercase hexadecimal MD5 of the UTF-8 bytes of `s`
40 |    */
41 |   def md5(s: String): String = {
42 |     // A freshly obtained MessageDigest needs no reset().
43 |     val digest = MessageDigest.getInstance("MD5").digest(s.getBytes("UTF-8"))
44 | 
45 |     // Mask to an unsigned value so negative bytes do not format as sign-extended ints.
46 |     digest.map(b => "%02x".format(b & 0xFF)).mkString
47 |   }
48 | 
49 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StringSplitter.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 3:53 PM
26 | */
27 | 
28 | import java.util.regex.Pattern
29 | 
30 | /**
31 |  * Splits strings on a regular expression that is compiled once, at construction.
32 |  *
33 |  * Note: the no-argument constructor leaves the pattern unset (null), so calling `split`
34 |  * with a non-empty input on such an instance dereferences null.
35 |  */
36 | class StringSplitter {
37 | 
38 |   // Compiled regex used by `split`; only the String constructor assigns it.
39 |   private var pattern: Pattern = _
40 | 
41 |   /** @param pattern regular expression to split on */
42 |   def this(pattern: String) = {
43 |     this()
44 |     this.pattern = Pattern.compile(pattern)
45 |   }
46 | 
47 |   /**
48 |    * @param input text to split
49 |    * @return `string.emptyArray` when `string.isNullOrEmpty(input)` holds, otherwise the pieces produced by `Pattern.split`
50 |    */
51 |   def split(input: String): Array[String] =
52 |     if (string.isNullOrEmpty(input)) string.emptyArray
53 |     else pattern.split(input)
54 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/utils/FileHelper.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.utils
20 | 
21 | import org.apache.commons.io.IOUtils
22 | import java.io.{IOException, InputStream}
23 | 
24 | 
25 | /**
26 |  * Created by Jim Plush
27 |  * User: jim
28 |  * Date: 8/16/11
29 |  */
30 | 
31 | object FileHelper extends Logging {
32 | 
33 |   /**
34 |    * Reads a classpath resource fully into a String, decoding it as UTF-8.
35 |    *
36 |    * @param filename resource name handed to `cls.getResourceAsStream`
37 |    * @param cls      class whose `getResourceAsStream` is used to locate the resource
38 |    * @return the resource content, or "" when an IOException occurs while reading (a warning is logged)
39 |    */
40 |   def loadResourceFile[A](filename: String, cls: Class[A]): String = {
41 |     // NOTE(review): getResourceAsStream yields null for a missing resource; that null is handed to IOUtils unchanged
42 |     val is: InputStream = cls.getResourceAsStream(filename)
43 |     try {
44 |       IOUtils.toString(is, "UTF-8")
45 |     } catch {
46 |       case e: IOException =>
47 |         warn(e, e.toString)
48 |         ""
49 |     } finally {
50 |       // always release the stream, on success and on failure; closeQuietly tolerates a null stream
51 |       IOUtils.closeQuietly(is)
52 |     }
53 |   }
54 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/string.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 12:11 AM
26 | */
27 | 
28 | /** Small String helpers used by the text package. */
29 | object string {
30 |   val empty: String = ""
31 |   // note: a one-element array holding the empty string, not a zero-length array
32 |   val emptyArray: Array[String] = Array[String](empty)
33 |   // splitter built from the pattern " " (a single space)
34 |   var SPACE_SPLITTER: StringSplitter = new StringSplitter(" ")
35 | 
36 |   /** True when `input` is null or has length zero. */
37 |   def isNullOrEmpty(input: String): Boolean = input == null || input.length == 0
38 | 
39 |   /** Parses `input` as an Int; yields None when `toInt` throws any Exception. */
40 |   def tryToInt(input: String): Option[Int] = {
41 |     try Some(input.toInt)
42 |     catch {
43 |       case _: Exception => None
44 |     }
45 |   }
46 | }
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/Extractor.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.extractors
19 | 
20 | import org.jsoup.nodes.Element
21 | 
22 | /**
23 | * Created by IntelliJ IDEA.
24 | * User: robbie
25 | * Date: 5/19/11
26 | * Time: 2:45 PM
27 | */
28 | /**
29 |  * Encapsulates the process of extracting some value of type `T` from an article.
30 |  * @tparam T the type of value the implementing class returns from `extract`
31 |  */
32 | trait Extractor[T] {
33 |   /**
34 |    * Extracts a `T` from the given jsoup `Element`.
35 |    *
36 |    * @param rootElement element to extract from; historically documented as passed in by com.jimplush.goose.ContentExtractor after the article has been parsed (NOTE(review): that package name looks stale, confirm the current caller)
37 |    * @return an instance of type `T`
38 |    */
39 |   def extract(rootElement: Element): T
40 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageDetails.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.images
19 | 
20 | /**
21 |  * Created by Jim Plush
22 |  * User: jim
23 |  * Date: 8/18/11
24 |  */
25 | 
26 | /**
27 | * holds the details of the result of inspecting an image
28 | * @author Jim Plush
29 | *
30 | */
31 | class ImageDetails {
32 |   /** the width of the image */
33 |   private var width: Int = 0
34 |   /** height of the image */
35 |   private var height: Int = 0
36 |   /** the mimeType of the image JPEG / PNG; null until setMimeType is called */
37 |   private var mimeType: String = _
38 | 
39 |   /** @return the stored width (0 until set) */
40 |   def getWidth: Int = width
41 | 
42 |   /** Stores `width` as the image width. */
43 |   def setWidth(width: Int): Unit = {
44 |     this.width = width
45 |   }
46 | 
47 |   /** @return the stored height (0 until set) */
48 |   def getHeight: Int = height
49 | 
50 |   /** Stores `height` as the image height. */
51 |   def setHeight(height: Int): Unit = {
52 |     this.height = height
53 |   }
54 | 
55 |   /** @return the stored mime type */
56 |   def getMimeType: String = mimeType
57 | 
58 |   /** Stores `mimeType` as the image mime type. */
59 |   def setMimeType(mimeType: String): Unit = {
60 |     this.mimeType = mimeType
61 |   }
62 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StringReplacement.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 11:38 AM
26 | */
27 | 
28 | import java.util.regex.Pattern
29 | 
30 | object StringReplacement {
31 |   /** Compiles `pattern` and pairs it with `replaceWith`; a null or empty pattern is rejected. */
32 |   def compile(pattern: String, replaceWith: String): StringReplacement = {
33 |     if (string.isNullOrEmpty(pattern)) throw new IllegalArgumentException("Patterns must not be null or empty!")
34 |     new StringReplacement(Pattern.compile(pattern), replaceWith)
35 |   }
36 | }
37 | 
38 | class StringReplacement {
39 |   private var pattern: Pattern = null
40 |   private var replaceWith: String = null
41 | 
42 |   private def this(pattern: Pattern, replaceWith: String) = {
43 |     this()
44 |     this.pattern = pattern
45 |     this.replaceWith = replaceWith
46 |   }
47 | 
48 |   /** Replaces every match of the pattern in `input`; null/empty input yields `string.empty`. */
49 |   def replaceAll(input: String): String =
50 |     if (string.isNullOrEmpty(input)) string.empty
51 |     else pattern.matcher(input).replaceAll(replaceWith)
52 | }
53 | 
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/src/test/resources/com/aamend/spark/gdelt/normDaily.csv:
--------------------------------------------------------------------------------
  1 | 19200101,87765
  2 | 19200102,121054
  3 | 19200103,153580
  4 | 19200104,110228
  5 | 19200105,63362
  6 | 19200106,85
  7 | 19790101,661
  8 | 19790102,976
  9 | 19790103,1060
 10 | 19790104,950
 11 | 19790105,1027
 12 | 19790106,644
 13 | 19790107,884
 14 | 19790108,1291
 15 | 19790109,1287
 16 | 19790110,1290
 17 | 19790111,866
 18 | 19790112,1443
 19 | 19790113,726
 20 | 19790114,851
 21 | 19790115,1138
 22 | 19790116,1290
 23 | 19790117,1279
 24 | 19790118,1353
 25 | 19790119,1324
 26 | 19790120,1165
 27 | 19790121,744
 28 | 19790122,1065
 29 | 19790123,1277
 30 | 19790124,1079
 31 | 19790125,1156
 32 | 19790126,1563
 33 | 19790127,780
 34 | 19790128,983
 35 | 19790129,1285
 36 | 19790130,1168
 37 | 19790131,925
 38 | 19790201,902
 39 | 19790202,1625
 40 | 19790203,738
 41 | 19790204,602
 42 | 19790205,1332
 43 | 19790206,1170
 44 | 19790207,1218
 45 | 19790208,1286
 46 | 19790209,1383
 47 | 19790210,825
 48 | 19790211,714
 49 | 19790212,944
 50 | 19790213,1056
 51 | 19790214,1184
 52 | 19790215,1159
 53 | 19790216,1473
 54 | 19790217,1216
 55 | 19790218,1072
 56 | 19790219,1581
 57 | 19790220,1246
 58 | 19790221,1617
 59 | 19790222,1165
 60 | 19790223,1685
 61 | 19790224,875
 62 | 19790225,1255
 63 | 19790226,1364
 64 | 19790227,1247
 65 | 19790228,1516
 66 | 19790301,789
 67 | 19790302,1237
 68 | 19790303,507
 69 | 19790304,648
 70 | 19790305,839
 71 | 19790306,864
 72 | 19790307,842
 73 | 19790308,648
 74 | 19790309,1145
 75 | 19790310,738
 76 | 19790311,719
 77 | 19790312,1465
 78 | 19790313,969
 79 | 19790314,1034
 80 | 19790315,1420
 81 | 19790316,2019
 82 | 19790317,1349
 83 | 19790318,1056
 84 | 19790319,1312
 85 | 19790320,1450
 86 | 19790321,1387
 87 | 19790322,1354
 88 | 19790323,1630
 89 | 19790324,1125
 90 | 19790325,967
 91 | 19790326,1043
 92 | 19790327,1297
 93 | 19790328,1244
 94 | 19790329,1286
 95 | 19790330,1661
 96 | 19790331,1120
 97 | 19790401,1240
 98 | 19790402,1038
 99 | 19790403,1193
100 | 19790404,1276
101 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/HttpExceptions.scala:
--------------------------------------------------------------------------------
 1 | package com.gravity.goose.network
 2 | 
 3 | /**
 4 |  * Created by IntelliJ IDEA.
 5 |  * Author: Robbie Coleman
 6 |  * Date: 11/2/11
 7 |  * Time: 10:25 AM
 8 |  */
 9 | 
10 | /** Exception whose message is prefixed with the concrete class name and, when an inner exception is given, a summary of it. */
11 | class LoggableException(msg: String, innerEx: Exception = null) extends Exception(msg, innerEx) {
12 |   // lazy val: the message text is assembled once, on first access
13 |   override lazy val getMessage = {
14 |     val causeSuffix =
15 |       if (innerEx == null) ""
16 |       else "%n\tand inner Exception of type %s:%n\t\tmessage: %s".format(innerEx.getClass.getName, innerEx.getMessage)
17 |     getClass.getName + " ==> " + msg + causeSuffix
18 |   }
19 | }
20 | 
21 | class NotFoundException(url: String) extends LoggableException("SERVER RETURNED 404 FOR LINK: " + url) // produced by HttpStatusValidator.validate for status 404
22 | class BadRequestException(url: String) extends LoggableException("Bad Request for URL: " + url) // produced by HttpStatusValidator.validate for status 400
23 | class NotAuthorizedException(url: String, statusCode: Int = 403) extends LoggableException("Not authorized (statusCode: %d) to access URL: %s".format(statusCode, url)) // validate uses this for every other status strictly between 400 and 500
24 | class ServerErrorException(url: String, statusCode: Int = 500) extends LoggableException("Server Error! Status code returned: %d for URL: %s".format(statusCode, url)) // validate uses this for any status above 499
25 | class UnhandledStatusCodeException(url: String, statusCode: Int)  extends LoggableException("Received HTTP statusCode: %d from URL: %s and did not know how to handle it!".format(statusCode, url)) // validate's fallback for all remaining status codes
26 | 
27 | object HttpStatusValidator {
28 |   /** Maps an HTTP status code to Right("OK") for 200, otherwise to a Left holding the exception matching the code. */
29 |   def validate(url: String, statusCode: Int): Either[Exception, String] =
30 |     if (statusCode == 200) Right("OK")
31 |     else if (statusCode == 400) Left(new BadRequestException(url))
32 |     else if (statusCode == 404) Left(new NotFoundException(url))
33 |     else if (statusCode > 400 && statusCode < 500) Left(new NotAuthorizedException(url, statusCode))
34 |     else if (statusCode > 499) Left(new ServerErrorException(url, statusCode))
35 |     else Left(new UnhandledStatusCodeException(url, statusCode))
36 | }
37 | 
38 | class ImageFetchException(imgSrc: String, ex: Exception = null) extends LoggableException("Failed to fetch image file from imgSrc: " + imgSrc, ex) // ex (null by default) is forwarded to LoggableException as the inner exception


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/WordStats.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | import java.util.ArrayList
22 | import java.util.List
23 | 
24 | /**
25 | * User: Jim Plush
26 | * Date: Oct 29, 2010
27 | * Time: 3:59:44 PM
28 | */
29 | object WordStats {
30 |   var EMPTY: WordStats = new WordStats // one shared instance; NOTE(review): it is a reassignable var and the instance is mutable, so callers should treat it as read-only
31 | }
32 | 
33 | class WordStats {
34 | 
35 |   /**
36 |    * total number of stopwords or good words that we can calculate
37 |    */
38 |   var stopWordCount: Int = 0
39 | 
40 |   /**
41 |    * total number of words on a node
42 |    */
43 |   var wordCount: Int = 0
44 | 
45 |   /**
46 |    * holds an actual list of the stop words we found
47 |    */
48 |   var stopWords: List[String] = new ArrayList[String]
49 | 
50 |   // Java-style accessors over the public vars above
51 | 
52 |   /** @return the list currently held in `stopWords` */
53 |   def getStopWords: List[String] = stopWords
54 | 
55 |   /** Replaces the held stop-word list with `words` (the reference is stored, no copy is made). */
56 |   def setStopWords(words: List[String]): Unit = {
57 |     stopWords = words
58 |   }
59 | 
60 |   /** @return the current `stopWordCount` */
61 |   def getStopWordCount: Int = stopWordCount
62 | 
63 |   /** Overwrites `stopWordCount` with `wordcount`. */
64 |   def setStopWordCount(wordcount: Int): Unit = {
65 |     stopWordCount = wordcount
66 |   }
67 | 
68 |   /** @return the current `wordCount` */
69 |   def getWordCount: Int = wordCount
70 | 
71 |   /** Overwrites `wordCount` with `cnt`. */
72 |   def setWordCount(cnt: Int): Unit = {
73 |     wordCount = cnt
74 |   }
75 | 
76 | }
77 | 
78 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/spark/package.scala:
--------------------------------------------------------------------------------
 1 | package com.gravity.goose
 2 | 
 3 | import java.sql.Date
 4 | 
 5 | import org.apache.commons.lang.StringUtils
 6 | 
 7 | import scala.util.Try
 8 | 
 9 | package object spark {
10 | 
11 |   val ANNOTATOR_TITLE = "title"
12 |   val ANNOTATOR_CONTENT = "content"
13 |   val ANNOTATOR_DESCRIPTION = "description"
14 |   val ANNOTATOR_KEYWORDS = "keywords"
15 |   val ANNOTATOR_PUBLISH_DATE = "publishDate"
16 | 
17 |   // List of supported annotators
18 |   val ANNOTATORS = Array(ANNOTATOR_TITLE, ANNOTATOR_CONTENT, ANNOTATOR_DESCRIPTION, ANNOTATOR_KEYWORDS, ANNOTATOR_PUBLISH_DATE)
19 | 
20 |   /**
21 |    * Lazily scrapes every URL of `it` with `goose`, producing one GooseArticle per URL.
22 |    * A non-fatal failure while extracting a URL is swallowed by `Try` and yields `GooseArticle(url)` with default fields.
23 |    * @param it    URLs to scrape
24 |    * @param goose extractor invoked through `extractContent(url)`
25 |    * @return an iterator that performs the scraping on demand, as it is consumed
26 |    */
27 |   def scrapeArticles(it: Iterator[String], goose: Goose): Iterator[GooseArticle] = {
28 |     it.map(url => {
29 |       Try {
30 |         val article = goose.extractContent(url)
31 |         GooseArticle(
32 |           url = url,
33 |           title = if(StringUtils.isNotEmpty(article.title)) Some(article.title) else None,
34 |           content = if(StringUtils.isNotEmpty(article.cleanedArticleText)) Some(article.cleanedArticleText.replaceAll("\\n+", "\n")) else None,
35 |           description = if(StringUtils.isNotEmpty(article.metaDescription)) Some(article.metaDescription) else None,
36 |           // drop blank entries (from inputs such as "a,,b" or "a, ") so that no empty keyword is emitted
37 |           keywords = if(StringUtils.isNotEmpty(article.metaKeywords)) article.metaKeywords.split(",").map(_.trim.toUpperCase).filter(_.nonEmpty) else Array.empty[String],
38 |           publishDate = if(article.publishDate != null) Some(new Date(article.publishDate.getTime)) else None,
39 |           image = None
40 |         )
41 |       } getOrElse GooseArticle(url)
42 |     })
43 |   }
44 | 
45 |   case class GooseArticle(url: String,
46 |                           title: Option[String] = None,
47 |                           content: Option[String] = None,
48 |                           description: Option[String] = None,
49 |                           keywords: Array[String] = Array.empty[String],
50 |                           publishDate: Option[Date] = None,
51 |                           image: Option[String] = None)
52 | }
53 | 


--------------------------------------------------------------------------------
/src/test/resources/com/aamend/spark/gdelt/normDailyByCountry.csv:
--------------------------------------------------------------------------------
  1 | 19200101,,2396
  2 | 19200101,AC,10
  3 | 19200101,AE,200
  4 | 19200101,AF,699
  5 | 19200101,AG,55
  6 | 19200101,AJ,48
  7 | 19200101,AL,20
  8 | 19200101,AM,37
  9 | 19200101,AO,76
 10 | 19200101,AR,68
 11 | 19200101,AS,2650
 12 | 19200101,AU,165
 13 | 19200101,AY,12
 14 | 19200101,BA,72
 15 | 19200101,BB,47
 16 | 19200101,BC,18
 17 | 19200101,BD,9
 18 | 19200101,BE,104
 19 | 19200101,BF,68
 20 | 19200101,BG,456
 21 | 19200101,BH,28
 22 | 19200101,BK,15
 23 | 19200101,BL,64
 24 | 19200101,BM,70
 25 | 19200101,BN,49
 26 | 19200101,BO,67
 27 | 19200101,BR,246
 28 | 19200101,BT,180
 29 | 19200101,BU,41
 30 | 19200101,BX,36
 31 | 19200101,BY,9
 32 | 19200101,CA,1720
 33 | 19200101,CB,174
 34 | 19200101,CD,43
 35 | 19200101,CE,351
 36 | 19200101,CF,40
 37 | 19200101,CG,11
 38 | 19200101,CH,2595
 39 | 19200101,CI,93
 40 | 19200101,CJ,1
 41 | 19200101,CM,90
 42 | 19200101,CO,100
 43 | 19200101,CS,14
 44 | 19200101,CT,12
 45 | 19200101,CU,96
 46 | 19200101,CW,17
 47 | 19200101,CY,306
 48 | 19200101,DA,76
 49 | 19200101,DJ,19
 50 | 19200101,DR,18
 51 | 19200101,EC,19
 52 | 19200101,EG,350
 53 | 19200101,EI,700
 54 | 19200101,EK,3
 55 | 19200101,EN,25
 56 | 19200101,ER,1
 57 | 19200101,ES,47
 58 | 19200101,ET,145
 59 | 19200101,EZ,18
 60 | 19200101,FI,50
 61 | 19200101,FJ,53
 62 | 19200101,FR,974
 63 | 19200101,GA,29
 64 | 19200101,GB,6
 65 | 19200101,GG,19
 66 | 19200101,GH,450
 67 | 19200101,GI,2
 68 | 19200101,GJ,9
 69 | 19200101,GK,7
 70 | 19200101,GL,7
 71 | 19200101,GM,604
 72 | 19200101,GQ,11
 73 | 19200101,GR,254
 74 | 19200101,GT,80
 75 | 19200101,GV,72
 76 | 19200101,GY,60
 77 | 19200101,GZ,8
 78 | 19200101,HA,88
 79 | 19200101,HK,518
 80 | 19200101,HO,36
 81 | 19200101,HR,51
 82 | 19200101,HU,99
 83 | 19200101,IC,23
 84 | 19200101,ID,349
 85 | 19200101,IN,5068
 86 | 19200101,IR,3040
 87 | 19200101,IS,1931
 88 | 19200101,IT,421
 89 | 19200101,IV,45
 90 | 19200101,IZ,3500
 91 | 19200101,JA,1203
 92 | 19200101,JE,101
 93 | 19200101,JM,138
 94 | 19200101,JO,146
 95 | 19200101,KE,353
 96 | 19200101,KG,17
 97 | 19200101,KN,776
 98 | 19200101,KR,19
 99 | 19200101,KS,552
100 | 19200101,KU,249
101 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StopWords.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose.text
20 | 
21 | /**
22 |  * Created by Jim Plush
23 |  * User: jim
24 |  * Date: 8/16/11
25 |  */
26 | 
27 | import java.util._
28 | import com.gravity.goose.utils.FileHelper
29 | 
30 | object StopWords {
31 | 
32 |   // Matches any character outside the listed Unicode categories (Ll, Lu, Lt, Lo letters, Nd digits, Pc connector punctuation) and white-space.
33 |   private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
34 | 
35 |   // Split on LF or CRLF instead of the platform's line.separator: the line endings are fixed by the packaged
36 |   // resource file, not by the OS the code runs on, so a mismatch would leave one giant entry or trailing "\r"s.
37 |   val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split("\\r?\\n").toSet
38 | 
39 |   /** Removes every character matched by PUNCTUATION from `str`. */
40 |   def removePunctuation(str: String): String = PUNCTUATION.replaceAll(str)
41 | 
42 |   /**
43 |    * Counts the words of `content` (punctuation removed, split by `string.SPACE_SPLITTER`) and collects, lower-cased,
44 |    * those contained in STOP_WORDS. Null or empty content returns the shared `WordStats.EMPTY` instance.
45 |    */
46 |   def getStopWordCount(content: String): WordStats = {
47 |     if (string.isNullOrEmpty(content)) return WordStats.EMPTY
48 |     val strippedInput: String = removePunctuation(content)
49 |     val candidateWords: Array[String] = string.SPACE_SPLITTER.split(strippedInput)
50 | 
51 |     val overlappingStopWords: List[String] = new ArrayList[String]
52 |     candidateWords.foreach { w =>
53 |       val lowered = w.toLowerCase // lower-case once, reuse for lookup and storage
54 |       if (STOP_WORDS.contains(lowered)) overlappingStopWords.add(lowered)
55 |     }
56 |     val ws: WordStats = new WordStats
57 |     ws.setWordCount(candidateWords.length)
58 |     ws.setStopWordCount(overlappingStopWords.size)
59 |     ws.setStopWords(overlappingStopWords)
60 |     ws
61 |   }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/GdeltParserTest.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt
 2 | 
 3 | import org.scalatest.Matchers
 4 | 
 5 | import scala.io.Source
 6 | 
 7 | class GdeltParserTest extends SparkSpec with Matchers {
 8 | 
 9 |   // Smoke test: each sample resource is read as a Dataset of lines, parsed and shown; no exception should be thrown
10 |   sparkTest("loading GDELT universe") { spark =>
11 |     import spark.implicits._
12 |     def sample(name: String) = Source.fromInputStream(this.getClass.getResourceAsStream(name), "UTF-8").getLines().toSeq.toDS()
13 |     sample("gkg.csv").map(GdeltParser.parseGkgV2).show()
14 |     sample("gkgT.csv").map(GdeltParser.parseGkgV2).show()
15 |     sample("gkg1.csv").map(GdeltParser.parseGkgV1).show()
16 |     sample("gkg1Count.csv").map(GdeltParser.parseGkgCountV1).show()
17 |     sample("events.csv").map(GdeltParser.parseEventV2).show()
18 |     sample("events1.csv").map(GdeltParser.parseEventV1).show()
19 |     sample("eventsT.csv").map(GdeltParser.parseEventV2).show()
20 |     sample("mentions.csv").map(GdeltParser.parseMentionV2).show()
21 |     sample("mentionsT.csv").map(GdeltParser.parseMentionV2).show()
22 |     sample("normDaily.csv").map(GdeltParser.parseNormDaily).show()
23 |     sample("normDailyByCountry.csv").map(GdeltParser.parseNormDailyByCountry).show()
24 |   }
25 | 
26 |   // Smoke test: every reference dataset must load and show without an exception being thrown
27 |   sparkTest("loading GDELT reference data") { spark =>
28 |     spark.loadCountryCodes.show()
29 |     spark.loadGcams.show()
30 |     spark.loadCameoEventCodes.show()
31 |     spark.loadCameoTypeCodes.show()
32 |     spark.loadCameoGroupCodes.show()
33 |     spark.loadCameoEthnicCodes.show()
34 |     spark.loadCameoReligionCodes.show()
35 |     spark.loadCameoCountryCodes.show()
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Goose.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | package com.gravity.goose
20 | 
21 | import network.HtmlFetcher
22 | import java.io.File
23 | 
24 | /**
25 |  * Created by Jim Plush - Gravity.com
26 |  * Date: 8/14/11
27 |  */
28 | class Goose(config: Configuration = new Configuration) {
29 |   // verify the local storage directory before any extraction is attempted
30 |   initializeEnvironment()
31 | 
32 |   /**
33 |    * Extracts an Article for `url`, handing `rawHTML` to the CrawlCandidate.
34 |    * @param url     The url that you want to extract
35 |    * @param rawHTML HTML passed through to the CrawlCandidate as-is
36 |    */
37 |   def extractContent(url: String, rawHTML: String): Article =
38 |     sendToActor(CrawlCandidate(config, url, rawHTML))
39 | 
40 |   /**
41 |    * Extracts an Article for `url`; the CrawlCandidate is built with null raw HTML.
42 |    * @param url The url that you want to extract
43 |    */
44 |   def extractContent(url: String): Article =
45 |     sendToActor(CrawlCandidate(config, url, null))
46 | 
47 |   /** Shuts down the connection manager of HtmlFetcher's HTTP client. */
48 |   def shutdownNetwork(): Unit = {
49 |     HtmlFetcher.getHttpClient.getConnectionManager.shutdown()
50 |   }
51 | 
52 |   /** Crawls the candidate with a new Crawler built from this instance's config. */
53 |   def sendToActor(crawlCandidate: CrawlCandidate): Article =
54 |     new Crawler(config).crawl(crawlCandidate)
55 | 
56 |   /**
57 |    * Tries to create `config.localStoragePath` when it is not a directory, ignoring exceptions from that attempt,
58 |    * then throws an Exception if the path still is not a directory or is not writable.
59 |    */
60 |   def initializeEnvironment(): Unit = {
61 |     val storageDir = new File(config.localStoragePath)
62 |     try {
63 |       if (!storageDir.isDirectory) storageDir.mkdirs()
64 |     } catch {
65 |       case _: Exception => // ignored here; the checks below report an unusable directory
66 |     }
67 |     if (!storageDir.isDirectory)
68 |       throw new Exception(config.localStoragePath + " directory does not seem to exist, you need to set this for image processing downloads")
69 |     if (!storageDir.canWrite)
70 |       throw new Exception(config.localStoragePath + " directory is not writable, you need to set this for image processing downloads")
71 |     // todo cleanup any jank that may be in the tmp folder currently
72 |   }
73 | }
74 | 
75 | object Goose {
76 |   implicit val config: Configuration = new Configuration // default Configuration, exposed as an implicit value
77 |   val logPrefix: String = "goose: " // presumably prepended to log messages; confirm against callers
78 | }


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/issue_24_result.txt:
--------------------------------------------------------------------------------
1 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
2 | 
3 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader). The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.
4 | 
5 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
6 | 
7 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.


--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/CameoCodes.scala:
--------------------------------------------------------------------------------
 1 | package com.aamend.spark.gdelt.reference
 2 | 
 3 | import com.aamend.spark.gdelt.CameoCode
 4 | import com.aamend.spark.gdelt.T
 5 | import org.apache.spark.sql.{Dataset, SparkSession}
 6 | 
 7 | import scala.io.Source
 8 | 
 9 | object CameoCodes {
10 | 
11 |   def loadEventCode(spark: SparkSession): Dataset[CameoCode] = {
12 |     import spark.implicits._
13 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoEvent.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
14 |       val tokens = line.split("\t")
15 |       CameoCode(
16 |         cameoCode = T(()=>tokens(0).toUpperCase()),
17 |         cameoValue = T(()=>tokens(1).toLowerCase())
18 |       )
19 |     }).toDS()
20 |   }
21 | 
22 |   def loadTypeCode(spark: SparkSession): Dataset[CameoCode] = {
23 |     import spark.implicits._
24 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoType.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
25 |       val tokens = line.split("\t")
26 |       CameoCode(
27 |         cameoCode = T(()=>tokens(0).toUpperCase()),
28 |         cameoValue = T(()=>tokens(1).toLowerCase())
29 |       )
30 |     }).toDS()
31 |   }
32 | 
33 |   def loadGroupCode(spark: SparkSession): Dataset[CameoCode] = {
34 |     import spark.implicits._
35 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoGroup.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
36 |       val tokens = line.split("\t")
37 |       CameoCode(
38 |         cameoCode = T(()=>tokens(0).toUpperCase()),
39 |         cameoValue = T(()=>tokens(1).toLowerCase())
40 |       )
41 |     }).toDS()
42 |   }
43 | 
44 |   def loadEthnicCode(spark: SparkSession): Dataset[CameoCode] = {
45 |     import spark.implicits._
46 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoEthnic.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
47 |       val tokens = line.split("\t")
48 |       CameoCode(
49 |         cameoCode = T(()=>tokens(0).toUpperCase()),
50 |         cameoValue = T(()=>tokens(1).toLowerCase())
51 |       )
52 |     }).toDS()
53 |   }
54 | 
55 |   def loadReligionCode(spark: SparkSession): Dataset[CameoCode] = {
56 |     import spark.implicits._
57 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoReligion.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
58 |       val tokens = line.split("\t")
59 |       CameoCode(
60 |         cameoCode = T(()=>tokens(0).toUpperCase()),
61 |         cameoValue = T(()=>tokens(1).toLowerCase())
62 |       )
63 |     }).toDS()
64 |   }
65 | 
66 |   def loadCountryCode(spark: SparkSession): Dataset[CameoCode] = {
67 |     import spark.implicits._
68 |     Source.fromInputStream(this.getClass.getResourceAsStream("cameoCountry.txt"), "UTF-8").getLines().toSeq.drop(1).map(line => {
69 |       val tokens = line.split("\t")
70 |       CameoCode(
71 |         cameoCode = T(()=>tokens(0).toUpperCase()),
72 |         cameoValue = T(()=>tokens(1).toLowerCase())
73 |       )
74 |     }).toDS()
75 |   }
76 | }
77 | 
78 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.extractors
19 | 
20 | import java.sql.Date
21 | import javax.xml.datatype.DatatypeFactory
22 | 
23 | import com.gravity.goose.utils.Logging
24 | import org.jsoup.nodes.Element
25 | 
/**
 * Implement this class to extract the [[java.sql.Date]] on which an article was published.
 *
 * Created by IntelliJ IDEA.
 * User: robbie
 * Date: 5/19/11
 * Time: 2:50 PM
 */
abstract class PublishDateExtractor extends Extractor[Date] {
  /**
   * Intended to search the DOM and identify the date on which this article was published.
   *
   * NOTE(review): the previous documentation referred to `com.jimplush.goose.ContentExtractor`
   * and `Article#setPublishDate`; the code now lives under `com.gravity.goose`, and the
   * result is presumably stored in `Article.publishDate` - confirm against the caller.
   *
   * @param rootElement root element of the article after it has been parsed
   * @return the date on which this particular article was published, or `null` if no date could be found
   */
  def extract(rootElement: Element): Date
}
45 | 
object PublishDateExtractor extends Logging {
  val logPrefix: String = "PublishDateExtractor: "

  // Created on first use and then shared by every parse call.
  lazy val datatypeFactory: DatatypeFactory = DatatypeFactory.newInstance()

  /**
    * Returns the earlier of two non-null dates; when both denote the same
    * instant the right-hand argument is returned.
    */
  def minDate(lhs: java.sql.Date, rhs: java.sql.Date): java.sql.Date =
    if (rhs.getTime <= lhs.getTime) rhs else lhs

  /**
    * Parses an ISO 8601 date/time string without throwing.
    *
    * @param txt candidate text; may be null or empty
    * @return the parsed date, or `None` for null/empty input or text the
    *         XML calendar parser rejects (the rejection is logged at info level)
    */
  def safeParseISO8601Date(txt: String): Option[java.sql.Date] = txt match {
    case null | "" => None
    case _ =>
      try {
        val calendar = datatypeFactory.newXMLGregorianCalendar(txt).toGregorianCalendar
        Option(new Date(calendar.getTime.getTime))
      } catch {
        case _: Exception =>
          info(s"`$txt` could not be parsed to date as it did not meet the ISO 8601 spec")
          None
      }
  }
}
77 | 
78 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageExtractor.scala:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Licensed to Gravity.com under one
 3 |  * or more contributor license agreements.  See the NOTICE file
 4 |  * distributed with this work for additional information
 5 |  * regarding copyright ownership.  Gravity.com licenses this file
 6 |  * to you under the Apache License, Version 2.0 (the
 7 |  * "License"); you may not use this file except in compliance
 8 |  * with the License.  You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.images
19 | 
20 | import org.jsoup.nodes.{Document, Element}
21 | import com.gravity.goose.utils.{CanLog, Logging}
22 | import org.slf4j.Logger
23 | 
24 | /**
25 | * Created by Jim Plush
26 | * User: jim
27 | * Date: 8/18/11
28 | */
29 | 
/**
 * Represents a file stored on disk that we've downloaded.
 *
 * NOTE(review): the field meanings below are read off the names only - confirm against the
 * code that constructs this class.
 *
 * @param imgSrc   presumably the source URL/path the image was fetched from
 * @param mimeType presumably the MIME type of the downloaded image
 * @param base64   presumably the image content encoded as base64
 * @param bytes    presumably the size of the image in bytes
 * @param height   image height; defaults to 0 when not supplied
 * @param width    image width; defaults to 0 when not supplied
 */
case class LocallyStoredImage(
                               imgSrc: String,
                               mimeType: String,
                               base64: String,
                               bytes: Long,
                               height: Int = 0,
                               width: Int = 0
                             )
39 | 
/**
 * Contract for strategies that pick an image for an article.
 *
 * Every [[CanLog]] member is satisfied by forwarding to the companion object, so all
 * implementations write through the companion's logger.
 */
trait ImageExtractor extends CanLog {

  /** Selects the image for the given document and top content node. */
  def getBestImage(doc: Document, topNode: Element): Image

  def logPrefix: String = ImageExtractor.loggingPrefix

  def logger: Logger = ImageExtractor.logger

  // --- trace ---------------------------------------------------------------
  def trace(msg: String, refs: Any*): Unit = ImageExtractor.trace(msg, refs: _*)

  def trace(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.trace(t, msg, refs: _*)

  // --- debug ---------------------------------------------------------------
  def debug(msg: String, refs: Any*): Unit = ImageExtractor.debug(msg, refs: _*)

  def debug(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.debug(t, msg, refs: _*)

  // --- info ----------------------------------------------------------------
  def info(msg: String, refs: Any*): Unit = ImageExtractor.info(msg, refs: _*)

  def info(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.info(t, msg, refs: _*)

  // --- warn ----------------------------------------------------------------
  def warn(msg: String, refs: Any*): Unit = ImageExtractor.warn(msg, refs: _*)

  def warn(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.warn(t, msg, refs: _*)

  // --- critical ------------------------------------------------------------
  def critical(msg: String, refs: Any*): Unit = ImageExtractor.critical(msg, refs: _*)

  def critical(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.critical(t, msg, refs: _*)
}
88 | 
/** Companion that owns the logger shared by every [[ImageExtractor]] implementation. */
object ImageExtractor extends Logging {
  /** Value returned by `ImageExtractor#logPrefix`. */
  val loggingPrefix: String = "images: "
}
92 | 
93 | 


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/issue_24.txt:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/loose.dtd"><html>
 2 | <html>
 3 | 	<head>
 4 | 		<title>Paragraph Order Test</title>
 5 | 	</head>
 6 | 
 7 | 	<body>
 8 | 		<div>
 9 | 			TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.
10 | 
11 |  Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity.
12 |  For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
13 | 			<p>Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically.
14 |  The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader).
15 |  The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.</p>
16 | 
17 | 			TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code.
18 |  In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
19 | 			<p>Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.</p>
20 | 		</div>
21 | 	</body>
22 | </html>


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/utils/URLHelper.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose.utils
 20 | 
 21 | import com.gravity.goose.text.{StringReplacement, HashUtils}
 22 | import java.net.{URI, MalformedURLException, URL}
 23 | import org.apache.http.client.methods.HttpGet
 24 | 
 25 | /**
 26 |  * Created by Jim Plush
 27 |  * User: jim
 28 |  * Date: 8/14/11
 29 |  */
 30 | 
/**
 * A URL that has been accepted for crawling, as produced by `URLHelper.getCleanedUrl`.
 *
 * @param urlString the URL text after any `#!` escaped-fragment expansion
 * @param linkhash  MD5 hash of `urlString`
 * @param url       `urlString` parsed into a `java.net.URL`
 */
case class ParsingCandidate(urlString: String, linkhash: String, url: URL)
 32 | 
 33 | object URLHelper extends Logging {
 34 | 
 35 |   private val ESCAPED_FRAGMENT_REPLACEMENT: StringReplacement = StringReplacement.compile("#!", "?_escaped_fragment_=")
 36 | 
 37 |   /**
 38 |   * returns a ParseCandidate object  that is a valid URL
 39 |   */
 40 |   def getCleanedUrl(urlToCrawl: String): Option[ParsingCandidate] = {
 41 | 
 42 |     val finalURL =
 43 |       if (urlToCrawl.contains("#!")) ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(urlToCrawl) else urlToCrawl
 44 | 
 45 |     try {
 46 |       val url = new URL(finalURL)
 47 |       val linkhash = HashUtils.md5(finalURL)
 48 |       Some(ParsingCandidate(finalURL, linkhash, url))
 49 |     }
 50 |     catch {
 51 |       case e: MalformedURLException => {
 52 |         warn("{0} - is a malformed URL and cannot be processed", urlToCrawl)
 53 |         None
 54 |       }
 55 |       case unknown: Exception => {
 56 |         critical("Unable to process URL: {0} due to an unexpected exception:\n\tException Type: {1}\n\tException Message: {2}\n\tException Stack:\n{3}",
 57 |           urlToCrawl,
 58 |           unknown.getClass.getCanonicalName,
 59 |           unknown.getMessage,
 60 |           unknown.getStackTraceString)
 61 | 
 62 |         None
 63 |       }
 64 |     }
 65 |   }
 66 | 
 67 |   def tryToURL(url: String): Option[URL] = {
 68 |     val finalUrl = if (url.contains("#!")) {
 69 |       ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(url)
 70 |     } else {
 71 |       url
 72 |     }
 73 | 
 74 |     try {
 75 |       Some(new URL(finalUrl))
 76 |     } catch {
 77 |       case _: Exception => None
 78 |     }
 79 |   }
 80 | 
 81 |   def tryToURI(url: String): Option[URI] = {
 82 |     val finalUrl = if (url.contains("#!")) {
 83 |       ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(url)
 84 |     } else {
 85 |       url
 86 |     }
 87 | 
 88 |     try {
 89 |       Some(URI.create(finalUrl))
 90 |     } catch {
 91 |       case _: Exception => None
 92 |     }
 93 |   }
 94 | 
 95 |   def tryToHttpGet(url: String): Option[HttpGet] = {
 96 |     tryToURI(url) match {
 97 |       case Some(uri) => Some(new HttpGet(uri))
 98 |       case None => None
 99 |     }
100 |   }
101 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/utils/Logging.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose.utils
 20 | 
 21 | import org.slf4j._
 22 | import java.text.MessageFormat
 23 | 
 24 | /**
 25 |  * User: chris bissel
 26 |  * Date: 1/2/11
 27 |  * Time: 1:47 PM
 28 |  */
 29 | 
 30 | /**
 31 |  * Trait that enables logging. String formatting is based on the Java MessageFormat object, NOT the
 32 |  * regular String.format.  See this documentation:
 33 |  * http://download.oracle.com/javase/1.4.2/docs/api/java/text/MessageFormat.html
 34 |  *
 35 |  * The code was initially taken from this location at Stack Overflow:
 36 |  * From http://stackoverflow.com/questions/978252/logging-in-scala/981942#981942
 37 |  */
 38 | trait Logging extends CanLog {
 39 | 
 40 |   val logger: Logger = Logging.getLogger(this)
 41 | 
 42 |   private def formatmsg(msg: String, refs: Seq[Any]): String = {
 43 |     new MessageFormat(msg).format(refs.toArray)
 44 |   }
 45 | 
 46 |   private def checkFormat(msg: String, refs: Seq[Any]): String =
 47 |     if (refs.size > 0) formatmsg(msg, refs) else msg
 48 | 
 49 |   def trace(msg: String, refs: Any*) { logger trace checkFormat(msg, refs) }
 50 | 
 51 |   def trace(t: Throwable, msg: String, refs: Any*) { logger trace(checkFormat(msg, refs), t) }
 52 | 
 53 |   def info(msg: String, refs: Any*) { logger info checkFormat(msg, refs) }
 54 | 
 55 |   def info(t: Throwable, msg: String, refs: Any*) { logger info (checkFormat(msg, refs), t) }
 56 | 
 57 |   def warn(msg: String, refs: Any*) { logger warn checkFormat(msg, refs) }
 58 | 
 59 |   def warn(t: Throwable, msg: String, refs: Any*) { logger warn (checkFormat(msg, refs), t) }
 60 | 
 61 |   def critical(msg: String, refs: Any*) { logger error checkFormat(msg, refs) }
 62 | 
 63 |   def critical(t: Throwable, msg: String, refs: Any*) { logger error (checkFormat(msg, refs), t) }
 64 | 
 65 |   def debug(msg: String, refs: Any*) { logger debug checkFormat(msg, refs) }
 66 | 
 67 |   def debug(t: Throwable, msg: String, refs: Any*) { logger debug (checkFormat(msg, refs), t) }
 68 | 
 69 | }
 70 | 
 71 | /**
 72 |  * Note: implementation taken from scalax.logging API
 73 |  */
 74 | object Logging {
 75 | 
 76 |   def loggerNameForClass(className: String) = {
 77 |     if (className endsWith "$") {
 78 |       className.substring(0, className.length - 1)
 79 |     }
 80 |     else {
 81 |       className
 82 |     }
 83 |   }
 84 | 
 85 |   def getLogger(logging: AnyRef) = LoggerFactory.getLogger(loggerNameForClass(logging.getClass.getName))
 86 | }
 87 | 
 88 | trait CanLog {
 89 |   def logger: Logger
 90 | 
 91 |   def trace(msg: String, refs: Any*)
 92 | 
 93 |   def trace(t: Throwable, msg: String, refs: Any*)
 94 | 
 95 |   def info(msg: String, refs: Any*)
 96 | 
 97 |   def info(t: Throwable, msg: String, refs: Any*)
 98 | 
 99 |   def warn(msg: String, refs: Any*)
100 | 
101 |   def warn(t: Throwable, msg: String, refs: Any*)
102 | 
103 |   def critical(msg: String, refs: Any*)
104 | 
105 |   def critical(t: Throwable, msg: String, refs: Any*)
106 | 
107 |   def debug(msg: String, refs: Any*)
108 | 
109 |   def debug(t: Throwable, msg: String, refs: Any*)
110 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Article.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose
 20 | 
 21 | import java.sql.Date
 22 | 
 23 | import images.Image
 24 | import org.jsoup.nodes.{Document, Element}
 25 | 
 26 | import scala.collection._
 27 | 
 28 | /**
 29 | * Created by Jim Plush
 30 | * User: jim
 31 | * Date: 8/14/11
 32 | */
 33 | 
 34 | class Article {
 35 | 
 36 |   /**
 37 |   * title of the article
 38 |   */
 39 |   var title: String = null
 40 | 
 41 |   /**
 42 |   * stores the lovely, pure text from the article, stripped of html, formatting, etc...
 43 |   * just raw text with paragraphs separated by newlines. This is probably what you want to use.
 44 |   */
 45 |   var cleanedArticleText: String = ""
 46 | 
 47 |   /**
 48 |   * meta description field in HTML source
 49 |   */
 50 |   var metaDescription: String = ""
 51 | 
 52 |   /**
 53 |   * meta keywords field in the HTML source
 54 |   */
 55 |   var metaKeywords: String = ""
 56 | 
 57 |   /**
 58 |   * The canonical link of this article if found in the meta data
 59 |   */
 60 |   var canonicalLink: String = ""
 61 | 
 62 |   /**
 63 |   * holds the domain of this article we're parsing
 64 |   */
 65 |   var domain: String = ""
 66 | 
 67 |   /**
 68 |   * holds the top Element we think is a candidate for the main body of the article
 69 |   */
 70 |   var topNode: Element = null
 71 | 
 72 |   /**
 73 |   * holds the top Image object that we think represents this article
 74 |   */
 75 |   var topImage: Image = new Image
 76 | 
 77 | 
 78 |   /**
 79 |   * holds a set of tags that may have been in the artcle, these are not meta keywords
 80 |   */
 81 |   var tags: Set[String] = null
 82 | 
 83 |   /**
 84 |   * holds a list of any movies we found on the page like youtube, vimeo
 85 |   */
 86 |   var movies: List[Element] = Nil
 87 | 
 88 |   /**
 89 |   * stores the final URL that we're going to try and fetch content against, this would be expanded if any
 90 |   * escaped fragments were found in the starting url
 91 |   */
 92 |   var finalUrl: String = "";
 93 | 
 94 |   /**
 95 |   * stores the MD5 hash of the url to use for various identification tasks
 96 |   */
 97 |   var linkhash: String = "";
 98 | 
 99 |   /**
100 |   * stores the RAW HTML straight from the network connection
101 |   */
102 |   var rawHtml: String = ""
103 | 
104 |   /**
105 |   * the JSoup Document object
106 |   */
107 |   var doc: Document = null
108 | 
109 |   /**
110 |   * this is the original JSoup document that contains a pure object from the original HTML without any cleaning
111 |   * options done on it
112 |   */
113 |   var rawDoc: Document = null
114 | 
115 |   /**
116 |   * Sometimes useful to try and know when the publish date of an article was
117 |   */
118 |   var publishDate: Date = null
119 | 
120 |   /**
121 |    * A property bucket for consumers of goose to store custom data extractions.
122 |    * This is populated by an implementation of {@link com.gravity.goose.extractors.AdditionalDataExtractor}
123 |    * which is executed before document cleansing within {@link com.gravity.goose.CrawlingActor#crawl}
124 |    * @return a {@link Map Map&lt;String,String&gt;} of property name to property vaue (represented as a {@link String}.
125 |    */
126 |   var additionalData: Map[String, String] = Map.empty
127 | }


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoGroup.txt:
--------------------------------------------------------------------------------
  1 | CODE	LABEL
  2 | AAM	Al Aqsa Martyrs Brigade
  3 | ABD	Arab Bank for Economic Development in Africa
  4 | ACC	Arab Cooperation Council
  5 | ADB	Asian Development Bank
  6 | AEU	Arab Economic Unity Council
  7 | AFB	African Development Bank
  8 | ALQ	Al Qaeda
  9 | AMF	Arab Monetary Fund for Economic and Social Development
 10 | AML	Amal Militia
 11 | AMN	Amnesty International
 12 | AMU	Arab Maghreb Union
 13 | ANO	Abu Nidal Organization
 14 | APE	Org. of Arab Petroleum Exporting Countries (OAPEC)
 15 | ARL	Arab League
 16 | ASL	South Lebanon Army
 17 | ASN	Association of Southeast Asian Nations (ASEAN)
 18 | ATD	Eastern and Southern African Trade and Development Bank
 19 | BCA	Bank of Central African States (BEAC)
 20 | BIS	Bank for International Settlements
 21 | BTH	Baath Party
 22 | CEM	Common Market for Eastern and Southern Africa
 23 | CEM	Monetary and Economic Community of Central Africa
 24 | CFA	Franc Zone Financial Community of Africa
 25 | CIS	Commonwealth of Independent States
 26 | CMN	Communist
 27 | COE	Council of Europe
 28 | CPA	Cocoa Producer's Alliance
 29 | CPC	Association of Coffee Producing Countries
 30 | CRC	International Fed. of Red Cross and Red Crescent (ICRC)
 31 | CSS	Community of Sahel-Saharan States (CENSAD)
 32 | CWN	Commonwealth of Nations
 33 | DFL	Democratic Front for the Lib. of Palestine (DFLP)
 34 | EBR	European Bank for Reconstruction and Development
 35 | ECA	Economic Community of Central African States
 36 | EEC	European Union
 37 | EFT	European Free Trade Association
 38 | ENN	Ennahda Movement
 39 | FAO	United Nations Food and Agriculture Organization
 40 | FID	International Federation of Human Rights (FIDH)
 41 | FIS	Islamic Salvation Army
 42 | FLN	National Liberation Front (FLN)
 43 | FTA	Fatah
 44 | GCC	Gulf Cooperation Council
 45 | GIA	Armed Islamic Group (GIA)
 46 | GOE	Group of Eight (G-8) (G-7 plus Russia)
 47 | GOS	Group of Seven (G-7)
 48 | GSP	Salafist Group
 49 | GSS	Group of Seventy-Seven (G-77)
 50 | HCH	UN High Commission for Human Rights
 51 | HCR	UN High Commission for Refugees
 52 | HEZ	Hezbullah
 53 | HIP	Highly Indebted Poor Countries (HIPC)
 54 | HMS	Hamas
 55 | HRW	Human Rights Watch
 56 | IAC	Inter-African Coffee Organization (IACO)
 57 | IAD	Intergovernmental Authority on Development (IGAD)
 58 | IAE	International Atomic Energy Agency (IAEA)
 59 | IAF	Islamic Action Front
 60 | ICC	International Criminal Court
 61 | ICG	International Crisis Group
 62 | ICJ	International Court of Justice (ICJ)
 63 | ICO	International Cocoa Organization (ICCO)
 64 | IDB	Islamic Development Bank
 65 | IGC	International Grains Council
 66 | IHF	International Helsinki Federation for Human Rights
 67 | ILO	International Labor Organization
 68 | IMF	International Monetary Fund (IMF)
 69 | IOM	International Organization for Migration
 70 | IPU	Inter-Parliamentary Union
 71 | IRC	Red Cross
 72 | ISJ	Palestinian Islamic Jihad
 73 | ITP	Interpol
 74 | JUR	International Commission of Jurists
 75 | KDP	Kurdish Democratic Party (KDP)
 76 | KID	United Nations Children's Fund (UNICEF)
 77 | LBA	Israeli Labor Party
 78 | LKD	Likud Party
 79 | MBR	Muslim Brotherhood
 80 | MRZ	Meretz Party
 81 | MSF	Medecins Sans Frontieres (Doctors Without Borders)
 82 | MSP	Movement of the Society for Peace
 83 | NAT	North Atlantic Treaty Organization (NATO)
 84 | NEP	New Economic Partnership for Africa's Development
 85 | NON	Organization of Non-Aligned Countries
 86 | OAS	Organization of American States
 87 | OAU	Organization of African Unity (OAU)
 88 | OIC	Organization of Islamic Conferences (OIC)
 89 | OPC	Organization of Petroleum Exporting Countries (OPEC)
 90 | PAP	Pan-African Parliament
 91 | PFL	People's Front for the Liberation of Palestine (PFLP)
 92 | PLF	Palestine Liberation Front
 93 | PLO	Palestine Liberation Organization
 94 | PLS	Polisario Guerillas
 95 | PMD	People's Mujahedeen
 96 | PRC	Paris Club
 97 | PSE	Occupied Palestinian Territories
 98 | RCR	Red Crescent
 99 | RND	Democratic National Rally
100 | SAA	South Asian Association
101 | SAD	Southern African Development Community
102 | SCE	Council of Security and Cooperation in Europe (OSCE)
103 | SHA	Shas Party
104 | SOT	Southeast Asia Collective Defense Treaty (SEATO)
105 | TAL	Taliban
106 | UEM	Economic and Monetary Union of West Africa (UEMOA)
107 | UNO	United Nations
108 | WAD	West Africa Development Bank
109 | WAM	West Africa Monetary and Economic Union
110 | WAS	Economic Community of West African States (ECOWAS)
111 | WBK	World Bank
112 | WCT	International War Crimes Tribunals
113 | WEF	World Economic Forum
114 | WFP	World Food Program
115 | WHO	World Health Organization
116 | WTO	World Trade Organization
117 | WTO	World Trade Organization (WTO)
118 | XFM	Oxfam


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoCountry.txt:
--------------------------------------------------------------------------------
  1 | CODE	LABEL
  2 | WSB	West Bank
  3 | BAG	Baghdad
  4 | GZS	Gaza Strip
  5 | AFR	Africa
  6 | ASA	Asia
  7 | BLK	Balkans
  8 | CRB	Caribbean
  9 | CAU	Caucasus
 10 | CFR	Central Africa
 11 | CAS	Central Asia
 12 | CEU	Central Europe
 13 | EIN	East Indies
 14 | EAF	Eastern Africa
 15 | EEU	Eastern Europe
 16 | EUR	Europe
 17 | LAM	Latin America
 18 | MEA	Middle East
 19 | MDT	Mediterranean
 20 | NAF	North Africa
 21 | NMR	North America
 22 | PGS	Persian Gulf
 23 | SCN	Scandinavia
 24 | SAM	South America
 25 | SAS	South Asia
 26 | SEA	Southeast Asia
 27 | SAF	Southern Africa
 28 | WAF	West Africa
 29 | WST	The West
 30 | AFG	Afghanistan
 31 | ALA	Aland Islands
 32 | ALB	Albania
 33 | DZA	Algeria
 34 | ASM	American Samoa
 35 | AND	Andorra
 36 | AGO	Angola
 37 | AIA	Anguilla
 38 | ATG	Antigua and Barbuda
 39 | ARG	Argentina
 40 | ARM	Armenia
 41 | ABW	Aruba
 42 | AUS	Australia
 43 | AUT	Austria
 44 | AZE	Azerbaijan
 45 | BHS	Bahamas
 46 | BHR	Bahrain
 47 | BGD	Bangladesh
 48 | BRB	Barbados
 49 | BLR	Belarus
 50 | BEL	Belgium
 51 | BLZ	Belize
 52 | BEN	Benin
 53 | BMU	Bermuda
 54 | BTN	Bhutan
 55 | BOL	Bolivia
 56 | BIH	Bosnia and Herzegovina
 57 | BWA	Botswana
 58 | BRA	Brazil
 59 | VGB	British Virgin Islands
 60 | BRN	Brunei Darussalam
 61 | BGR	Bulgaria
 62 | BFA	Burkina Faso
 63 | BDI	Burundi
 64 | KHM	Cambodia
 65 | CMR	Cameroon
 66 | CAN	Canada
 67 | CPV	Cape Verde
 68 | CYM	Cayman Islands
 69 | CAF	Central African Republic
 70 | TCD	Chad
 71 | CHL	Chile
 72 | CHN	China
 73 | COL	Colombia
 74 | COM	Comoros
 75 | COD	Democratic Republic of the Congo
 76 | COG	People's Republic of the Congo
 77 | COK	Cook Islands
 78 | CRI	Costa Rica
 79 | CIV	Ivory Coast
 80 | HRV	Croatia
 81 | CUB	Cuba
 82 | CYP	Cyprus
 83 | CZE	Czech Republic
 84 | DNK	Denmark
 85 | DJI	Djibouti
 86 | DMA	Dominica
 87 | DOM	Dominican Republic
 88 | TMP	East Timor
 89 | ECU	Ecuador
 90 | EGY	Egypt
 91 | SLV	El Salvador
 92 | GNQ	Equatorial Guinea
 93 | ERI	Eritrea
 94 | EST	Estonia
 95 | ETH	Ethiopia
 96 | FRO	Faeroe Islands
 97 | FLK	Falkland Islands
 98 | FJI	Fiji
 99 | FIN	Finland
100 | FRA	France
101 | GUF	French Guiana
102 | PYF	French Polynesia
103 | GAB	Gabon
104 | GMB	Gambia
105 | GEO	Georgia
106 | DEU	Germany
107 | GHA	Ghana
108 | GIB	Gibraltar
109 | GRC	Greece
110 | GRL	Greenland
111 | GRD	Grenada
112 | GLP	Guadeloupe
113 | GUM	Guam
114 | GTM	Guatemala
115 | GIN	Guinea
116 | GNB	Guinea-Bissau
117 | GUY	Guyana
118 | HTI	Haiti
119 | VAT	Vatican City
120 | HND	Honduras
121 | HKG	Hong Kong
122 | HUN	Hungary
123 | ISL	Iceland
124 | IND	India
125 | IDN	Indonesia
126 | IRN	Iran
127 | IRQ	Iraq
128 | IRL	Ireland
129 | IMY	Isle of Man
130 | ISR	Israel
131 | ITA	Italy
132 | JAM	Jamaica
133 | JPN	Japan
134 | JOR	Jordan
135 | KAZ	Kazakhstan
136 | KEN	Kenya
137 | KIR	Kiribati
138 | PRK	North Korea
139 | KOR	South Korea
140 | KWT	Kuwait
141 | KGZ	Kyrgyzstan
142 | LAO	Laos
143 | LVA	Latvia
144 | LBN	Lebanon
145 | LSO	Lesotho
146 | LBR	Liberia
147 | LBY	Libya
148 | LIE	Liechtenstein
149 | LTU	Lithuania
150 | LUX	Luxembourg
151 | MAC	Macao
152 | MKD	Macedonia
153 | MDG	Madagascar
154 | MWI	Malawi
155 | MYS	Malaysia
156 | MDV	Maldives
157 | MLI	Mali
158 | MLT	Malta
159 | MHL	Marshall Islands
160 | MTQ	Martinique
161 | MRT	Mauritania
162 | MUS	Mauritius
163 | MYT	Mayotte
164 | MEX	Mexico
165 | FSM	Micronesia
166 | MDA	Moldova
167 | MCO	Monaco
168 | MNG	Mongolia
169 | MTN	Montenegro
170 | MSR	Montserrat
171 | MAR	Morocco
172 | MOZ	Mozambique
173 | MMR	Myanmar
174 | NAM	Namibia
175 | NRU	Nauru
176 | NPL	Nepal
177 | NLD	Netherlands
178 | ANT	Netherlands Antilles
179 | NCL	New Caledonia
180 | NZL	New Zealand
181 | NIC	Nicaragua
182 | NER	Niger
183 | NGA	Nigeria
184 | NIU	Niue
185 | NFK	Norfolk Island
186 | MNP	Northern Mariana Islands
187 | NOR	Norway
188 | PSE	Occupied Palestinian Territory
189 | OMN	Oman
190 | PAK	Pakistan
191 | PLW	Palau
192 | PAN	Panama
193 | PNG	Papua New Guinea
194 | PRY	Paraguay
195 | PER	Peru
196 | PHL	Philippines
197 | PCN	Pitcairn
198 | POL	Poland
199 | PRT	Portugal
200 | PRI	Puerto Rico
201 | QAT	Qatar
202 | REU	Reunion
203 | ROM	Romania
204 | RUS	Russia
205 | RWA	Rwanda
206 | SHN	Saint Helena
207 | KNA	Saint Kitts-Nevis
208 | LCA	Saint Lucia
209 | SPM	Saint Pierre and Miquelon
210 | VCT	Saint Vincent and the Grenadines
211 | WSM	Samoa
212 | SMR	San Marino
213 | STP	Sao Tome and Principe
214 | SAU	Saudi Arabia
215 | SEN	Senegal
216 | SRB	Serbia
217 | SYC	Seychelles
218 | SLE	Sierra Leone
219 | SGP	Singapore
220 | SVK	Slovakia
221 | SVN	Slovenia
222 | SLB	Solomon Islands
223 | SOM	Somalia
224 | ZAF	South Africa
225 | ESP	Spain
226 | LKA	Sri Lanka
227 | SDN	Sudan
228 | SUR	Suriname
229 | SJM	Svalbard and Jan Mayen Islands
230 | SWZ	Swaziland
231 | SWE	Sweden
232 | CHE	Switzerland
233 | SYR	Syria
234 | TWN	Taiwan
235 | TJK	Tajikistan
236 | TZA	Tanzania
237 | THA	Thailand
238 | TGO	Togo
239 | TKL	Tokelau
240 | TON	Tonga
241 | TTO	Trinidad and Tobago
242 | TUN	Tunisia
243 | TUR	Turkey
244 | TKM	Turkmenistan
245 | TCA	Turks and Caicos Islands
246 | TUV	Tuvalu
247 | UGA	Uganda
248 | UKR	Ukraine
249 | ARE	United Arab Emirates
250 | GBR	United Kingdom
251 | USA	United States
252 | VIR	United States Virgin Islands
253 | URY	Uruguay
254 | UZB	Uzbekistan
255 | VUT	Vanuatu
256 | VEN	Venezuela
257 | VNM	Vietnam
258 | WLF	Wallis and Futuna Islands
259 | ESH	Western Sahara
260 | YEM	Yemen
261 | ZMB	Zambia
262 | ZWE	Zimbabwe


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/ReplaceSequence.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose.text
 20 | 
 21 | import org.slf4j.Logger
 22 | import org.slf4j.LoggerFactory
 23 | import java.util.ArrayList
 24 | import java.util.List
 25 | import java.util.regex.Matcher
 26 | import java.util.regex.Pattern
 27 | 
 28 | /**
 29 |  * Created by IntelliJ IDEA.
 30 |  * User: robbie
 31 |  * Date: 5/13/11
 32 |  * Time: 12:03 AM
 33 |  */
 34 | /**
 35 | * Wraps the usage of making multiple string replacements in an ordered sequence.
 36 | * For Example... instead of doing this over and over:</p>
 37 | * <blockquote>
 38 | *   <pre>
 39 | *     String text = "   Some example text     ";
 40 | *     text = text.{@link String#replaceAll(String, String) replaceAll}("e", "E");
 41 | *     text = text.{@link String#replaceAll(String, String) replaceAll}(" ", "_");
 42 | *     text = text.{@link String#replaceAll(String, String) replaceAll}("^\\s+$", "");
 43 | *   </pre>
 44 | * </blockquote>
 45 | * You can use a <code>ReplaceSequence</code> like this:</p>
 46 | * <blockquote>
 47 | *   <pre>
 48 | *     static final betterReplacements = ReplaceSequence.{@link #create(String, String) create}("e", "E").{@link #append(String, String) append}(" ", "_").{@link #append(String) append}("^\\s+$");
 49 | *
 50 | *     void fixMyString(String text) {
 51 | *       return betterReplacements.{@link #replaceAll(String) replaceAll}(text);
 52 | *     }
 53 | *   </pre>
 54 | * </blockquote>
 55 | *
 56 | * Internally, an ordered list of {@link Matcher}s and its associated replacement is built as the {@link #append} method is called.<br/>
 57 | * Each matcher is {@link Matcher#reset(CharSequence) reset} with the input specified in the {@link #replaceAll(String)} method.</p>
 58 | * Use of this class can improve performance if the sequence of replacements is intended to be used repeatedly throughout the life of an application.<br/>
 59 | * This is due to the fact that each {@link Pattern} is only compiled once and each {@link Matcher} is only generated once.
 60 | */
 61 | object ReplaceSequence {
 62 |   /**
 63 |   * Creates a new <code>ReplaceSequence</code> with the first pattern to be replaced with an empty <code>String</code>
 64 |   * @param firstPattern The regex {@link Pattern pattern} string for the first replacement
 65 |   * @return a new instance
 66 |   */
 67 |   def create(firstPattern: String): ReplaceSequence = {
 68 |     create(firstPattern, string.empty)
 69 |   }
 70 | 
 71 |   /**
 72 |   * Creates a new <code>ReplaceSequence</code> with the first pattern to be replaced with the specified <code>replaceWith</code> parameter.
 73 |   * @param firstPattern The regex {@link Pattern pattern} {@link String} for the first replacement
 74 |   * @param replaceWith The {@link String} to replace matches of the specified pattern
 75 |   * @return a new instance
 76 |   */
 77 |   def create(firstPattern: String, replaceWith: String): ReplaceSequence = {
 78 |     val result: ReplaceSequence = new ReplaceSequence(StringReplacement.compile(firstPattern, replaceWith))
 79 |      result
 80 |   }
 81 | }
 82 | 
 83 | class ReplaceSequence {
 84 |   /**
 85 |   * Appends a new pattern to this instance in a builder pattern
 86 |   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
 87 |   * @return this instance of itself for use in a builder pattern
 88 |   */
 89 |   def append(pattern: String): ReplaceSequence = {
 90 |     append(pattern, string.empty)
 91 |   }
 92 | 
 93 |   /**
 94 |   * Appends a new pattern to this instance in a builder pattern
 95 |   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
 96 |   * @param replaceWith The {@link String} to replace matches of the specified pattern
 97 |   * @return this instance of itself for use in a builder pattern
 98 |   */
 99 |   def append(pattern: String, replaceWith: String): ReplaceSequence = {
100 |     replacements.add(StringReplacement.compile(pattern, replaceWith))
101 |     this
102 |   }
103 | 
104 |   /**
105 |   * Applies each of the replacements specified via the initial {@link #create(String)} and/or any additional via {@link #append(String)}
106 |   * @param input the {@link String} to apply all of the replacements to
107 |   * @return the resulting {@link String} after all replacements have been applied
108 |   */
109 |   def replaceAll(input: String): String = {
110 |     if (string.isNullOrEmpty(input)) return string.empty
111 |     var mutatedString = input
112 |     import scala.collection.JavaConversions._
113 |     for (rp <- replacements) {
114 |       mutatedString = rp.replaceAll(mutatedString)
115 |     }
116 |     mutatedString
117 |   }
118 | 
119 |   private def this(pair: StringReplacement) {
120 |     this ()
121 |     replacements.add(pair)
122 |   }
123 | 
124 |   var replacements: List[StringReplacement] = new ArrayList[StringReplacement]
125 | }
126 | 
127 | 
128 | 
129 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/spark/GooseFetcher.scala:
--------------------------------------------------------------------------------
  1 | package com.gravity.goose.spark
  2 | 
  3 | import com.gravity.goose.{Configuration, Goose}
  4 | import org.apache.spark.ml.Transformer
  5 | import org.apache.spark.ml.param._
  6 | import org.apache.spark.ml.util._
  7 | import org.apache.spark.sql.types._
  8 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
  9 | 
 10 | trait GooseFetcherParams extends Params with DefaultParamsWritable {
 11 |   val annotators = new Param[Map[String, String]](this, "annotators", s"The list of annotators [${ANNOTATORS.mkString(",")}]")
 12 |   val urlColumn = new Param[String](this, "urlColumn", "The input column containing URLs")
 13 |   val userAgent = new Param[String](this, "userAgent", "User agent that is sent with your web requests to extract URL content")
 14 |   val socketTimeout = new Param[Int](this, "socketTimeout", "Socket timeout (ms)")
 15 |   val connectionTimeout = new Param[Int](this, "connectionTimeout", "Connection timeout (ms)")
 16 |   val enableImageFetching = new Param[Boolean](this, "enableImageFetching", "(Experimental) Fetching image header as base64")
 17 | }
 18 | 
 19 | class GooseFetcher(override val uid: String) extends Transformer with GooseFetcherParams {
 20 | 
 21 |   def setAnnotators(value: Map[String, String]): this.type = {
 22 |     require(value.nonEmpty, "At least one annotator must be provided")
 23 |     require(value.values.toSet.size == value.keys.size, "Annotator fields must be unique")
 24 |     value.keys.foreach(annotator => require(ANNOTATORS.contains(annotator), s"Annotator [$annotator] is not valid, supported are [${ANNOTATORS.mkString(",")}]"))
 25 |     set(annotators, value)
 26 |   }
 27 | 
 28 |   setDefault(annotators -> ANNOTATORS.zip(ANNOTATORS).toMap)
 29 | 
 30 |   def setUserAgent(value: String): this.type = set(userAgent, value)
 31 | 
 32 |   setDefault(userAgent -> "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")
 33 | 
 34 |   def setSocketTimeout(value: Int): this.type = set(socketTimeout, value)
 35 | 
 36 |   setDefault(socketTimeout -> 10000)
 37 | 
 38 |   def setConnectionTimeout(value: Int): this.type = set(connectionTimeout, value)
 39 | 
 40 |   setDefault(connectionTimeout -> 10000)
 41 | 
 42 |   def setEnableImageFetching(value: Boolean): this.type = set(enableImageFetching, value)
 43 | 
 44 |   setDefault(enableImageFetching -> false)
 45 | 
 46 |   def setUrlColumn(value: String): this.type = set(urlColumn, value)
 47 | 
 48 |   setDefault(urlColumn -> "url")
 49 | 
 50 |   def this() = this(Identifiable.randomUID("goose"))
 51 | 
 52 |   override def transform(origDS: Dataset[_]): DataFrame = {
 53 | 
 54 |     // Make sure the URL field exist
 55 |     require(origDS.schema.exists(s => s.name == $(urlColumn) && s.dataType == StringType), "Field [" + $(urlColumn) + "] is not valid")
 56 | 
 57 |     // Make sure annotators field do not exist
 58 |     $(annotators).values.foreach(annotator => {
 59 |       require(!origDS.schema.exists(s => s.name == annotator), s"Annotator field [$annotator] already exist")
 60 |     })
 61 | 
 62 |     // This intermediate dataset to make sure we don't scrape more than once a same URL
 63 |     val urlDF = origDS.select($(urlColumn)).dropDuplicates($(urlColumn))
 64 | 
 65 |     // Append URL dataframe with article annotators
 66 |     val urlContentRDD = urlDF.rdd.mapPartitions(rows => {
 67 | 
 68 |       // Initialize Goose only once for each partition
 69 |       val conf = new Configuration()
 70 |       conf.setEnableImageFetching($(enableImageFetching))
 71 |       conf.setBrowserUserAgent($(userAgent))
 72 |       conf.setSocketTimeout($(socketTimeout))
 73 |       conf.setConnectionTimeout($(connectionTimeout))
 74 |       val goose = new Goose(conf)
 75 | 
 76 |       // Scrape each URL individually
 77 |       val articles = scrapeArticles(rows.map(_.getAs[String]($(urlColumn))), goose)
 78 | 
 79 |       // Convert articles as Row
 80 |       articles.map(article => {
 81 |         val appended: Seq[Any] = $(annotators).map { case (key, _) =>
 82 |           key match {
 83 |             case ANNOTATOR_TITLE => article.title.getOrElse("")
 84 |             case ANNOTATOR_DESCRIPTION => article.description.getOrElse("")
 85 |             case ANNOTATOR_CONTENT => article.content.getOrElse("")
 86 |             case ANNOTATOR_KEYWORDS => article.keywords
 87 |             case ANNOTATOR_PUBLISH_DATE => article.publishDate.orNull
 88 |           }
 89 |         }.toSeq
 90 |         Row.fromSeq(Seq(article.url) ++ appended)
 91 |       })
 92 |     })
 93 | 
 94 |     // Transform RDD of Row to Dataframe
 95 |     val contentDF = origDS.sqlContext.createDataFrame(urlContentRDD, transformSchema(urlDF.schema))
 96 | 
 97 |     // Join articles back to any duplicate URL dataset
 98 |     contentDF.join(origDS, List($(urlColumn)))
 99 | 
100 |   }
101 | 
102 |   override def transformSchema(schema: StructType): StructType = {
103 |     StructType(
104 |       schema.seq ++ $(annotators).map { case (key, value) =>
105 |         key match {
106 |           case ANNOTATOR_TITLE => StructField(value, StringType, nullable = false)
107 |           case ANNOTATOR_DESCRIPTION => StructField(value, StringType, nullable = false)
108 |           case ANNOTATOR_CONTENT => StructField(value, StringType, nullable = false)
109 |           case ANNOTATOR_KEYWORDS => StructField(value, ArrayType.apply(StringType), nullable = false)
110 |           case ANNOTATOR_PUBLISH_DATE => StructField(value, DateType, nullable = true)
111 |         }
112 |       }
113 |     )
114 |   }
115 | 
116 |   override def copy(extra: ParamMap): Transformer = {
117 |     defaultCopy(extra)
118 |   }
119 | }
120 | 
/** Companion providing the reader side of params persistence for [[GooseFetcher]]. */
object GooseFetcher extends DefaultParamsReadable[GooseFetcher] {

  /** Loads a transformer from `path` by delegating to the inherited `load`. */
  override def load(path: String): GooseFetcher = {
    super.load(path)
  }
}
124 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Configuration.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose
 20 | 
 21 | import network.{HtmlFetcher, AbstractHtmlFetcher}
 22 | import org.jsoup.nodes.Element
 23 | import scala.beans.BeanProperty
 24 | import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor}
 25 | 
 26 | 
 27 | /**
 28 |  * Created by Jim Plush
 29 |  * User: jim
 30 |  * Date: 8/16/11
 31 |  */
 32 | 
 33 | 
 34 | class Configuration {
 35 | 
 36 |   /**
 37 |   * this is the local storage path used to place images to inspect them, should be writable
 38 |   */
 39 |   @BeanProperty
 40 |   var localStoragePath: String = "/tmp/goose"
 41 |   /**
 42 |   * What's the minimum bytes for an image we'd accept is, alot of times we want to filter out the author's little images
 43 |   * in the beginning of the article
 44 |   */
 45 |   @BeanProperty
 46 |   var minBytesForImages: Int = 4500
 47 |   /**
 48 |   * set this guy to false if you don't care about getting images, otherwise you can either use the default
 49 |   * image extractor to implement the ImageExtractor interface to build your own
 50 |   */
 51 |   @BeanProperty
 52 |   var enableImageFetching: Boolean = true
 53 |   /**
 54 |   * path to your imagemagick convert executable, on the mac using mac ports this is the default listed
 55 |   */
 56 |   @BeanProperty
 57 |   var imagemagickConvertPath: String = "/opt/local/bin/convert"
 58 |   /**
 59 |   *  path to your imagemagick identify executable
 60 |   */
 61 |   @BeanProperty
 62 |   var imagemagickIdentifyPath: String = "/opt/local/bin/identify"
 63 | 
 64 |   @BeanProperty
 65 |   var connectionTimeout: Int = 10000
 66 | 
 67 |   @BeanProperty
 68 |   var socketTimeout: Int = 10000
 69 | 
 70 |   /**
 71 |   * used as the user agent that is sent with your web requests to extract an article
 72 |   */
 73 |   @BeanProperty
 74 |   var browserUserAgent: String = "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8"
 75 | 
 76 |   var contentExtractor: ContentExtractor = StandardContentExtractor
 77 | 
 78 |   var publishDateExtractor: PublishDateExtractor = new PublishDateExtractor {
 79 |     import PublishDateExtractor._
 80 | 
 81 |     def extractCandidate(rootElement: Element, selector: String): Seq[java.sql.Date] = {
 82 |       import scala.collection.JavaConversions._
 83 | 
 84 |       try {
 85 |         rootElement.select(selector).flatMap(item => safeParseISO8601Date(item.attr("content")))
 86 |       }
 87 |       catch {
 88 |         case e: Exception =>
 89 |           Nil
 90 |       }
 91 |     }
 92 | 
 93 |     final val pubSelectors = Seq(
 94 |       "meta[property~=article:published_time]"
 95 |     )
 96 | 
 97 |     final val modSelectors = Seq(
 98 |       "meta[property~=article:modified_time]",
 99 |       "meta[property~=og:updated_time]"
100 |     )
101 | 
102 |     def extract(rootElement: Element): java.sql.Date = {
103 |       // A few different ways to get a date.
104 |       def bestPubDate = pubSelectors.flatMap(extractCandidate(rootElement, _)).reduceOption(minDate)
105 |       def bestModDate = modSelectors.flatMap(extractCandidate(rootElement, _)).reduceOption(minDate)
106 | 
107 |       // Return the oldest 'published' date, or else the oldest 'modified' date, or null if none.
108 |       bestPubDate.orElse(bestModDate).getOrElse(null)
109 |     }
110 |   }
111 | 
112 |   var additionalDataExtractor: AdditionalDataExtractor = new AdditionalDataExtractor
113 | 
114 |   def getPublishDateExtractor: PublishDateExtractor = {
115 |     publishDateExtractor
116 |   }
117 | 
118 |   def setContentExtractor(extractor: ContentExtractor) {
119 |     if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
120 |     contentExtractor = extractor
121 |   }
122 | 
123 |   /**
124 |   * Pass in to extract article publish dates.
125 |     * @param extractor a concrete instance of {@link PublishDateExtractor}
126 |   * @throws IllegalArgumentException if the instance passed in is <code>null</code>
127 |   */
128 |   def setPublishDateExtractor(extractor: PublishDateExtractor) {
129 |     if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
130 |     this.publishDateExtractor = extractor
131 |   }
132 | 
133 |   def getAdditionalDataExtractor: AdditionalDataExtractor = {
134 |     additionalDataExtractor
135 |   }
136 | 
137 |   /**
138 |   * Pass in to extract any additional data not defined within {@link Article}
139 |     * @param extractor a concrete instance of {@link AdditionalDataExtractor}
140 |   * @throws IllegalArgumentException if the instance passed in is <code>null</code>
141 |   */
142 |   def setAdditionalDataExtractor(extractor: AdditionalDataExtractor) {
143 |     this.additionalDataExtractor = extractor
144 |   }
145 | 
146 |   var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher
147 | 
148 |   def setHtmlFetcher(fetcher: AbstractHtmlFetcher) {
149 |     require(fetcher != null, "fetcher MUST NOT be null!")
150 |     this.htmlFetcher = fetcher
151 |   }
152 | 
153 |   def getHtmlFetcher: AbstractHtmlFetcher = htmlFetcher
154 | 
155 | }


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Crawler.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose
 20 | 
 21 | import cleaners.{StandardDocumentCleaner, DocumentCleaner}
 22 | import extractors.ContentExtractor
 23 | import images.{Image, UpgradedImageIExtractor, ImageExtractor}
 24 | import org.apache.http.client.HttpClient
 25 | import org.jsoup.nodes.{Document, Element}
 26 | import org.jsoup.Jsoup
 27 | import java.io.File
 28 | import utils.{ParsingCandidate, URLHelper, Logging}
 29 | import com.gravity.goose.outputformatters.{StandardOutputFormatter, OutputFormatter}
 30 | 
 31 | /**
 32 |  * Created by Jim Plush
 33 |  * User: jim
 34 |  * Date: 8/18/11
 35 |  */
 36 | 
 37 | case class CrawlCandidate(config: Configuration, url: String, rawHTML: String = null)
 38 | 
 39 | class Crawler(config: Configuration) {
 40 | 
 41 |   import Crawler._
 42 | 
 43 |   def crawl(crawlCandidate: CrawlCandidate): Article = {
 44 |     val article = new Article()
 45 |     for {
 46 |       parseCandidate <- URLHelper.getCleanedUrl(crawlCandidate.url)
 47 |       rawHtml <- getHTML(crawlCandidate, parseCandidate)
 48 |       doc <- getDocument(parseCandidate.url.toString, rawHtml)
 49 |     } {
 50 |       trace("Crawling url: " + parseCandidate.url)
 51 | 
 52 |       val extractor = getExtractor
 53 |       val docCleaner = getDocCleaner
 54 |       val outputFormatter = getOutputFormatter
 55 | 
 56 |       article.finalUrl = parseCandidate.url.toString
 57 |       article.domain = parseCandidate.url.getHost
 58 |       article.linkhash = parseCandidate.linkhash
 59 |       article.rawHtml = rawHtml
 60 |       article.doc = doc
 61 |       article.rawDoc = doc.clone()
 62 | 
 63 |       article.title = extractor.getTitle(article)
 64 |       article.publishDate = config.publishDateExtractor.extract(doc)
 65 |       article.additionalData = config.getAdditionalDataExtractor.extract(doc)
 66 |       article.metaDescription = extractor.getMetaDescription(article)
 67 |       article.metaKeywords = extractor.getMetaKeywords(article)
 68 |       article.canonicalLink = extractor.getCanonicalLink(article)
 69 |       article.tags = extractor.extractTags(article)
 70 | 
 71 |       // before we do any calcs on the body itself let's clean up the document
 72 |       article.doc = docCleaner.clean(article)
 73 | 
 74 |       extractor.calculateBestNodeBasedOnClustering(article) match {
 75 |         case Some(node: Element) =>
 76 |           article.topNode = node
 77 |           article.movies = extractor.extractVideos(article.topNode)
 78 | 
 79 |           if (config.enableImageFetching) {
 80 |             trace(logPrefix + "Image fetching enabled...")
 81 |             val imageExtractor = getImageExtractor(article)
 82 |             try {
 83 |               if (article.rawDoc == null) {
 84 |                 article.topImage = new Image
 85 |               } else {
 86 |                 article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode)
 87 |               }
 88 |             } catch {
 89 |               case e: Exception => warn(e, e.toString)
 90 |             }
 91 |           }
 92 |           article.topNode = extractor.postExtractionCleanup(article.topNode)
 93 | 
 94 | 
 95 |           article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)
 96 |         case _ => trace("NO ARTICLE FOUND")
 97 |       }
 98 |       releaseResources(article)
 99 |       article
100 |     }
101 | 
102 |     article
103 |   }
104 | 
105 |   def getHTML(crawlCandidate: CrawlCandidate, parsingCandidate: ParsingCandidate): Option[String] = {
106 |     if (crawlCandidate.rawHTML != null) {
107 |       Some(crawlCandidate.rawHTML)
108 |     } else {
109 |       config.getHtmlFetcher.getHtml(config, parsingCandidate.url.toString) match {
110 |         case Some(html) =>
111 |           Some(html)
112 |         case _ => None
113 |       }
114 |     }
115 |   }
116 | 
117 | 
118 |   def getImageExtractor(article: Article): ImageExtractor = {
119 |     val httpClient: HttpClient = config.getHtmlFetcher.getHttpClient
120 |     new UpgradedImageIExtractor(httpClient, article, config)
121 |   }
122 | 
123 |   def getOutputFormatter: OutputFormatter = {
124 |     StandardOutputFormatter
125 |   }
126 | 
127 |   def getDocCleaner: DocumentCleaner = {
128 |     new StandardDocumentCleaner
129 |   }
130 | 
131 |   def getDocument(url: String, rawlHtml: String): Option[Document] = {
132 | 
133 |     try {
134 |       Some(Jsoup.parse(rawlHtml))
135 |     } catch {
136 |       case e: Exception => {
137 |         trace("Unable to parse " + url + " properly into JSoup Doc")
138 |         None
139 |       }
140 |     }
141 |   }
142 | 
143 |   def getExtractor: ContentExtractor = {
144 |     config.contentExtractor
145 |   }
146 | 
147 |   /**
148 |   * cleans up any temp files we have laying around like temp images
149 |   * removes any image in the temp dir that starts with the linkhash of the url we just parsed
150 |   */
151 |   def releaseResources(article: Article) {
152 |     trace(logPrefix + "STARTING TO RELEASE ALL RESOURCES")
153 | 
154 |     val dir: File = new File(config.localStoragePath)
155 | 
156 |     dir.list.foreach(filename => {
157 |       if (filename.startsWith(article.linkhash)) {
158 |         val f: File = new File(dir.getAbsolutePath + "/" + filename)
159 |         if (!f.delete) {
160 |           warn("Unable to remove temp file: " + filename)
161 |         }
162 |       }
163 |     })
164 |   }
165 | 
166 | }
167 | 
/** Companion supplying the logging helpers and the shared log prefix imported by the Crawler class. */
object Crawler extends Logging {

  /** Prefix prepended to some of the crawler's trace messages. */
  val logPrefix: String = "crawler: "
}


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/text/stopwords-en.txt:
--------------------------------------------------------------------------------
  1 | a's
  2 | able
  3 | about
  4 | above
  5 | according
  6 | accordingly
  7 | across
  8 | actually
  9 | after
 10 | afterwards
 11 | again
 12 | against
 13 | ain't
 14 | all
 15 | allow
 16 | allows
 17 | almost
 18 | alone
 19 | along
 20 | already
 21 | also
 22 | although
 23 | always
 24 | am
 25 | among
 26 | amongst
 27 | an
 28 | and
 29 | another
 30 | any
 31 | anybody
 32 | anyhow
 33 | anyone
 34 | anything
 35 | anyway
 36 | anyways
 37 | anywhere
 38 | apart
 39 | appear
 40 | appreciate
 41 | appropriate
 42 | are
 43 | aren't
 44 | around
 45 | as
 46 | aside
 47 | ask
 48 | asking
 49 | associated
 50 | at
 51 | available
 52 | away
 53 | awfully
 54 | be
 55 | became
 56 | because
 57 | become
 58 | becomes
 59 | becoming
 60 | been
 61 | before
 62 | beforehand
 63 | behind
 64 | being
 65 | believe
 66 | below
 67 | beside
 68 | besides
 69 | best
 70 | better
 71 | between
 72 | beyond
 73 | both
 74 | brief
 75 | but
 76 | by
 77 | c
 78 | c'mon
 79 | c's
 80 | came
 81 | campaign
 82 | can
 83 | can't
 84 | cannot
 85 | cant
 86 | cause
 87 | causes
 88 | certain
 89 | certainly
 90 | changes
 91 | clearly
 92 | co
 93 | com
 94 | come
 95 | comes
 96 | concerning
 97 | consequently
 98 | consider
 99 | considering
100 | contain
101 | containing
102 | contains
103 | corresponding
104 | could
105 | couldn't
106 | course
107 | currently
108 | definitely
109 | described
110 | despite
111 | did
112 | didn't
113 | different
114 | do
115 | does
116 | doesn't
117 | doing
118 | don't
119 | done
120 | down
121 | downwards
122 | during
123 | each
124 | edu
125 | eight
126 | either
127 | else
128 | elsewhere
129 | enough
130 | endorsed
131 | entirely
132 | especially
133 | et
134 | etc
135 | even
136 | ever
137 | every
138 | everybody
139 | everyone
140 | everything
141 | everywhere
142 | ex
143 | exactly
144 | example
145 | except
146 | far
147 | few
148 | fifth
149 | first
150 | financial
151 | five
152 | followed
153 | following
154 | follows
155 | for
156 | former
157 | formerly
158 | forth
159 | four
160 | from
161 | further
162 | furthermore
163 | get
164 | gets
165 | getting
166 | given
167 | gives
168 | go
169 | goes
170 | going
171 | gone
172 | got
173 | gotten
174 | greetings
175 | had
176 | hadn't
177 | happens
178 | hardly
179 | has
180 | hasn't
181 | have
182 | haven't
183 | having
184 | he
185 | he's
186 | hello
187 | help
188 | hence
189 | her
190 | here
191 | here's
192 | hereafter
193 | hereby
194 | herein
195 | hereupon
196 | hers
197 | herself
198 | hi
199 | him
200 | himself
201 | his
202 | hither
203 | hopefully
204 | how
205 | howbeit
206 | however
207 | i'd
208 | i'll
209 | i'm
210 | i've
211 | if
212 | ignored
213 | immediate
214 | in
215 | inasmuch
216 | inc
217 | indeed
218 | indicate
219 | indicated
220 | indicates
221 | inner
222 | insofar
223 | instead
224 | into
225 | inward
226 | is
227 | isn't
228 | it
229 | it'd
230 | it'll
231 | it's
232 | its
233 | itself
234 | just
235 | keep
236 | keeps
237 | kept
238 | know
239 | knows
240 | known
241 | last
242 | lately
243 | later
244 | latter
245 | latterly
246 | least
247 | less
248 | lest
249 | let
250 | let's
251 | like
252 | liked
253 | likely
254 | little
255 | look
256 | looking
257 | looks
258 | ltd
259 | mainly
260 | many
261 | may
262 | maybe
263 | me
264 | mean
265 | meanwhile
266 | merely
267 | might
268 | more
269 | moreover
270 | most
271 | mostly
272 | much
273 | must
274 | my
275 | myself
276 | name
277 | namely
278 | nd
279 | near
280 | nearly
281 | necessary
282 | need
283 | needs
284 | neither
285 | never
286 | nevertheless
287 | new
288 | next
289 | nine
290 | no
291 | nobody
292 | non
293 | none
294 | noone
295 | nor
296 | normally
297 | not
298 | nothing
299 | novel
300 | now
301 | nowhere
302 | obviously
303 | of
304 | off
305 | often
306 | oh
307 | ok
308 | okay
309 | old
310 | on
311 | once
312 | one
313 | ones
314 | only
315 | onto
316 | or
317 | other
318 | others
319 | otherwise
320 | ought
321 | our
322 | ours
323 | ourselves
324 | out
325 | outside
326 | over
327 | overall
328 | own
329 | particular
330 | particularly
331 | per
332 | perhaps
333 | placed
334 | please
335 | plus
336 | possible
337 | presumably
338 | probably
339 | provides
340 | quite
341 | quote
342 | quarterly
343 | rather
344 | really
345 | reasonably
346 | regarding
347 | regardless
348 | regards
349 | relatively
350 | respectively
351 | right
352 | said
353 | same
354 | saw
355 | say
356 | saying
357 | says
358 | second
359 | secondly
360 | see
361 | seeing
362 | seem
363 | seemed
364 | seeming
365 | seems
366 | seen
367 | self
368 | selves
369 | sensible
370 | sent
371 | serious
372 | seriously
373 | seven
374 | several
375 | shall
376 | she
377 | should
378 | shouldn't
379 | since
380 | six
381 | so
382 | some
383 | somebody
384 | somehow
385 | someone
386 | something
387 | sometime
388 | sometimes
389 | somewhat
390 | somewhere
391 | soon
392 | sorry
393 | specified
394 | specify
395 | specifying
396 | still
397 | sub
398 | such
399 | sup
400 | sure
401 | t's
402 | take
403 | taken
404 | tell
405 | tends
406 | than
407 | thank
408 | thanks
409 | thanx
410 | that
411 | that's
412 | thats
413 | the
414 | their
415 | theirs
416 | them
417 | themselves
418 | then
419 | thence
420 | there
421 | there's
422 | thereafter
423 | thereby
424 | therefore
425 | therein
426 | theres
427 | thereupon
428 | these
429 | they
430 | they'd
431 | they'll
432 | they're
433 | they've
434 | think
435 | third
436 | this
437 | thorough
438 | thoroughly
439 | those
440 | though
441 | three
442 | through
443 | throughout
444 | thru
445 | thus
446 | to
447 | together
448 | too
449 | took
450 | toward
451 | towards
452 | tried
453 | tries
454 | truly
455 | try
456 | trying
457 | twice
458 | two
459 | under
460 | unfortunately
461 | unless
462 | unlikely
463 | until
464 | unto
465 | up
466 | upon
467 | us
468 | use
469 | used
470 | useful
471 | uses
472 | using
473 | usually
474 | uucp
475 | value
476 | various
477 | very
478 | via
479 | viz
480 | vs
481 | want
482 | wants
483 | was
484 | wasn't
485 | way
486 | we
487 | we'd
488 | we'll
489 | we're
490 | we've
491 | welcome
492 | well
493 | went
494 | were
495 | weren't
496 | what
497 | what's
498 | whatever
499 | when
500 | whence
501 | whenever
502 | where
503 | where's
504 | whereafter
505 | whereas
506 | whereby
507 | wherein
508 | whereupon
509 | wherever
510 | whether
511 | which
512 | while
513 | whither
514 | who
515 | who's
516 | whoever
517 | whole
518 | whom
519 | whose
520 | why
521 | will
522 | willing
523 | wish
524 | with
525 | within
526 | without
527 | won't
528 | wonder
529 | would
530 | would
531 | wouldn't
532 | yes
533 | yet
534 | you
535 | you'd
536 | you'll
537 | you're
538 | you've
539 | your
540 | yours
541 | yourself
542 | yourselves
543 | zero
544 | official
545 | sharply
546 | criticized


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageSaver.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | package com.gravity.goose.images
 19 | 
 20 | /**
 21 |  * Created by Jim Plush
 22 |  * User: jim
 23 |  * Date: 8/18/11
 24 |  */
 25 | 
 26 | import org.apache.commons.io.IOUtils
 27 | import org.apache.http.HttpEntity
 28 | import org.apache.http.client.ClientProtocolException
 29 | import org.apache.http.client.HttpClient
 30 | import org.apache.http.client.methods.HttpGet
 31 | import org.apache.http.client.protocol.ClientContext
 32 | import org.apache.http.protocol.BasicHttpContext
 33 | import org.apache.http.protocol.HttpContext
 34 | import java.io._
 35 | import java.util.Random
 36 | import com.gravity.goose.utils.Logging
 37 | import com.gravity.goose.Configuration
 38 | import com.gravity.goose.network.HtmlFetcher
 39 | 
 40 | /**
 41 | * This class will be responsible for storing images to disk
 42 | *
 43 | * @author Jim Plush
 44 | */
 45 | object ImageSaver extends Logging {
 46 |   private def getFileExtension(config: Configuration, fileName: String): String = {
 47 |     var fileExtension: String = ""
 48 |     var mimeType: String = null
 49 |     try {
 50 |       val imageDims: ImageDetails = ImageUtils.getImageDimensions(config.imagemagickIdentifyPath, fileName)
 51 |       mimeType = imageDims.getMimeType
 52 |       if (mimeType == "GIF") {
 53 |         if (logger.isDebugEnabled) {
 54 |           logger.debug("SNEAKY GIF! " + fileName)
 55 |         }
 56 |         throw new SecretGifException
 57 |       }
 58 |       if (mimeType == "JPEG") {
 59 |         fileExtension = ".jpg"
 60 |       }
 61 |       else if (mimeType == "PNG") {
 62 |         fileExtension = ".png"
 63 |       }
 64 |       else {
 65 |         throw new IOException("BAD MIME TYPE: " + mimeType + " FILENAME:" + fileName)
 66 |       }
 67 |     }
 68 |     catch {
 69 |       case e: SecretGifException =>
 70 |         throw e
 71 |       case e: FileNotFoundException =>
 72 |         logger.error(e.getMessage)
 73 |       case e: IOException =>
 74 |         logger.error(e.getMessage)
 75 |         throw e
 76 |     }
 77 |     finally {
 78 |     }
 79 |     fileExtension
 80 |   }
 81 | 
 82 |   def fetchEntity(httpClient: HttpClient, imageSrc: String): Option[HttpEntity] = {
 83 | 
 84 |     val localContext: HttpContext = new BasicHttpContext
 85 |     localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
 86 |     val httpget = new HttpGet(imageSrc)
 87 |     val response = httpClient.execute(httpget, localContext)
 88 |     val respStatus: String = response.getStatusLine.toString
 89 |     if (!respStatus.contains("200")) {
 90 |       None
 91 |     } else {
 92 |       try {
 93 |         Some(response.getEntity)
 94 |       } catch {
 95 |         case e: Exception => warn(e, e.toString); None
 96 |       } finally {
 97 |         httpget.abort()
 98 |       }
 99 |     }
100 |   }
101 | 
102 | 
103 |   def copyInputStreamToLocalImage(entity: HttpEntity, linkhash: String, config: Configuration): String = {
104 |     val generator: Random = new Random
105 |     val randInt: Int = generator.nextInt
106 |     val localSrcPath = config.localStoragePath + "/" + linkhash + "_" + randInt
107 |     val instream: InputStream = entity.getContent
108 |     val outstream: OutputStream = new FileOutputStream(localSrcPath)
109 |     try {
110 |       trace("Storing image locally: " + localSrcPath)
111 |       IOUtils.copy(instream, outstream)
112 |       val fileExtension = ImageSaver.getFileExtension(config, localSrcPath)
113 |       if (fileExtension == "" || fileExtension == null) {
114 |         trace("EMPTY FILE EXTENSION: " + localSrcPath)
115 |         return null
116 |       }
117 |       val f: File = new File(localSrcPath)
118 |       if (f.length < config.minBytesForImages) {
119 |         if (logger.isDebugEnabled) {
120 |           logger.debug("TOO SMALL AN IMAGE: " + localSrcPath + " bytes: " + f.length)
121 |         }
122 |         return null
123 |       }
124 |       val newFilename = localSrcPath + fileExtension
125 |       val newFile: File = new File(newFilename)
126 |       f.renameTo(newFile)
127 |       trace("Image successfully Written to Disk")
128 |       newFilename
129 |     }
130 |     catch {
131 |       case e: Exception =>
132 |         throw e
133 |     }
134 |     finally {
135 |       instream.close()
136 |       outstream.close()
137 |     }
138 |   }
139 | 
140 |   /**
141 |   * stores an image to disk and returns the path where the file was written
142 |   *
143 |   * @return
144 |   */
145 |   def storeTempImage(httpClient: HttpClient, linkhash: String, imageSrcMaster: String, config: Configuration): String = {
146 | 
147 |     var imageSrc = imageSrcMaster
148 |     try {
149 |       imageSrc = imageSrc.replace(" ", "%20")
150 |       trace("Starting to download image: " + imageSrc)
151 | 
152 |       fetchEntity(httpClient, imageSrc) match {
153 |         case Some(entity) =>
154 |           try {
155 |             return copyInputStreamToLocalImage(entity, linkhash, config)
156 |           }
157 |           catch {
158 |             case e: SecretGifException =>
159 |               throw e
160 |             case e: Exception =>
161 |               logger.error(e.getMessage)
162 |               return null
163 |           }
164 |         case None =>
165 |           trace("Unable to get entity for: " + imageSrc)
166 |           return null
167 |       }
168 | 
169 |     }
170 |     catch {
171 |       case e: IllegalArgumentException =>
172 |         logger.warn(e.getMessage)
173 |       case e: SecretGifException =>
174 |         raise(e)
175 |       case e: ClientProtocolException =>
176 |         logger.error(e.toString)
177 |       case e: IOException =>
178 |         logger.error(e.toString)
179 |       case e: Exception =>
180 |         e.printStackTrace()
181 |         logger.error(e.toString)
182 |         e.printStackTrace()
183 |     }
184 |     finally {
185 | 
186 |     }
187 |     null
188 |   }
189 | 
190 |   private def raise(e: SecretGifException): Unit = {
191 |   }
192 | 
193 | 
194 | }
195 | 
196 | 
197 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose.outputformatters
 20 | 
 21 | import org.jsoup.nodes._
 22 | import org.apache.commons.lang.StringEscapeUtils
 23 | import org.jsoup.select.Elements
 24 | import com.gravity.goose.text.StopWords
 25 | import scala.collection.JavaConversions._
 26 | import org.slf4j.Logger
 27 | 
 28 | /**
 29 | * Created by Jim Plush
 30 | * User: jim
 31 | * Date: 8/17/11
 32 | */
 33 | 
 34 | trait OutputFormatter {
 35 |   val logPrefix = "outformat: "
 36 | 
 37 |   // used to remove tags within tags
 38 |   val tagReplace = "<[^>]+>".r
 39 | 
 40 |   def logger: Logger
 41 | 
 42 |   private def selectElements(query: String, topNode: Element): Elements = topNode match {
 43 |     case null => new Elements(List.empty[Element])
 44 |     case n => n.select(query)
 45 |   }
 46 |   
 47 |   /**
 48 |   * Depricated use {@link #getFormattedText(Element)}
 49 |   * @param topNode the top most node to format
 50 |   * @return the prepared Element
 51 |   */
 52 |   @Deprecated def getFormattedElement(topNode: Element): Element = {
 53 |     removeNodesWithNegativeScores(topNode)
 54 |     convertLinksToText(topNode)
 55 |     replaceTagsWithText(topNode)
 56 |     removeParagraphsWithFewWords(topNode)
 57 |     topNode
 58 |   }
 59 | 
 60 |   /**
 61 |   * Removes all unnecessarry elements and formats the selected text nodes
 62 |   * @param topNode the top most node to format
 63 |   * @return a formatted string with all HTML removed
 64 |   */
 65 |   def getFormattedText(topNode: Element): String = {
 66 |     removeNodesWithNegativeScores(topNode)
 67 |     convertLinksToText(topNode)
 68 |     replaceTagsWithText(topNode)
 69 |     removeParagraphsWithFewWords(topNode)
 70 |     convertToText(topNode)
 71 |   }
 72 | 
 73 |   /**
 74 |   * Depricated use {@link #getFormattedText(Element)}
 75 |   * takes an element and turns the P tags into \n\n
 76 |   *
 77 |   * @return
 78 |   */
 79 |   def convertToText(topNode: Element): String = topNode match {
 80 |     case null => ""
 81 |     case node => {
 82 |       (node.children().map((e: Element) => {
 83 |         StringEscapeUtils.unescapeHtml(e.text).trim
 84 |       })).toList.mkString("\n\n")
 85 |     }
 86 | 
 87 |   }
 88 | 
 89 |   /**
 90 |   * cleans up and converts any nodes that should be considered text into text
 91 |   */
 92 |   private def convertLinksToText(topNode: Element) {
 93 |     if (topNode != null) {
 94 |       logger.trace(logPrefix + "Turning links to text")
 95 |       val baseUri = topNode.baseUri()
 96 | 
 97 |       val links = topNode.getElementsByTag("a")
 98 |       for (item <- links) {
 99 |         if (item.getElementsByTag("img").isEmpty) {
100 |           val tn = new TextNode(item.text, baseUri)
101 |           item.replaceWith(tn)
102 |         }
103 |       }
104 |     }
105 | 
106 |   }
107 | 
108 |   /**
109 |   * if there are elements inside our top node that have a negative gravity score, let's
110 |   * give em the boot
111 |   */
112 |   private def removeNodesWithNegativeScores(topNode: Element) {
113 |     def tryInt(text: String): Int = try {
114 |       Integer.parseInt(text)
115 |     } catch {
116 |       case _: Exception => 0
117 |     }
118 | 
119 |     val gravityItems = selectElements("*[gravityScore]", topNode)
120 |     for (item <- gravityItems) {
121 |       val score = tryInt(item.attr("gravityScore"))
122 |       if (score < 1) {
123 |         item.remove()
124 |       }
125 |     }
126 |   }
127 | 
128 |   /**
129 |   * replace common tags with just text so we don't have any crazy formatting issues
130 |   * so replace <br>, <i>, <strong>, etc.... with whatever text is inside them
131 |   */
132 |   private def replaceTagsWithText(topNode: Element) {
133 |     if (topNode != null) {
134 |       val baseUri = topNode.baseUri()
135 |       val bolds = topNode.getElementsByTag("b")
136 |       for (item <- bolds) {
137 |         val tn = new TextNode(getTagCleanedText(item), baseUri)
138 |         item.replaceWith(tn)
139 |       }
140 | 
141 |       val strongs = topNode.getElementsByTag("strong")
142 |       for (item <- strongs) {
143 |         val tn = new TextNode(getTagCleanedText(item), baseUri)
144 |         item.replaceWith(tn)
145 |       }
146 | 
147 |       val italics = topNode.getElementsByTag("i")
148 |       for (item <- italics) {
149 |         val tn = new TextNode(getTagCleanedText(item), baseUri)
150 |         item.replaceWith(tn)
151 | 
152 |       }
153 |     }
154 |   }
155 | 
156 |   private def getTagCleanedText(item: Node): String = {
157 |     val sb = new StringBuilder()
158 | 
159 |     item.childNodes().foreach {
160 |       case childText: TextNode => {
161 |         sb.append(childText.getWholeText)
162 |       }
163 |       case childElement: Element => {
164 |         sb.append(childElement.outerHtml())
165 |       }
166 |       case _ =>
167 |     }
168 | 
169 |     val text = tagReplace replaceAllIn(sb.toString(), "")
170 |     text
171 |   }
172 | 
173 |   /**
174 |   * remove paragraphs that have less than x number of words, would indicate that it's some sort of link
175 |   */
176 |   private def removeParagraphsWithFewWords(topNode: Element) {
177 |     if (topNode != null) {
178 |       if (logger.isDebugEnabled) {
179 |         logger.debug("removeParagraphsWithFewWords starting...")
180 |       }
181 | 
182 |       val allNodes = topNode.getAllElements
183 | 
184 |       for (el <- allNodes) {
185 |         try {
186 |           val stopWords = StopWords.getStopWordCount(el.text)
187 |           if (stopWords.getStopWordCount < 3 && el.getElementsByTag("object").size == 0 && el.getElementsByTag("embed").size == 0) {
188 |             logger.debug("removeParagraphsWithFewWords - swcnt: %d removing text: %s".format(stopWords.getStopWordCount, el.text()))
189 |             el.remove()
190 |           }
191 |         }
192 |         catch {
193 |           case e: IllegalArgumentException => {
194 |             logger.error(e.getMessage)
195 |           }
196 |         }
197 |       }
198 | 
199 |       Option(topNode.getElementsByTag("p").first()).foreach {
200 |         case firstModdedNode: Element => {
201 |           // check for open parens as the first paragraph, e.g. businessweek4.txt (IT)
202 |           val trimmed = firstModdedNode.text().trim()
203 |           if (trimmed.startsWith("(") && trimmed.endsWith(")")) {
204 |             logger.trace("Removing parenthesis paragraph that is first paragraph")
205 |             firstModdedNode.remove()
206 |           }
207 |         }
208 |       }
209 |     }
210 |   }
211 | }


--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/ContentFetcher.scala:
--------------------------------------------------------------------------------
  1 | package com.aamend.spark.gdelt
  2 | 
  3 | import java.io.File
  4 | 
  5 | import com.gravity.goose.{Configuration, Goose}
  6 | import org.apache.commons.lang.StringUtils
  7 | import org.apache.spark.ml.Transformer
  8 | import org.apache.spark.ml.param._
  9 | import org.apache.spark.ml.util._
 10 | import org.apache.spark.sql.types._
 11 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
 12 | 
 13 | import scala.util.Try
 14 | 
 15 | trait ContentFetcherParams extends Params with DefaultParamsWritable {
 16 |   val inputColumn = new Param[String](this, "inputColumn", "(MANDATORY) The input column containing URLs")
 17 |   val outputContentColumn = new Param[String](this, "outputContentColumn", "(OPTIONAL) Field that will contain HTML content")
 18 |   val outputTitleColumn = new Param[String](this, "outputTitleColumn", "(OPTIONAL) Field that will contain HTML title")
 19 |   val outputDescriptionColumn = new Param[String](this, "outputDescriptionColumn", "(OPTIONAL) Field that will contain HTML description")
 20 |   val outputKeywordsColumn = new Param[String](this, "outputKeywordsColumn", "(OPTIONAL) Field that will contain HTML keywords")
 21 |   val outputPublishDateColumn = new Param[String](this, "outputPublishDateColumn", "(OPTIONAL) Field that will contain HTML publishDate")
 22 |   val outputImageUrlColumn = new Param[String](this, "outputImageUrlColumn", "(OPTIONAL) Field that will contain HTML image header URL")
 23 |   val outputImageBase64Column = new Param[String](this, "outputImageBase64Column", "(OPTIONAL) Field that will contain HTML image header in Base64")
 24 |   val userAgent = new Param[String](this, "userAgent", "(OPTIONAL) User agent that is sent with your web requests to extract URL content")
 25 |   val socketTimeout = new Param[Int](this, "socketTimeout", "(OPTIONAL) Socket timeout (ms)")
 26 |   val connectionTimeout = new Param[Int](this, "connectionTimeout", "(OPTIONAL) Connection timeout (ms)")
 27 |   val imagemagickConvert = new Param[String](this, "imagemagickConvert", "(OPTIONAL) imagemagick convert executable")
 28 |   val imagemagickIdentify = new Param[String](this, "imagemagickIdentify", "(OPTIONAL) imagemagick identify executable")
 29 | }
 30 | 
 31 | class ContentFetcher(override val uid: String) extends Transformer with ContentFetcherParams {
 32 | 
 33 |   def setImagemagickConvert(value: String): this.type = set(imagemagickConvert, value)
 34 | 
 35 |   setDefault(imagemagickConvert -> "/usr/local/bin/convert")
 36 | 
 37 |   def setImagemagickIdentify(value: String): this.type = set(imagemagickIdentify, value)
 38 | 
 39 |   setDefault(imagemagickIdentify -> "/usr/local/bin/identify")
 40 | 
 41 |   def setUserAgent(value: String): this.type = set(userAgent, value)
 42 | 
 43 |   setDefault(userAgent -> "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")
 44 | 
 45 |   def setSocketTimeout(value: Int): this.type = set(socketTimeout, value)
 46 | 
 47 |   setDefault(socketTimeout -> 10000)
 48 | 
 49 |   def setConnectionTimeout(value: Int): this.type = set(connectionTimeout, value)
 50 | 
 51 |   setDefault(connectionTimeout -> 10000)
 52 | 
 53 |   def setInputCol(value: String): this.type = set(inputColumn, value)
 54 | 
 55 |   setDefault(inputColumn -> "sourceURL")
 56 | 
 57 |   def setOutputContentCol(value: String): this.type = set(outputContentColumn, value)
 58 | 
 59 |   setDefault(outputContentColumn -> "")
 60 | 
 61 |   def setOutputTitleCol(value: String): this.type = set(outputTitleColumn, value)
 62 | 
 63 |   setDefault(outputTitleColumn -> "")
 64 | 
 65 |   def setOutputDescriptionCol(value: String): this.type = set(outputDescriptionColumn, value)
 66 | 
 67 |   setDefault(outputDescriptionColumn -> "")
 68 | 
 69 |   def setOutputKeywordsCol(value: String): this.type = set(outputKeywordsColumn, value)
 70 | 
 71 |   setDefault(outputKeywordsColumn -> "")
 72 | 
 73 |   def setOutputPublishDateCol(value: String): this.type = set(outputPublishDateColumn, value)
 74 | 
 75 |   setDefault(outputPublishDateColumn -> "")
 76 | 
 77 |   def setOutputImageUrlCol(value: String): this.type = set(outputImageUrlColumn, value)
 78 | 
 79 |   setDefault(outputImageUrlColumn -> "")
 80 | 
 81 |   def setOutputImageBase64Col(value: String): this.type = set(outputImageBase64Column, value)
 82 | 
 83 |   setDefault(outputImageBase64Column -> "")
 84 | 
 85 |   def this() = this(Identifiable.randomUID("com/gravity/goose"))
 86 | 
 87 |   override def transform(origDS: Dataset[_]): DataFrame = {
 88 | 
 89 |     val outputFields = loadOutputFields()
 90 | 
 91 |     // Make sure the URL field exist
 92 |     require(origDS.schema.exists(s => s.name == $(inputColumn) && s.dataType == StringType), "Field [" + $(inputColumn) + "] is not valid")
 93 | 
 94 |     // Make sure at least one output field is specified
 95 |     require(outputFields.nonEmpty, "At least one output field should be specified")
 96 | 
 97 |     // Make sure each specified output field does not exist
 98 |     outputFields.values.foreach(outputField => require(!origDS.schema.exists(_.name == outputField), s"Field [$outputField] already exist"))
 99 | 
100 |     // This intermediate dataset to make sure we don't scrape more than once a same URL
101 |     val urlDF = origDS.select($(inputColumn)).dropDuplicates($(inputColumn))
102 | 
103 |     // If Image fetching enabled, we need path to image magic and convert
104 |     if(StringUtils.isNotEmpty($(outputImageUrlColumn)) || StringUtils.isNotEmpty($(outputImageBase64Column))) {
105 |       require(StringUtils.isNotEmpty($(imagemagickConvert)) && Try(new File($(imagemagickConvert))).isSuccess, "imagemagick convert executable needs to be specified for Image fetching")
106 |       require(StringUtils.isNotEmpty($(imagemagickIdentify)) && Try(new File($(imagemagickIdentify))).isSuccess, "imagemagick identify executable needs to be specified for Image fetching")
107 |     }
108 | 
109 |     // Append URL dataframe with article annotators
110 |     val urlContentRDD = urlDF.rdd.mapPartitions(rows => {
111 | 
112 |       // Initialize Goose only once for each partition
113 |       val conf = new Configuration()
114 |       if(StringUtils.isNotEmpty($(outputImageUrlColumn)) || StringUtils.isNotEmpty($(outputImageBase64Column))) {
115 |         conf.setEnableImageFetching(true)
116 |         conf.setImagemagickConvertPath($(imagemagickConvert))
117 |         conf.setImagemagickIdentifyPath($(imagemagickIdentify))
118 |       } else {
119 |         conf.setEnableImageFetching(false)
120 |       }
121 |       conf.setBrowserUserAgent($(userAgent))
122 |       conf.setSocketTimeout($(socketTimeout))
123 |       conf.setConnectionTimeout($(connectionTimeout))
124 |       val goose = new Goose(conf)
125 | 
126 |       // Scrape each URL individually
127 |       val articles = scrapeContent(rows.map(_.getAs[String]($(inputColumn))), goose)
128 | 
129 |       // Convert articles as Row
130 |       articles.map(article => {
131 |         val appended: Seq[Any] = outputFields.map { case (key, _) =>
132 |           key match {
133 |             case ANNOTATOR_TITLE => article.title.getOrElse("")
134 |             case ANNOTATOR_DESCRIPTION => article.description.getOrElse("")
135 |             case ANNOTATOR_CONTENT => article.content.getOrElse("")
136 |             case ANNOTATOR_KEYWORDS => article.keywords
137 |             case ANNOTATOR_PUBLISH_DATE => article.publishDate.orNull
138 |             case ANNOTATOR_IMAGE_URL => article.imageURL.getOrElse("")
139 |             case ANNOTATOR_IMAGE_BASE64 => article.imageBase64.getOrElse("")
140 |           }
141 |         }.toSeq
142 |         Row.fromSeq(Seq(article.url) ++ appended)
143 |       })
144 |     })
145 | 
146 |     // Transform RDD of Row to Dataframe
147 |     val contentDF = origDS.sqlContext.createDataFrame(urlContentRDD, transformSchema(urlDF.schema))
148 | 
149 |     // Join articles back to any duplicate URL dataset
150 |     contentDF.join(origDS, List($(inputColumn)))
151 | 
152 |   }
153 | 
154 |   private def loadOutputFields(): Map[String, String] = {
155 |     Map(
156 |       ANNOTATOR_TITLE -> $(outputTitleColumn),
157 |       ANNOTATOR_DESCRIPTION -> $(outputDescriptionColumn),
158 |       ANNOTATOR_CONTENT -> $(outputContentColumn),
159 |       ANNOTATOR_KEYWORDS -> $(outputKeywordsColumn),
160 |       ANNOTATOR_PUBLISH_DATE -> $(outputPublishDateColumn),
161 |       ANNOTATOR_IMAGE_BASE64 -> $(outputImageBase64Column),
162 |       ANNOTATOR_IMAGE_URL -> $(outputImageUrlColumn)
163 |     ).filter(s => StringUtils.isNotEmpty(s._2))
164 |   }
165 | 
166 |   override def transformSchema(schema: StructType): StructType = {
167 |     StructType(
168 |       schema.seq ++ loadOutputFields().map { case (key, value) =>
169 |         key match {
170 |           case ANNOTATOR_TITLE => StructField(value, StringType, nullable = false)
171 |           case ANNOTATOR_DESCRIPTION => StructField(value, StringType, nullable = false)
172 |           case ANNOTATOR_CONTENT => StructField(value, StringType, nullable = false)
173 |           case ANNOTATOR_KEYWORDS => StructField(value, ArrayType.apply(StringType), nullable = false)
174 |           case ANNOTATOR_PUBLISH_DATE => StructField(value, DateType, nullable = true)
175 |           case ANNOTATOR_IMAGE_URL => StructField(value, StringType, nullable = true)
176 |           case ANNOTATOR_IMAGE_BASE64 => StructField(value, StringType, nullable = true)
177 |         }
178 |       }
179 |     )
180 |   }
181 | 
182 |   override def copy(extra: ParamMap): Transformer = {
183 |     defaultCopy(extra)
184 |   }
185 | }
186 | 
/**
 * Companion object giving access to persisted ContentFetcher instances through
 * DefaultParamsReadable.
 */
object ContentFetcher extends DefaultParamsReadable[ContentFetcher] {

  /** Loads a ContentFetcher previously saved at `path`; delegates to the inherited loader. */
  override def load(path: String): ContentFetcher = {
    super.load(path)
  }
}
190 | 


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageUtils.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | package com.gravity.goose.images
 19 | 
 20 | /**
 21 |  * Created by Jim Plush
 22 |  * User: jim
 23 |  * Date: 8/18/11
 24 |  */
 25 | 
 26 | import javax.imageio.ImageIO
 27 | import java.awt.color.CMMException
 28 | import java.awt.image.BufferedImage
 29 | 
 30 | import com.gravity.goose.utils.{Logging, URLHelper}
 31 | import org.apache.http.client.HttpClient
 32 | import org.apache.http.HttpEntity
 33 | import org.apache.http.protocol.{BasicHttpContext, HttpContext}
 34 | import org.apache.http.client.protocol.ClientContext
 35 | import java.util.Base64
 36 | import java.io._
 37 | import java.util
 38 | 
 39 | import com.gravity.goose.Configuration
 40 | import com.gravity.goose.text.{HashUtils, string}
 41 | import org.apache.http.util.EntityUtils
 42 | import org.apache.commons.io.IOUtils
 43 | import com.gravity.goose.network.{HtmlFetcher, ImageFetchException}
 44 | 
 45 | import scala.util.Try
 46 | import scala.util.matching.Regex
 47 | 
 48 | object ImageUtils extends Logging {
  // Matches a single literal space; used in this object to tokenize `identify` output and to escape spaces in image URLs.
  val spaceRegex: Regex = " ".r
  // Matches the literal "x" that separates width from height in a "WxH" geometry token.
  val xRegex: Regex = "x".r
 51 | 
 52 |   /**
 53 |   * User: Jim Plush
 54 |   * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get
 55 |   * this will use imageMagick since the Java IO and imaging shit SUCKS for getting mime types and file info for jpg and png files
 56 |   *
 57 |   * @return
 58 |   */
 59 |   def getImageDimensions(identifyProgram: String, filePath: String): ImageDetails = {
 60 |     val imageInfo = execToString(Array(identifyProgram, filePath))
 61 |     val imageDetails: ImageDetails = new ImageDetails
 62 |     if (imageInfo == null || imageInfo.contains("no decode delegate for this image format")) {
 63 |       throw new IOException("Unable to get Image Information (no decode delegate) for: " + filePath + "\n\tcommand '" + identifyProgram + " " + filePath + "' returned: " + imageInfo)
 64 |     }
 65 |     val infoParts = spaceRegex.split(imageInfo)
 66 |     val mimeType = infoParts.lift(1).getOrElse(string.empty)
 67 |     val (width, height) = infoParts.lift(2) match {
 68 |       case Some(dimensions) =>
 69 |         val pair = xRegex.split(dimensions)
 70 |         if (pair.length > 1) {
 71 |           val wStr = pair(0)
 72 |           val hStr = pair(1)
 73 | 
 74 |           (string.tryToInt(wStr).getOrElse(0), string.tryToInt(hStr).getOrElse(0))
 75 |         } else {
 76 |           (0, 0)
 77 |         }
 78 |       case None => (0, 0)
 79 |     }
 80 |     imageDetails.setMimeType(mimeType)
 81 |     imageDetails.setWidth(width)
 82 |     imageDetails.setHeight(height)
 83 |     imageDetails
 84 |   }
 85 | 
 86 |   def readImageBase64(file: File): String = {
 87 |     val fileInputStreamReader = new FileInputStream(file)
 88 |     val bytes = new Array[Byte](file.length.asInstanceOf[Int])
 89 |     fileInputStreamReader.read(bytes)
 90 |     Base64.getEncoder.encodeToString(bytes)
 91 |   }
 92 | 
 93 |   /**
 94 |   * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get, uses the built in java commands
 95 |   *
 96 |   * @return
 97 |   */
 98 |   def getImageDimensionsJava(filePath: String): util.HashMap[String, Integer] = {
 99 |     var image: BufferedImage = null
100 |     try {
101 |       val f: File = new File(filePath)
102 |       image = ImageIO.read(f)
103 |       val results: util.HashMap[String, Integer] = new util.HashMap[String, Integer]
104 |       results.put("height", image.getHeight)
105 |       results.put("width", image.getWidth)
106 |       results
107 |     }
108 |     catch {
109 |       case e: CMMException =>
110 |         logger.error("ERROR READING FILE: " + filePath + " \n", e)
111 |         throw new IOException("Unable to read file: " + filePath)
112 |     }
113 |     finally {
114 |       if (image != null) {
115 |         try {
116 |           image.flush()
117 |         }
118 |         catch {
119 |           case _: Exception =>
120 |         }
121 |       }
122 |     }
123 |   }
124 | 
125 |   /**
126 |   * Tries to exec the command, waits for it to finish, logs errors if exit
127 |   * status is nonzero, and returns true if exit status is 0 (success).
128 |   *
129 |   * @param command Description of the Parameter
130 |   * @return Description of the Return Value
131 |   */
132 |   private def execToString(command: Array[String]): String = {
133 |     var p: Process = null
134 |     var in: BufferedReader = null
135 |     try {
136 |       p = Runtime.getRuntime.exec(command)
137 |       in = new BufferedReader(new InputStreamReader(p.getInputStream))
138 |       var line: String = null
139 |       line = in.readLine
140 |       p.waitFor
141 |       return line
142 |     }
143 |     catch {
144 |       case e: IOException =>
145 |         logger.error(e.toString, e)
146 |       case e: InterruptedException =>
147 |         logger.error(e.toString, e)
148 |         throw new RuntimeException(e)
149 |     }
150 |     finally {
151 |       if (in != null) {
152 |         try {
153 |           in.close()
154 |         }
155 |         catch {
156 |           case _: IOException =>
157 |         }
158 |       }
159 |       if (p != null) {
160 |         p.destroy()
161 |       }
162 |     }
163 |     null
164 |   }
165 | 
166 |   /**
167 |   * Writes an image src http string to disk as a temporary file and returns the LocallyStoredImage object that has the info you should need
168 |   * on the image
169 |   */
170 |   def storeImageToLocalFile(httpClient: HttpClient, linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {
171 | 
172 |     try {
173 |       // check for a cache hit already on disk
174 |       readExistingFileInfo(linkhash, imageSrc, config) match {
175 |         case Some(locallyStoredImage) =>
176 |           trace("Image already cached on disk: " + imageSrc)
177 |           return Some(locallyStoredImage)
178 |         case None =>
179 |       }
180 | 
181 |       trace("Not found locally...starting to download image: " + imageSrc)
182 |       fetchEntity(httpClient, imageSrc, config) match {
183 |         case Some(entity) =>
184 |           trace("Got entity for " + imageSrc)
185 |           writeEntityContentsToDisk(entity, linkhash, imageSrc, config) match {
186 |             case Some(locallyStoredImage) => trace("Img Write successfull to disk"); Some(locallyStoredImage)
187 |             case None => trace("Unable to write contents to disk: " + imageSrc); None
188 |           }
189 |         case None => trace("Unable to fetch entity for: " + imageSrc); None
190 |       }
191 |     } catch {
192 |       case e: Exception =>
193 |         info(e, e.toString)
194 |         None
195 |     }
196 | 
197 |   }
198 | 
199 | 
200 |   def readExistingFileInfo(linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {
201 |     val localImageName = getLocalFileName(linkhash, imageSrc, config)
202 |     val imageFile = new File(localImageName)
203 |     if (imageFile.exists()) {
204 |       try {
205 |         trace("Reading image from disk: " + localImageName)
206 |         val imageDetails = getImageDimensions(config.imagemagickIdentifyPath, localImageName)
207 |         val mimeType = imageDetails.getMimeType.toLowerCase
208 |         val base64 = ImageUtils.readImageBase64(new File(localImageName))
209 |         Some(LocallyStoredImage(imageSrc, mimeType, base64, imageFile.length(), imageDetails.getHeight, imageDetails.getWidth))
210 |       } catch {
211 |         case e: Exception =>
212 |           trace(e, "Unable to get image file dimensions & extension name!")
213 |           None
214 |       }
215 |     } else {
216 |       None
217 |     }
218 | 
219 |   }
220 | 
221 |   def writeEntityContentsToDisk(entity: HttpEntity, linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {
222 | 
223 |     val localSrcPath = getLocalFileName(linkhash, imageSrc, config)
224 |     val outstream: OutputStream = new FileOutputStream(localSrcPath)
225 |     val instream: InputStream = entity.getContent
226 |      trace("Content Length: " + entity.getContentLength)
227 |     try {
228 |       val fileCopyBytes = IOUtils.copy(instream, outstream)
229 |       trace(fileCopyBytes + " bytes copied to disk")
230 |     } finally {
231 |       Try {
232 |         outstream.flush()
233 |         outstream.close()
234 |         instream.close()
235 |       }
236 |     }
237 |     EntityUtils.consume(entity)
238 |     trace("Content Length: " + entity.getContentLength)
239 |     readExistingFileInfo(linkhash, imageSrc, config)
240 | 
241 |   }
242 | 
243 |   def getLocalFileName(linkhash: String, imageSrc: String, config: Configuration): String = {
244 |     val imageHash = HashUtils.md5(imageSrc)
245 |     config.localStoragePath + "/" + linkhash + "_" + imageHash
246 |   }
247 | 
248 | 
249 |   def cleanImageSrcString(imgSrc: String): String = spaceRegex.replaceAllIn(imgSrc, "%20")
250 | 
251 |   def fetchEntity(httpClient: HttpClient, imageSrc: String, config: Configuration): Option[HttpEntity] = {
252 | 
253 |     URLHelper.tryToHttpGet(imageSrc) match {
254 |       case Some(httpget) =>
255 |         val localContext: HttpContext = new BasicHttpContext
256 |         localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
257 |         val response = try {
258 |           config.getHtmlFetcher.getHttpClient.execute(httpget, localContext)
259 |         }
260 |         catch {
261 |           case ex: Exception => throw new ImageFetchException(imageSrc, ex)
262 |         }
263 | 
264 |         val respStatus = response.getStatusLine.getStatusCode
265 | 
266 | 
267 |         if (respStatus != 200) {
268 |           None
269 |         } else {
270 |           try {
271 |             Option(response.getEntity)
272 |           } catch {
273 |             case e: Exception => warn(e, e.toString); httpget.abort(); None
274 |           }
275 |         }
276 |       case None =>
277 |         warn("Unable to parse imageSrc: '" + imageSrc + "' into HttpGet")
278 |         None
279 |     }
280 | 
281 |   }
282 | 
283 | 
284 | }
285 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoEthnic.txt:
--------------------------------------------------------------------------------
  1 | CODE	LABEL
  2 | aar	Afar
  3 | abk	Abkhaz
  4 | abr	Aboriginal-Australians
  5 | ace	Acehnese
  6 | acg	Achang
  7 | ach	Acholi
  8 | ada	Ga
  9 | adi	Adivasi
 10 | adj	Adjarians
 11 | ady	Adyghe
 12 | afa	Black-African
 13 | afr	Afrikaners
 14 | ahm	Ahmadis
 15 | ain	Ainu
 16 | aja	Aja
 17 | aka	Akan
 18 | aku	Aku
 19 | ala	Alawi
 20 | alb	Albanian
 21 | ale	Aleut
 22 | alg	Algonquian
 23 | alt	Altay
 24 | alu	Alur
 25 | amb	Ambonese
 26 | ame	Americo-Liberians
 27 | amh	Amhara
 28 | anp	Angika speakers
 29 | apa	Apache
 30 | ara	Arab
 31 | ARB	Arab
 32 | arg	Aragonese
 33 | arm	Armenian
 34 | arn	Mapuche
 35 | arp	Arapaho
 36 | arw	Arawak
 37 | asa	Asian
 38 | ash	Ashkenazi Jews
 39 | asm	Assamese
 40 | ast	Asturian
 41 | asy	Assyrian
 42 | ata	Atacamenos
 43 | atg	Argentinians
 44 | ath	Athabaskan
 45 | ats	Agnostic/Athiest
 46 | aus	Australians
 47 | auu	Austrians
 48 | ava	Caucasian Avars
 49 | awa	Awadhi
 50 | aym	Aymara
 51 | aze	Azerbaijani
 52 | bad	Baganda
 53 | bah	Bahais
 54 | bai	Bamileke
 55 | bak	Bashkirs
 56 | bal	Baloch
 57 | bam	Bambara
 58 | ban	Balinese
 59 | baq	Basque
 60 | bar	Bari
 61 | bas	Basoga
 62 | bay	Gbaya
 63 | bda	Rakhine
 64 | bej	Beja
 65 | bel	Belarusians
 66 | bem	Bemba
 67 | ben	Bengali-Hindu
 68 | ber	Berber
 69 | bey	Beydan
 70 | bho	Bhojpuri
 71 | bih	Bihari
 72 | bii	Bai
 73 | bik	Bicolano
 74 | bin	Edo
 75 | bis	Urban ni-Vanautu
 76 | bke	Bateke
 77 | bkn	Bakongo
 78 | bkw	Bakweri
 79 | bla	Siksikawa
 80 | blg	Blang
 81 | blk	Balkars
 82 | bln	Balanta
 83 | bmr	Bamar
 84 | bni	Beni-Shugal-Gumez
 85 | bnt	Bantu
 86 | bny	Banyarwanda
 87 | bod	Bodo
 88 | bod	Tibetan
 89 | bol	Bolivia
 90 | bon	Bonan
 91 | bos	Bosniaks
 92 | bou	Buyei
 93 | bra	Brijwasi
 94 | brb	Bariba
 95 | bre	Breton
 96 | brh	Brahui
 97 | brk	Burakumin
 98 | brm	Kurichiya
 99 | bsh	Bushmen
100 | bst	Baster
101 | bsu	Subiya
102 | bte	Beti-Pahuin
103 | btk	Batak
104 | bua	Buryat
105 | bud	Buddhist
106 | bug	Bugis
107 | bul	Bulgarian
108 | byn	Bilen
109 | cab	Cabindan-Mayombe
110 | cad	Caddo
111 | cap	Cape Verdean
112 | car	Kali'na
113 | cat	Catalan
114 | ceb	Cebuano
115 | cha	Chamorro
116 | chc	Chukchi
117 | che	Chechen
118 | chg	Chagatai
119 | chi	Chinese
120 | chk	Chuukese
121 | chl	Chileans
122 | chm	Mari
123 | chn	Chinook
124 | cho	Choctaw
125 | chp	Chipewyan
126 | chr	Cherokee
127 | cht	Ch'orti'
128 | chv	Chuvash
129 | chw	Chewa
130 | chy	Cheyenne
131 | cir	Adyghe
132 | cmc	Cham
133 | col	Colombian
134 | con	Confusian
135 | cop	Coptic Christians
136 | cor	Cornish
137 | cos	Corsican
138 | cot	Cotiers
139 | cpe	English-Creole
140 | cpf	French-Creole
141 | cpp	Portuguese-Creole
142 | cre	Cree
143 | crh	Crimean Tatar
144 | cri	Christian
145 | cro	Orthodox Christian
146 | crp	Creole
147 | csb	Kashubian
148 | csr	Costa Ricans
149 | cth	Catholics
150 | cus	Cushitic
151 | cze	Czech
152 | dai	Dai
153 | dak	Sioux
154 | dal	Dalit
155 | dam	Damara
156 | dan	Danes
157 | dao	Yao (Asia)
158 | dar	Dargwa
159 | dau	Daur
160 | day	Dayak
161 | del	Lenape
162 | den	Slavey
163 | dgr	Dogrib
164 | din	Dinka
165 | div	Maldivian
166 | dje	Djerma-Songhai
167 | doi	Dogras
168 | dom	Dominicans
169 | don	Dong
170 | dox	Dongxiang
171 | dra	Dravidian
172 | dru	Druze
173 | drz	Druze
174 | dsb	Lower Sorbian
175 | dua	Duala
176 | dut	Dutch
177 | dyu	Dyula
178 | dzo	Ngalop
179 | eat	East Timorese
180 | ecu	Ecuadorians
181 | efi	Efik
182 | ein	East Indian
183 | eka	Ekajuk
184 | eng	English
185 | esh	Eshira
186 | est	Estonian
187 | eth	Ethiopian-Jews
188 | eur	Europeans
189 | eve	Evenks
190 | ewe	Ewe
191 | ewo	Ewondo
192 | fan	Fang
193 | fao	Faroese
194 | fat	Fante
195 | fij	Fijian
196 | fil	Filipino
197 | fin	Finns
198 | fiu	Finno-Ugric
199 | fon	Fon
200 | fre	French
201 | fri	Santals
202 | frr	Frisians
203 | fru	Fur
204 | ful	Fula
205 | fur	Friulan
206 | gar	Garifuna
207 | gay	Gayo
208 | gba	Gbaya
209 | gel	Gelao
210 | geo	Georgian
211 | ger	German
212 | gia	Gia Rai
213 | gil	Kiribati
214 | gin	Gin
215 | gio	Gio
216 | gla	Gaels
217 | gle	Irish
218 | glg	Galician
219 | glv	Manx
220 | gon	Gondi
221 | gor	Gorontalonese
222 | gra	Grassfielders
223 | grb	Grebo
224 | gre	Greek
225 | grn	Guarani
226 | gsw	Swiss Germans
227 | gua	Guatemalan
228 | guj	Gujarati
229 | gun	Guan
230 | gwi	Gwich'in
231 | had	Hadjerai
232 | hai	Haida
233 | har	Harari
234 | hat	Haitian
235 | hau	Hausa
236 | haw	Hawaiian
237 | haz	Hazara
238 | her	Herero
239 | hgh	Hill Tribes
240 | hil	Hiligayon
241 | him	Himachali
242 | hin	Hindu
243 | hjw	Hasidic
244 | hmn	Hmong
245 | hmo	Hiri Motu
246 | hni	Hani
247 | hoa	Hoa
248 | hon	Hondurans
249 | hrt	Haratin
250 | hrv	Croats
251 | hsb	Upper Sorbian
252 | hui	Hui
253 | hun	Hungarian
254 | hup	Hupa
255 | hut	Hutu
256 | iba	Iban
257 | ibo	Igbo
258 | ice	Icelanders
259 | idg	Indigenous
260 | idn	Indian
261 | iii	Yi
262 | ijo	Ijaw
263 | iku	Inuit
264 | ilo	Ilocono
265 | ind	Indonesian
266 | inh	Ingush
267 | ipk	Inupiat
268 | ira	Iranian
269 | iro	Iroquois
270 | ita	Itallian
271 | jan	Jain
272 | jav	Javanese
273 | jew	Jewish
274 | jhw	Jehovah's Witnesses
275 | jin	Jino
276 | jol	Jola
277 | jpn	Japanese
278 | kaa	Karakalpak
279 | kab	Kabyle
280 | kac	Kachin
281 | kad	Kadazan
282 | kak	Kakwa-Nubian
283 | kal	Kalaallit
284 | kam	Kamba
285 | kan	Kannada
286 | kao	Kaonde
287 | kar	Karen
288 | kas	Kashmiri
289 | kau	Kanuri
290 | kav	Kavango
291 | kaz	Kazakhs
292 | kbd	Kabarday
293 | kby	Kabye
294 | kch	Karachays
295 | kha	Khasi
296 | khi	Khoikhoi
297 | khk	Khakas
298 | khm	Khmer
299 | khu	Khmu
300 | kik	Kikuyu
301 | kin	Kinyarwanda Speakers
302 | kir	Kyrgyz
303 | kis	Kisii
304 | klm	Kalmyk
305 | kmb	North Mbundu
306 | kno	Kono
307 | knr	Kanuri
308 | kok	Kokani
309 | kom	Komi
310 | kon	Kongo
311 | kor	Korean
312 | kos	Kosraean
313 | kou	Kouyou
314 | kpe	Kpelle
315 | krh	Krahn
316 | krl	Karelians
317 | krm	Karamojong
318 | kro	Kru
319 | kru	Kurukh
320 | kua	Kwanyama
321 | kum	Kumyks
322 | kur	Kurd
323 | KUR	Kurd
324 | kut	Ktunaxa
325 | lad	Sephardic Jew
326 | lak	Lak (Russia)
327 | lam	Lamba
328 | lao	Lao
329 | lar	Lari
330 | lav	Latvian
331 | lba	Limba
332 | lds	Latter Day Saints
333 | len	Lenca
334 | lez	Lezgian
335 | lgb	Lugbara
336 | lhu	Lahu
337 | lii	Li
338 | lim	Limburgian
339 | lin	Lingala
340 | lit	Lithuanian
341 | lol	Mongo
342 | lom	Lomwe
343 | lov	Lovale
344 | loz	Lozi
345 | lsu	Lisu
346 | ltk	Latoka
347 | ltn	Latinos
348 | ltz	Luxembourgers
349 | lua	Luba-Kasai
350 | lub	Luba-Katanga
351 | lug	Baganda
352 | luh	Luhya
353 | lui	Luiseno
354 | lul	Lulua
355 | lun	Lunda
356 | luo	Luo
357 | lus	Lusei
358 | mac	Macedonian
359 | mad	Madurese
360 | maf	Mafwe
361 | mag	Magahi
362 | mah	Marshallese
363 | mai	Maithili
364 | mak	Makassarese
365 | mal	Malayalam
366 | man	Mandinka
367 | mao	Maori
368 | mar	Marathi
369 | mas	Maasai
370 | may	Malays
371 | mba	Mbandja
372 | mbe	Mbere
373 | mbk	M'Baka
374 | mbo	Mbochi
375 | mbu	Mbundu-Mestico
376 | mdf	Mokshas
377 | mdh	Madhesi
378 | mdi	Madi
379 | mdr	Mandar
380 | men	Mende
381 | mia	Miao
382 | mic	Mi'kmaq
383 | mij	Mijikenda
384 | min	Minangkabau
385 | miz	Mizo
386 | mla	Mulatto
387 | mld	Mole-Dagbani
388 | mlg	Malagasy
389 | mlo	Mulao
390 | mlt	Maltese
391 | mnc	Manchu
392 | mnd	Mande
393 | mng	Mananja-Nayanja
394 | mnh	Minahasa
395 | mni	Manipuri
396 | mnj	Manjack
397 | mnn	Mano
398 | mno	Lumad
399 | mns	Mon
400 | mny	Manyika
401 | moh	Mohajirs
402 | moh	Mohawk
403 | mok	Makonde
404 | mon	Maonan
405 | mon	Mongol
406 | mos	Mossi
407 | mri	Mari
408 | mrn	Maronites
409 | mro	Moro
410 | msk	Miskito
411 | msl	Muslim
412 | mtn	Montenegrins
413 | mtz	Mestizo
414 | mun	Munda
415 | muo	Muong
416 | mus	Muscogee
417 | mwl	Mirandese
418 | mwr	Marwaris
419 | mya	Mayangnas
420 | mye	Myene
421 | myn	Maya
422 | myv	Mordvins
423 | nag	Naga
424 | nah	Nahua
425 | nai	Native American
426 | nam	Nama
427 | nap	Neapolitan
428 | nau	Nauruan
429 | nav	Navajo
430 | nax	Nakhi
431 | nba	Nuba
432 | nbl	South Ndebele
433 | nca	Nicaraguan
434 | nde	Northern Ndebele
435 | ndo	Ndonga
436 | nep	Nepali
437 | ner	Nuer
438 | new	Newars
439 | ngn	Ngbandi
440 | ngo	Ngoni
441 | nia	Niasans
442 | nib	Nibolek
443 | nir	Niari
444 | niu	Niuean
445 | nkm	Nkomi
446 | nng	Nung
447 | nog	Nogais
448 | nor	Norwegians
449 | nso	Northern Sotho
450 | nub	Nubian
451 | nur	Nuristani
452 | nuu	Nu
453 | nya	Chewa
454 | nyk	Nyakyusa
455 | nym	Nyamwezi
456 | nyn	Ankole
457 | nyo	Nyoro
458 | nze	New Zealanders
459 | nzi	Nzema
460 | oci	Occitanians
461 | ogo	Ogoni
462 | oji	Ojibwe
463 | ojw	Orthodox/Ultra-Orthodox Jew
464 | oki	Okinawan
465 | ori	Oriya
466 | orm	Oromo
467 | oru	Orgunu
468 | osa	Osage
469 | oss	Ossetians
470 | oto	Otomi
471 | ova	Ovambo
472 | paa	Papuan
473 | pac	Pacific Islanders
474 | pag	Pangasinan
475 | pal	Palestinian
476 | PAL	Palestinian
477 | pam	Kapampangan
478 | pan	Punjabi
479 | pap	Papiamento-Creole
480 | par	Paraguayan
481 | pau	Palauan
482 | per	Persian
483 | pgn	Animist/Pagan
484 | phu	Puthai
485 | pnm	Panamanians
486 | pol	Poles
487 | pom	Pomaks
488 | pon	Pehnpeian
489 | por	Portuguese
490 | ppl	Papel
491 | pro	Protestant
492 | pru	Peruvian
493 | psh	Pashayi
494 | pum	Pumi
495 | pus	Pashtun
496 | qia	Qiang
497 | qiz	Qizilbash
498 | que	Quechua
499 | raj	Rajasthani
500 | ran	Pahari Rajput
501 | rap	Rapa Nui
502 | rar	Cook Islands Maori
503 | rel	Unspecified Religion
504 | roh	Romansh
505 | rom	Romani
506 | rum	Romanian
507 | run	Rundi
508 | rup	Aromanians
509 | rus	Russian
510 | sad	Sandawe
511 | sag	Sango
512 | sah	Yakuts
513 | sal	Salish
514 | sar	Sara
515 | sas	Sasak
516 | sat	Sudanese
517 | scn	Sicilian
518 | sco	Scottish
519 | sel	Selkup
520 | sen	Sena
521 | sfi	Sufi
522 | sha	Shafi'i
523 | she	She
524 | shi	Shi'ites
525 | shl	Shilluk
526 | shn	Shan
527 | shy	Shaigiya
528 | sid	Sidama
529 | sin	Sinhalese
530 | sio	Siouan
531 | sla	Slavic
532 | slo	Slovaks
533 | slr	Salar
534 | slv	Slovenes
535 | smi	Sami
536 | smo	Samoans
537 | sna	Shona
538 | snd	Sindhi
539 | snk	Soninke
540 | som	Somali
541 | son	Songhai
542 | sot	Sotho
543 | spa	Spanish
544 | srd	Sardinian
545 | srn	Sranan Tongo
546 | srp	Serbs
547 | srr	Serer
548 | srr	Serer
549 | ssw	Swazi
550 | sui	Sui
551 | suk	Sukama
552 | sun	Sunni
553 | sus	Susu
554 | swa	Swahili
555 | swe	Swedes
556 | swf	Swiss French
557 | swt	Swiss Italian
558 | tab	Tabasaran
559 | tah	Tahitian
560 | tai	Tai
561 | tam	Tamil
562 | tao	Taoist
563 | tat	Tatars
564 | taw	Tawahka
565 | tay	Tay
566 | tel	Telugu
567 | tem	Temne
568 | ter	Terenan
569 | tes	Teso
570 | tet	Tetum
571 | tgk	Tajik
572 | tgl	Tagalog
573 | tha	Thai
574 | tib	Tibetan
575 | tig	Tigre
576 | tir	Tigray-Tigrinya
577 | tiv	Tiv
578 | tkl	Tokelauan
579 | tli	Tlingit
580 | tmh	Tuareg
581 | tms	Tama
582 | tog	Tonga (Africa)
583 | ton	Tonga (Pacific)
584 | tor	Tooro
585 | tou	Toubou
586 | tpi	Tok Pisin
587 | tra	Transnistrians
588 | tri	Tripuri
589 | trn	Ternate
590 | tsi	Tsimshian
591 | tsn	Tswana
592 | tso	Tsonga
593 | tts	Tutsi
594 | tuj	Tujia
595 | tuk	Turkmen
596 | tum	Tumbuka
597 | tup	Tupi
598 | tur	Turkish
599 | tuu	Mongour
600 | tvl	Tuvaluans
601 | twi	Ashanti
602 | twn	Taiwanese
603 | tyv	Tuvans
604 | udm	Udmurt
605 | uig	Uyghur
606 | ukr	Ukranian
607 | umb	Southern Mbundu
608 | und	Undetermined
609 | urd	Urdu
610 | uzb	Uzbeks
611 | vaa	Va
612 | vai	Vai
613 | ven	Venda
614 | vie	Vietnamese
615 | vil	Vili
616 | vnz	Venezuelan
617 | vot	Votes
618 | wak	Wakashan
619 | wal	Welayta
620 | war	Waray
621 | was	Washoe
622 | wel	Welsch
623 | wel	Welsh
624 | wen	Sorbs
625 | whi	Whites
626 | wln	Walloons
627 | wol	Wolof
628 | xal	Kalmyk
629 | xho	Xhosa
630 | xib	Xibe
631 | xnc	Xinca
632 | yao	Yao
633 | yap	Yapese
634 | yor	Yoruba
635 | ypk	Yupik
636 | yug	Yugur
637 | zag	Zaghawa
638 | zap	Zapotec
639 | zay	Zaidiyya
640 | zen	Zenaga
641 | zha	Zhuang
642 | znd	Azande
643 | zom	Zomi
644 | zor	Zoroastrians
645 | zul	Zulu
646 | zun	Zuni
647 | zza	Zaza


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 | 
  4 |   <modelVersion>4.0.0</modelVersion>
  5 |   <name>gdelt-spark</name>
  6 |   <groupId>com.aamend.spark</groupId>
  7 |   <artifactId>spark-gdelt</artifactId>
  8 |   <version>3.1-SNAPSHOT</version>
  9 |   <url>https://github.com/aamend/spark-gdelt</url>
 10 |   <description>Working with GDELT from Spark
 11 |   environment</description>
 12 |   <inceptionYear>2015</inceptionYear>
 13 |   <developers>
 14 |     <developer>
 15 |       <id>aamend</id>
 16 |       <name>Antoine Amend</name>
 17 |       <email>antoine.amend@gmail.com</email>
 18 |       <organization>aamend.com</organization>
 19 |       <timezone>0</timezone>
 20 |       <roles>
 21 |         <role>big data scientist</role>
 22 |       </roles>
 23 |     </developer>
 24 |     <developer>
 25 |       <id>lamastex</id>
 26 |       <name>Raazesh Sainudiin</name>
 27 |       <email>raazesh.sainudiin@gmail.com</email>
 28 |       <organization>lamastex.org</organization>
 29 |       <timezone>0</timezone>
 30 |       <roles>
 31 |         <role>Associate Professor of Mathematics with Specialisation in Data Science and Senior Researcher in Data Science</role>
 32 |       </roles>
 33 |     </developer>
 34 |     <developer>
 35 |       <id>JohannesGraner</id>
 36 |       <name>Johannes Graner</name>
 37 |       <email>johannes.graner@hotmail.com</email>
 38 |       <organization>Combient Competence Centre for Data Engineering Sciences, Department of Mathematics, Uppsala University</organization>
 39 |       <timezone>0</timezone>
 40 |       <roles>
 41 |         <role>Summer Internship in Data Science</role>
 42 |       </roles>
 43 |     </developer>
 44 |     <developer>
 45 |       <id>AlbertNilsson</id>
 46 |       <name>Albert Nilsson</name>
 47 |       <email>albertnilsson1998@gmail.com</email>
 48 |       <organization>Combient Competence Centre for Data Engineering Sciences, Department of Mathematics, Uppsala University</organization>
 49 |       <timezone>0</timezone>
 50 |       <roles>
 51 |         <role>Summer Internship in Data Science</role>
 52 |       </roles>
 53 |     </developer>
 54 |   </developers>
 55 |   <organization>
 56 |     <name>aamend.com</name>
 57 |   </organization>
 58 |   <licenses>
 59 |     <license>
 60 |       <name>Apache License, version 2.0</name>
 61 |       <url>http://www.apache.org/licenses/LICENSE-2.0</url>
 62 |     </license>
 63 |   </licenses>
 64 |   <scm>
 65 |     <url>https://github.com/aamend/spark-gdelt</url>
 66 |     <connection>
 67 |     scm:git:git@github.com:aamend/spark-gdelt.git</connection>
 68 |     <tag>gdelt</tag>
 69 |   </scm>
 70 |   <distributionManagement>
 71 |     <snapshotRepository>
 72 |       <id>ossrh</id>
 73 |       <url>
 74 |       https://oss.sonatype.org/content/repositories/snapshots</url>
 75 |     </snapshotRepository>
 76 |     <repository>
 77 |       <id>ossrh</id>
 78 |       <url>
 79 |       https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
 80 |     </repository>
 81 |   </distributionManagement>
 82 |   <properties>
 83 |     <project.build.sourceEncoding>
 84 |     UTF-8</project.build.sourceEncoding>
 85 |     <scala.version>2.12.8</scala.version>
 86 |     <scala.binary.version>2.12</scala.binary.version>
 87 |     <spark.version>3.0.0</spark.version>
 88 |     <java.version>1.8</java.version>
 89 |   </properties>
 90 |   <dependencies>
 91 |     <!--SCALA DEPENDENCIES (provided)-->
 92 |     <dependency>
 93 |       <groupId>org.scala-lang</groupId>
 94 |       <artifactId>scala-library</artifactId>
 95 |       <version>${scala.version}</version>
 96 |       <scope>provided</scope>
 97 |     </dependency>
 98 |     <dependency>
 99 |       <groupId>org.scala-lang</groupId>
100 |       <artifactId>scala-reflect</artifactId>
101 |       <version>${scala.version}</version>
102 |       <scope>provided</scope>
103 |     </dependency>
104 |     <!--SPARK DEPENDENCIES (provided)-->
105 |     <dependency>
106 |       <groupId>org.apache.spark</groupId>
107 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
108 |       <version>${spark.version}</version>
109 |       <scope>provided</scope>
110 |     </dependency>
111 |     <dependency>
112 |       <groupId>org.apache.spark</groupId>
113 |       <artifactId>spark-sql_${scala.binary.version}</artifactId>
114 |       <version>${spark.version}</version>
115 |       <scope>provided</scope>
116 |     </dependency>
117 |     <dependency>
118 |       <groupId>org.apache.spark</groupId>
119 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
120 |       <version>${spark.version}</version>
121 |       <scope>provided</scope>
122 |     </dependency>
123 |     <!--UTIL DEPENDENCIES-->
124 |     <dependency>
125 |       <groupId>joda-time</groupId>
126 |       <artifactId>joda-time</artifactId>
127 |       <version>2.9.9</version>
128 |     </dependency>
129 |     <dependency>
130 |       <groupId>commons-lang</groupId>
131 |       <artifactId>commons-lang</artifactId>
132 |       <version>2.6</version>
133 |     </dependency>
134 |     <dependency>
135 |       <groupId>com.typesafe.scala-logging</groupId>
136 |       <artifactId>
137 |       scala-logging_${scala.binary.version}</artifactId>
138 |       <version>3.7.1</version>
139 |     </dependency>
140 |     <!--GOOSE DEPENDENCIES-->
141 |     <dependency>
142 |       <groupId>org.jsoup</groupId>
143 |       <artifactId>jsoup</artifactId>
144 |       <version>1.14.2</version>
145 |     </dependency>
146 |     <dependency>
147 |       <groupId>org.apache.httpcomponents</groupId>
148 |       <artifactId>httpclient</artifactId>
149 |       <version>4.5.13</version>
150 |     </dependency>
151 |     <dependency>
152 |       <groupId>commons-io</groupId>
153 |       <artifactId>commons-io</artifactId>
154 |       <version>2.7</version>
155 |     </dependency>
156 |     <!--TEST DEPENDENCIES (test)-->
157 |     <dependency>
158 |       <groupId>org.scalatest</groupId>
159 |       <artifactId>scalatest_${scala.binary.version}</artifactId>
160 |       <version>3.1.1</version>
161 |       <scope>test</scope>
162 |     </dependency>
163 |     <dependency>
164 |       <groupId>junit</groupId>
165 |       <artifactId>junit</artifactId>
166 |       <version>4.13.1</version>
167 |       <scope>test</scope>
168 |     </dependency>
169 |   </dependencies>
170 |   <build>
171 |     <plugins>
172 |       <plugin>
173 |         <groupId>org.apache.maven.plugins</groupId>
174 |         <artifactId>maven-compiler-plugin</artifactId>
175 |         <version>2.5.1</version>
176 |         <configuration>
177 |           <source>${java.version}</source>
178 |           <target>${java.version}</target>
179 |         </configuration>
180 |       </plugin>
181 |       <plugin>
182 |         <groupId>org.apache.maven.plugins</groupId>
183 |         <artifactId>maven-surefire-plugin</artifactId>
184 |         <version>2.7</version>
185 |         <configuration>
186 |           <skipTests>true</skipTests>
187 |         </configuration>
188 |       </plugin>
189 |       <plugin>
190 |         <groupId>org.sonatype.plugins</groupId>
191 |         <artifactId>nexus-staging-maven-plugin</artifactId>
192 |         <version>1.6.7</version>
193 |         <extensions>true</extensions>
194 |         <configuration>
195 |           <serverId>ossrh</serverId>
196 |           <nexusUrl>https://oss.sonatype.org/</nexusUrl>
197 |           <autoReleaseAfterClose>true</autoReleaseAfterClose>
198 |         </configuration>
199 |       </plugin>
200 |       <plugin>
201 |         <groupId>org.apache.maven.plugins</groupId>
202 |         <artifactId>maven-source-plugin</artifactId>
203 |         <version>2.2.1</version>
204 |         <executions>
205 |           <execution>
206 |             <id>attach-sources</id>
207 |             <goals>
208 |               <goal>jar-no-fork</goal>
209 |             </goals>
210 |           </execution>
211 |         </executions>
212 |       </plugin>
213 |       <plugin>
214 |         <groupId>net.alchim31.maven</groupId>
215 |         <artifactId>scala-maven-plugin</artifactId>
216 |         <version>4.3.1</version>
217 |         <executions>
218 |           <execution>
219 |             <goals>
220 |               <goal>compile</goal>
221 |               <goal>testCompile</goal>
222 |             </goals>
223 |           </execution>
224 |           <execution>
225 |             <id>attach-javadocs</id>
226 |             <goals>
227 |               <goal>doc-jar</goal>
228 |             </goals>
229 |           </execution>
230 |         </executions>
231 |       </plugin>
232 |       <plugin>
233 |         <groupId>org.scalatest</groupId>
234 |         <artifactId>scalatest-maven-plugin</artifactId>
235 |         <version>1.0</version>
236 |         <configuration>
237 |           <reportsDirectory>
238 |           ${project.build.directory}/surefire-reports</reportsDirectory>
239 |           <junitxml>.</junitxml>
240 |           <forkMode>once</forkMode>
241 |           <filereports>WDF TestSuite.txt</filereports>
242 |         </configuration>
243 |         <executions>
244 |           <execution>
245 |             <id>test</id>
246 |             <goals>
247 |               <goal>test</goal>
248 |             </goals>
249 |           </execution>
250 |         </executions>
251 |       </plugin>
252 |       <!-- adding shaded plugin to be able to build Uber jars for dev/tests -->
253 |       <plugin>
254 |         <groupId>org.apache.maven.plugins</groupId>
255 |         <artifactId>maven-shade-plugin</artifactId>
256 |         <version>3.2.2</version>
257 |         <executions>
258 |           <execution>
259 |             <phase>package</phase>
260 |             <goals>
261 |               <goal>shade</goal>
262 |             </goals>
263 |             <configuration>
264 |               <artifactSet>
265 |                 <excludes>
266 |                   <exclude>org.slf4j:slf4j-api</exclude>
267 |                   <exclude>
268 |                   org.apache.httpcomponents:httpclient</exclude>
269 |                   <exclude>
270 |                   org.apache.httpcomponents:httpcore</exclude>
271 |                   <exclude>
272 |                   commons-logging:commons-logging</exclude>
273 |                   <exclude>commons-codec:commons-codec</exclude>
274 |                   <exclude>commons-io:commons-io</exclude>
275 |                   <exclude />
276 |                   <exclude>joda-time:joda-time</exclude>
277 |                   <exclude>commons-lang:commons-lang</exclude>
278 |                   <exclude>
279 |                   com.typesafe.scala-logging:scala-logging_2.12</exclude>
280 |                 </excludes>
281 |               </artifactSet>
282 |               <transformers>
283 |                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
284 | 
285 |                   <mainClass>
286 |                     <!-- YOUR_APPLICATION_MAIN_CLASS -->
287 |                   </mainClass>
288 |                 </transformer>
289 |               </transformers>
290 |               <filters>
291 |                 <filter>
292 |                   <artifact>*:*</artifact>
293 |                   <excludes>
294 |                     <exclude>META-INF/maven/**</exclude>
295 |                     <exclude>META-INF/*.SF</exclude>
296 |                     <exclude>META-INF/*.DSA</exclude>
297 |                     <exclude>META-INF/*.RSA</exclude>
298 |                   </excludes>
299 |                 </filter>
300 |               </filters>
301 |               <relocations>
302 |                 <relocation>
303 |                   <pattern>org</pattern>
304 |                   <shadedPattern>repackaged.org</shadedPattern>
305 |                   <includes>
306 |                     <include>org.jsoup.**</include>
307 |                   </includes>
308 |                 </relocation>
309 |               </relocations>
310 |             </configuration>
311 |           </execution>
312 |         </executions>
313 |       </plugin>
314 |       <plugin>
315 |         <groupId>org.apache.maven.plugins</groupId>
316 |         <artifactId>maven-release-plugin</artifactId>
317 |         <version>2.5.3</version>
318 |         <configuration>
319 |           <autoVersionSubmodules>true</autoVersionSubmodules>
320 |           <useReleaseProfile>false</useReleaseProfile>
321 |           <releaseProfiles>release</releaseProfiles>
322 |           <goals>deploy</goals>
323 |         </configuration>
324 |         <dependencies>
325 |           <dependency>
326 |             <groupId>org.apache.maven.scm</groupId>
327 |             <artifactId>maven-scm-provider-gitexe</artifactId>
328 |             <version>1.8.1</version>
329 |           </dependency>
330 |         </dependencies>
331 |       </plugin>
332 |       <plugin>
333 |         <groupId>org.apache.maven.plugins</groupId>
334 |         <artifactId>maven-gpg-plugin</artifactId>
335 |         <version>1.5</version>
336 |         <executions>
337 |           <execution>
338 |             <id>sign-artifacts</id>
339 |             <phase>verify</phase>
340 |             <goals>
341 |               <goal>sign</goal>
342 |             </goals>
343 |           </execution>
344 |         </executions>
345 |       </plugin>
346 |     </plugins>
347 |   </build>
348 | </project>
349 | 


--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoEvent.txt:
--------------------------------------------------------------------------------
  1 | CAMEOEVENTCODE	EVENTDESCRIPTION
  2 | 01	MAKE PUBLIC STATEMENT
  3 | 010	Make statement, not specified below
  4 | 011	Decline comment
  5 | 012	Make pessimistic comment
  6 | 013	Make optimistic comment
  7 | 014	Consider policy option
  8 | 015	Acknowledge or claim responsibility
  9 | 016	Deny responsibility
 10 | 017	Engage in symbolic act
 11 | 018	Make empathetic comment
 12 | 019	Express accord
 13 | 02	APPEAL
 14 | 020	Appeal, not specified below
 15 | 021	Appeal for material cooperation, not specified below
 16 | 0211	Appeal for economic cooperation
 17 | 0212	Appeal for military cooperation
 18 | 0213	Appeal for judicial cooperation
 19 | 0214	Appeal for intelligence
 20 | 022	Appeal for diplomatic cooperation, such as policy support
 21 | 023	Appeal for aid, not specified below
 22 | 0231	Appeal for economic aid
 23 | 0232	Appeal for military aid
 24 | 0233	Appeal for humanitarian aid
 25 | 0234	Appeal for military protection or peacekeeping
 26 | 024	Appeal for political reform, not specified below
 27 | 0241	Appeal for change in leadership
 28 | 0242	Appeal for policy change
 29 | 0243	Appeal for rights
 30 | 0244	Appeal for change in institutions, regime
 31 | 025	Appeal to yield
 32 | 0251	Appeal for easing of administrative sanctions
 33 | 0252	Appeal for easing of popular dissent
 34 | 0253	Appeal for release of persons or property
 35 | 0254	Appeal for easing of economic sanctions, boycott, or embargo
 36 | 0255	Appeal for target to allow international involvement (non-mediation)
 37 | 0256	Appeal for de-escalation of military engagement
 38 | 026	Appeal to others to meet or negotiate
 39 | 027	Appeal to others to settle dispute
 40 | 028	Appeal to others to engage in or accept mediation
 41 | 03	EXPRESS INTENT TO COOPERATE
 42 | 030	Express intent to cooperate, not specified below
 43 | 031	Express intent to engage in material cooperation, not specified below
 44 | 0311	Express intent to cooperate economically
 45 | 0312	Express intent to cooperate militarily
 46 | 0313	Express intent to cooperate on judicial matters
 47 | 0314	Express intent to cooperate on intelligence
 48 | 032	Express intent to provide diplomatic cooperation such as policy support
 49 | 033	Express intent to provide material aid, not specified below
 50 | 0331	Express intent to provide economic aid
 51 | 0332	Express intent to provide military aid
 52 | 0333	Express intent to provide humanitarian aid
 53 | 0334	Express intent to provide military protection or peacekeeping
 54 | 034	Express intent to institute political reform, not specified below
 55 | 0341	Express intent to change leadership
 56 | 0342	Express intent to change policy
 57 | 0343	Express intent to provide rights
 58 | 0344	Express intent to change institutions, regime
 59 | 035	Express intent to yield, not specified below
 60 | 0351	Express intent to ease administrative sanctions
 61 | 0352	Express intent to ease popular dissent
 62 | 0353	Express intent to release persons or property
 63 | 0354	Express intent to ease economic sanctions, boycott, or embargo
 64 | 0355	Express intent to allow international involvement (not mediation)
 65 | 0356	Express intent to de-escalate military engagement
 66 | 036	Express intent to meet or negotiate
 67 | 037	Express intent to settle dispute
 68 | 038	Express intent to accept mediation
 69 | 039	Express intent to mediate
 70 | 04	CONSULT
 71 | 040	Consult, not specified below
 72 | 041	Discuss by telephone
 73 | 042	Make a visit
 74 | 043	Host a visit
 75 | 044	Meet at a "third" location
 76 | 045	Mediate
 77 | 046	Engage in negotiation
 78 | 05	ENGAGE IN DIPLOMATIC COOPERATION
 79 | 050	Engage in diplomatic cooperation, not specified below
 80 | 051	Praise or endorse
 81 | 052	Defend verbally
 82 | 053	Rally support on behalf of
 83 | 054	Grant diplomatic recognition
 84 | 055	Apologize
 85 | 056	Forgive
 86 | 057	Sign formal agreement
 87 | 06	ENGAGE IN MATERIAL COOPERATION
 88 | 060	Engage in material cooperation, not specified below
 89 | 061	Cooperate economically
 90 | 062	Cooperate militarily
 91 | 063	Engage in judicial cooperation
 92 | 064	Share intelligence or information
 93 | 07	PROVIDE AID
 94 | 070	Provide aid, not specified below
 95 | 071	Provide economic aid
 96 | 072	Provide military aid
 97 | 073	Provide humanitarian aid
 98 | 074	Provide military protection or peacekeeping
 99 | 075	Grant asylum
100 | 08	YIELD
101 | 080	Yield, not specified below
102 | 081	Ease administrative sanctions, not specified below
103 | 0811	Ease restrictions on political freedoms
104 | 0812	Ease ban on political parties or politicians
105 | 0813	Ease curfew
106 | 0814	Ease state of emergency or martial law
107 | 082	Ease political dissent
108 | 083	Accede to requests or demands for political reform not specified below
109 | 0831	Accede to demands for change in leadership
110 | 0832	Accede to demands for change in policy
111 | 0833	Accede to demands for rights
112 | 0834	Accede to demands for change in institutions, regime
113 | 084	Return, release, not specified below
114 | 0841	Return, release person(s)
115 | 0842	Return, release property
116 | 085	Ease economic sanctions, boycott, embargo
117 | 086	Allow international involvement not specified below
118 | 0861	Receive deployment of peacekeepers
119 | 0862	Receive inspectors
120 | 0863	Allow delivery of humanitarian aid
121 | 087	De-escalate military engagement
122 | 0871	Declare truce, ceasefire
123 | 0872	Ease military blockade
124 | 0873	Demobilize armed forces
125 | 0874	Retreat or surrender militarily
126 | 09	INVESTIGATE
127 | 090	Investigate, not specified below
128 | 091	Investigate crime, corruption
129 | 092	Investigate human rights abuses
130 | 093	Investigate military action
131 | 094	Investigate war crimes
132 | 10	DEMAND
133 | 100	Demand, not specified below
134 | 101	Demand information, investigation
135 | 1011	Demand economic cooperation
136 | 1012	Demand military cooperation
137 | 1013	Demand judicial cooperation
138 | 1014	Demand intelligence cooperation
139 | 102	Demand policy support
140 | 103	Demand aid, protection, or peacekeeping
141 | 1031	Demand economic aid
142 | 1032	Demand military aid
143 | 1033	Demand humanitarian aid
144 | 1034	Demand military protection or peacekeeping
145 | 104	Demand political reform, not specified below
146 | 1041	Demand change in leadership
147 | 1042	Demand policy change
148 | 1043	Demand rights
149 | 1044	Demand change in institutions, regime
150 | 105	Demand mediation
151 | 1051	Demand easing of administrative sanctions
152 | 1052	Demand easing of political dissent
153 | 1053	Demand release of persons or property
154 | 1054	Demand easing of economic sanctions, boycott, or embargo
155 | 1055	Demand that target allows international involvement (non-mediation)
156 | 1056	Demand de-escalation of military engagement
156 | 106	Demand withdrawal
157 | 107	Demand ceasefire
158 | 108	Demand meeting, negotiation
159 | 11	DISAPPROVE
160 | 110	Disapprove, not specified below
161 | 111	Criticize or denounce
162 | 112	Accuse, not specified below
163 | 1121	Accuse of crime, corruption
164 | 1122	Accuse of human rights abuses
165 | 1123	Accuse of aggression
166 | 1124	Accuse of war crimes
167 | 1125	Accuse of espionage, treason
168 | 113	Rally opposition against
169 | 114	Complain officially
170 | 115	Bring lawsuit against
171 | 116	Find guilty or liable (legally)
172 | 12	REJECT
173 | 120	Reject, not specified below
174 | 121	Reject material cooperation
175 | 1211	Reject economic cooperation
176 | 1212	Reject military cooperation
177 | 122	Reject request or demand for material aid, not specified below
178 | 1221	Reject request for economic aid
179 | 1222	Reject request for military aid
180 | 1223	Reject request for humanitarian aid
181 | 1224	Reject request for military protection or peacekeeping
182 | 123	Reject request or demand for political reform, not specified below
183 | 1231	Reject request for change in leadership
184 | 1232	Reject request for policy change
185 | 1233	Reject request for rights
186 | 1234	Reject request for change in institutions, regime
187 | 124	Refuse to yield, not specified below
188 | 1241	Refuse to ease administrative sanctions
189 | 1242	Refuse to ease popular dissent
190 | 1243	Refuse to release persons or property
191 | 1244	Refuse to ease economic sanctions, boycott, or embargo
192 | 1245	Refuse to allow international involvement (non mediation)
193 | 1246	Refuse to de-escalate military engagement
194 | 125	Reject proposal to meet, discuss, or negotiate
195 | 126	Reject mediation
196 | 127	Reject plan, agreement to settle dispute
197 | 128	Defy norms, law
198 | 129	Veto
199 | 13	THREATEN
200 | 130	Threaten, not specified below
201 | 131	Threaten non-force, not specified below
202 | 1311	Threaten to reduce or stop aid
203 | 1312	Threaten to boycott, embargo, or sanction
204 | 1313	Threaten to reduce or break relations
205 | 132	Threaten with administrative sanctions, not specified below
206 | 1321	Threaten to impose restrictions on political freedoms
207 | 1322	Threaten to ban political parties or politicians
208 | 1323	Threaten to impose curfew
209 | 1324	Threaten to impose state of emergency or martial law
210 | 133	Threaten political dissent, protest
211 | 134	Threaten to halt negotiations
212 | 135	Threaten to halt mediation
213 | 136	Threaten to halt international involvement (non-mediation)
214 | 137	Threaten with violent repression
215 | 138	Threaten to use military force, not specified below
216 | 1381	Threaten blockade
217 | 1382	Threaten occupation
218 | 1383	Threaten unconventional violence
219 | 1384	Threaten conventional attack
220 | 1385	Threaten attack with WMD
221 | 139	Give ultimatum
222 | 14	PROTEST
223 | 140	Engage in political dissent, not specified below
224 | 141	Demonstrate or rally
225 | 1411	Demonstrate for leadership change
226 | 1412	Demonstrate for policy change
227 | 1413	Demonstrate for rights
228 | 1414	Demonstrate for change in institutions, regime
229 | 142	Conduct hunger strike, not specified below
230 | 1421	Conduct hunger strike for leadership change
231 | 1422	Conduct hunger strike for policy change
232 | 1423	Conduct hunger strike for rights
233 | 1424	Conduct hunger strike for change in institutions, regime
234 | 143	Conduct strike or boycott, not specified below
235 | 1431	Conduct strike or boycott for leadership change
236 | 1432	Conduct strike or boycott for policy change
237 | 1433	Conduct strike or boycott for rights
238 | 1434	Conduct strike or boycott for change in institutions, regime
239 | 144	Obstruct passage, block
240 | 1441	Obstruct passage to demand leadership change
241 | 1442	Obstruct passage to demand policy change
242 | 1443	Obstruct passage to demand rights
243 | 1444	Obstruct passage to demand change in institutions, regime
244 | 145	Protest violently, riot
245 | 1451	Engage in violent protest for leadership change
246 | 1452	Engage in violent protest for policy change
247 | 1453	Engage in violent protest for rights
248 | 1454	Engage in violent protest for change in institutions, regime
249 | 15	EXHIBIT FORCE POSTURE
250 | 150	Demonstrate military or police power, not specified below
251 | 151	Increase police alert status
252 | 152	Increase military alert status
253 | 153	Mobilize or increase police power
254 | 154	Mobilize or increase armed forces
255 | 16	REDUCE RELATIONS
256 | 160	Reduce relations, not specified below
257 | 161	Reduce or break diplomatic relations
258 | 162	Reduce or stop aid, not specified below
259 | 1621	Reduce or stop economic assistance
260 | 1622	Reduce or stop military assistance
261 | 1623	Reduce or stop humanitarian assistance
262 | 163	Impose embargo, boycott, or sanctions
263 | 164	Halt negotiations
264 | 165	Halt mediation
265 | 166	Expel or withdraw, not specified below
266 | 1661	Expel or withdraw peacekeepers
267 | 1662	Expel or withdraw inspectors, observers
268 | 1663	Expel or withdraw aid agencies
269 | 17	COERCE
270 | 170	Coerce, not specified below
271 | 171	Seize or damage property, not specified below
272 | 1711	Confiscate property
273 | 1712	Destroy property
274 | 172	Impose administrative sanctions, not specified below
275 | 1721	Impose restrictions on political freedoms
276 | 1722	Ban political parties or politicians
277 | 1723	Impose curfew
278 | 1724	Impose state of emergency or martial law
279 | 173	Arrest, detain, or charge with legal action
280 | 174	Expel or deport individuals
281 | 175	Use tactics of violent repression
282 | 18	ASSAULT
283 | 180	Use unconventional violence, not specified below
284 | 181	Abduct, hijack, or take hostage
285 | 182	Physically assault, not specified below
286 | 1821	Sexually assault
287 | 1822	Torture
288 | 1823	Kill by physical assault
289 | 183	Conduct suicide, car, or other non-military bombing, not spec below
290 | 1831	Carry out suicide bombing
291 | 1832	Carry out car bombing
292 | 1833	Carry out roadside bombing
293 | 184	Use as human shield
294 | 185	Attempt to assassinate
295 | 186	Assassinate
296 | 19	FIGHT
297 | 190	Use conventional military force, not specified below
298 | 191	Impose blockade, restrict movement
299 | 192	Occupy territory
300 | 193	Fight with small arms and light weapons
301 | 194	Fight with artillery and tanks
302 | 195	Employ aerial weapons
303 | 196	Violate ceasefire
304 | 20	USE UNCONVENTIONAL MASS VIOLENCE
305 | 200	Use unconventional mass violence, not specified below
306 | 201	Engage in mass expulsion
307 | 202	Engage in mass killings
308 | 203	Engage in ethnic cleansing
309 | 204	Use weapons of mass destruction, not specified below
310 | 2041	Use chemical, biological, or radiological weapons
311 | 2042	Detonate nuclear weapons


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/HtmlFetcher.scala:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Licensed to Gravity.com under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Gravity.com licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package com.gravity.goose.network
 20 | 
 21 | import org.apache.http.HttpEntity
 22 | import org.apache.http.HttpResponse
 23 | import org.apache.http.HttpVersion
 24 | import org.apache.http.client.CookieStore
 25 | import org.apache.http.client.HttpClient
 26 | import org.apache.http.client.methods.HttpGet
 27 | import org.apache.http.client.params.CookiePolicy
 28 | import org.apache.http.client.protocol.ClientContext
 29 | import org.apache.http.conn.scheme.PlainSocketFactory
 30 | import org.apache.http.conn.ssl.SSLSocketFactory
 31 | import org.apache.http.conn.scheme.Scheme
 32 | import org.apache.http.conn.scheme.SchemeRegistry
 33 | import org.apache.http.cookie.Cookie
 34 | import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager
 35 | import org.apache.http.params.BasicHttpParams
 36 | import org.apache.http.params.HttpConnectionParams
 37 | import org.apache.http.params.HttpParams
 38 | import org.apache.http.params.HttpProtocolParams
 39 | import org.apache.http.protocol.BasicHttpContext
 40 | import org.apache.http.protocol.HttpContext
 41 | import org.apache.http.util.EntityUtils
 42 | import java.io._
 43 | import java.net.SocketException
 44 | import java.net.SocketTimeoutException
 45 | import java.net.URLConnection
 46 | import java.util.ArrayList
 47 | import java.util.Date
 48 | import java.util.List
 49 | import com.gravity.goose.utils.Logging
 50 | import com.gravity.goose.Configuration
 51 | import org.apache.http.impl.client.{DefaultHttpRequestRetryHandler, AbstractHttpClient, DefaultHttpClient}
 52 | 
 53 | 
 54 | /**
 55 |  * User: Jim Plush
 56 |  * Date: 12/16/10
 57 |  * This guy is kind of a doozy because goose is meant to pull millions of articles per day so the legitimacy of these links
 58 |  * is in question. For example many times you'll see mp3, mov, wav, etc.. files mislabeled as HTML with HTML content types,
 59 |  * only through inspection of the actual content will you learn what the real type of content is. Also spam sites could
 60 |  * contain up to 1GB of text that is just wasted resources so we set a max bytes level on how much content we're going
 61 |  * to try and pull back before we say screw it.
 62 |  */
 63 | object HtmlFetcher extends AbstractHtmlFetcher with Logging {
 64 |   /**
 65 |    * Holds a reference to our override cookie store: we don't want to store
 66 |    * cookies for head requests, as that only slows things down.
 67 |    */
 68 |   var emptyCookieStore: CookieStore = null
 69 |   /**
 70 |    * Holds the HttpClient object used for making requests.
 71 |    */
 72 |   private var httpClient: HttpClient = null
 73 |   initClient() // invoked once, as part of this object's initialization
 74 | 
 75 | 
 76 |   def getHttpClient: HttpClient = {
 77 |     httpClient
 78 |   }
 79 | 
 80 |   /**
 81 |    * Makes an http fetch to go retrieve the HTML from a url, store it to disk and pass it off
 82 |    * @param config Goose Configuration
 83 |    * @param url The web address to fetch
 84 |    * @return If all goes well, a `Some[String]` otherwise `None`
 85 |    * @throws NotFoundException(String)
 86 |    * @throws BadRequestException(String)
 87 |    * @throws NotAuthorizedException(String, Int)
 88 |    * @throws ServerErrorException(String, Int)
 89 |    * @throws UnhandledStatusCodeException(String, Int)
 90 |    * @throws MaxBytesException()
 91 |    */
 92 |   def getHtml(config: Configuration, url: String): Option[String] = {
 93 |     var httpget: HttpGet = null
 94 |     var htmlResult: String = null
 95 |     var entity: HttpEntity = null
 96 |     var instream: InputStream = null
 97 | 
 98 |     // Identified the the apache http client does not drop URL fragments before opening the request to the host
 99 |     // more info: http://stackoverflow.com/questions/4251841/400-error-with-httpclient-for-a-link-with-an-anchor
100 |     val cleanUrl = {
101 |       val foundAt = url.indexOf("#")
102 |       if (foundAt >= 0) url.substring(0, foundAt) else url
103 |     }
104 | 
105 |     try {
106 |       val localContext: HttpContext = new BasicHttpContext
107 |       localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
108 |       httpget = new HttpGet(cleanUrl)
109 |       HttpProtocolParams.setUserAgent(httpClient.getParams, config.getBrowserUserAgent());
110 | 
111 |       val params = httpClient.getParams
112 |       HttpConnectionParams.setConnectionTimeout(params, config.getConnectionTimeout())
113 |       HttpConnectionParams.setSoTimeout(params, config.getSocketTimeout())
114 | 
115 |       trace("Setting UserAgent To: " + HttpProtocolParams.getUserAgent(httpClient.getParams))
116 |       val response: HttpResponse = httpClient.execute(httpget, localContext)
117 | 
118 |       HttpStatusValidator.validate(cleanUrl, response.getStatusLine.getStatusCode) match {
119 |         case Left(ex) => throw ex
120 |         case _ =>
121 |       }
122 | 
123 |       entity = response.getEntity
124 |       if (entity != null) {
125 |         instream = entity.getContent
126 |         var encodingType: String = "UTF-8"
127 |         try {
128 |           encodingType = EntityUtils.getContentCharSet(entity)
129 |           if (encodingType == null) {
130 |             encodingType = "UTF-8"
131 |           }
132 |         }
133 |         catch {
134 |           case e: Exception => {
135 |             if (logger.isDebugEnabled) {
136 |               trace("Unable to get charset for: " + cleanUrl)
137 |               trace("Encoding Type is: " + encodingType)
138 |             }
139 |           }
140 |         }
141 |         try {
142 |           htmlResult = HtmlFetcher.convertStreamToString(instream, 15728640, encodingType).trim
143 |         }
144 |         finally {
145 |           EntityUtils.consume(entity)
146 |         }
147 |       }
148 |       else {
149 |         trace("Unable to fetch URL Properly: " + cleanUrl)
150 |       }
151 |     }
152 |     catch {
153 |       case e: NullPointerException => {
154 |         logger.warn(e.toString + " " + e.getMessage + " Caught for URL: " + cleanUrl)
155 |       }
156 |       case e: MaxBytesException => {
157 |         trace("GRVBIGFAIL: " + cleanUrl + " Reached max bytes size")
158 |         throw e
159 |       }
160 |       case e: SocketException => {
161 |         logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
162 |       }
163 |       case e: SocketTimeoutException => {
164 |         trace(e.toString)
165 |       }
166 |       case e: LoggableException => {
167 |         logger.warn(e.getMessage)
168 |         return None
169 |       }
170 |       case e: Exception => {
171 |         trace("FAILURE FOR LINK: " + cleanUrl + " " + e.toString)
172 |         return None
173 |       }
174 |     }
175 |     finally {
176 |       if (instream != null) {
177 |         try {
178 |           instream.close()
179 |         }
180 |         catch {
181 |           case e: Exception => {
182 |             logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
183 |           }
184 |         }
185 |       }
186 |       if (httpget != null) {
187 |         try {
188 |           httpget.abort()
189 |           entity = null
190 |         }
191 |         catch {
192 |           case e: Exception => {
193 |           }
194 |         }
195 |       }
196 |     }
197 |     if (logger.isDebugEnabled) {
198 |       logger.debug("starting...")
199 |     }
200 |     if (htmlResult == null || htmlResult.length < 1) {
201 |       if (logger.isDebugEnabled) {
202 |         logger.debug("HTMLRESULT is empty or null")
203 |       }
204 |       throw new NotHtmlException(cleanUrl)
205 |     }
206 |     var is: InputStream = null
207 |     var mimeType: String = null
208 |     try {
209 |       is = new ByteArrayInputStream(htmlResult.getBytes("UTF-8"))
210 |       mimeType = URLConnection.guessContentTypeFromStream(is)
211 |       if (mimeType != null) {
212 |         if ((mimeType == "text/html") == true || (mimeType == "application/xml") == true) {
213 |           return Some(htmlResult)
214 |         }
215 |         else {
216 |           if (htmlResult.contains("<title>") == true && htmlResult.contains("<p>") == true) {
217 |             return Some(htmlResult)
218 |           }
219 |           trace("GRVBIGFAIL: " + mimeType + " - " + cleanUrl)
220 |           throw new NotHtmlException(cleanUrl)
221 |         }
222 |       }
223 |       else {
224 |         throw new NotHtmlException(cleanUrl)
225 |       }
226 |     }
227 |     catch {
228 |       case e: UnsupportedEncodingException => {
229 |         logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
230 |       }
231 |       case e: IOException => {
232 |         logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
233 |       }
234 |     }
235 |     None
236 |   }
237 | 
238 |   private def initClient() {
239 | 
240 |     trace("Initializing HttpClient")
241 | 
242 |     val httpParams: HttpParams = new BasicHttpParams
243 |     HttpConnectionParams.setConnectionTimeout(httpParams, 10 * 1000)
244 |     HttpConnectionParams.setSoTimeout(httpParams, 10 * 1000)
245 |     HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1)
246 |     emptyCookieStore = new CookieStore {
247 |       def addCookie(cookie: Cookie) {
248 |       }
249 | 
250 |       def getCookies: List[Cookie] = {
251 |         emptyList
252 |       }
253 | 
254 |       def clearExpired(date: Date): Boolean = {
255 |         false
256 |       }
257 | 
258 |       def clear() {
259 |       }
260 | 
261 |       private[network] var emptyList: ArrayList[Cookie] = new ArrayList[Cookie]
262 |     }
263 |     httpParams.setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY)
264 |     httpParams.setParameter("http.User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")
265 |     httpParams.setParameter("http.language.Accept-Language", "en-us")
266 |     httpParams.setParameter("http.protocol.content-charset", "UTF-8")
267 |     httpParams.setParameter("Accept", "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")
268 |     httpParams.setParameter("Cache-Control", "max-age=0")
269 |     httpParams.setParameter("http.connection.stalecheck", false)
270 |     val schemeRegistry: SchemeRegistry = new SchemeRegistry
271 |     schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory))
272 |     schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory))
273 |     val cm = new ThreadSafeClientConnManager(schemeRegistry)
274 |     cm.setMaxTotal(20000)
275 |     cm.setDefaultMaxPerRoute(500)
276 |     httpClient = new DefaultHttpClient(cm, httpParams)
277 |     httpClient.asInstanceOf[AbstractHttpClient].setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false))
278 |     httpClient.getParams.setParameter("http.conn-manager.timeout", 120000L)
279 |     httpClient.getParams.setParameter("http.protocol.wait-for-continue", 10000L)
280 |     httpClient.getParams.setParameter("http.tcp.nodelay", true)
281 |   }
282 | 
283 |   /**
284 |    * reads bytes off the string and returns a string
285 |    *
286 |    * @param is the source stream from the response
287 |    * @param maxBytes The max bytes that we want to read from the input stream
288 |    * @return String
289 |    */
290 |   def convertStreamToString(is: InputStream, maxBytes: Int, encodingType: String): String = {
291 |     val buf: Array[Char] = new Array[Char](2048)
292 |     var r: Reader = null
293 |     val s = new StringBuilder
294 |     try {
295 |       r = new InputStreamReader(is, encodingType)
296 |       var bytesRead: Int = 2048
297 |       var inLoop = true
298 |       while (inLoop) {
299 |         if (bytesRead >= maxBytes) {
300 |           throw new MaxBytesException
301 |         }
302 |         var n: Int = r.read(buf)
303 |         bytesRead += 2048
304 | 
305 |         if (n < 0) inLoop = false
306 |         if (inLoop) s.appendAll(buf, 0, n)
307 |       }
308 |       return s.toString()
309 |     }
310 |     catch {
311 |       case e: SocketTimeoutException => {
312 |         logger.warn(e.toString + " " + e.getMessage)
313 |       }
314 |       case e: UnsupportedEncodingException => {
315 |         logger.warn(e.toString + " Encoding: " + encodingType)
316 |       }
317 |       case e: IOException => {
318 |         logger.warn(e.toString + " " + e.getMessage)
319 |       }
320 |     }
321 |     finally {
322 |       if (r != null) {
323 |         try {
324 |           r.close()
325 |         }
326 |         catch {
327 |           case e: Exception => {
328 |           }
329 |         }
330 |       }
331 |     }
332 |     null
333 |   }
334 | 
335 | 
336 | }
337 | 
338 | 
339 | 


--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/guardian1_result.txt:
--------------------------------------------------------------------------------
 1 | Kristen Wiig has had the kind of summer one might imagine to be life-changing. For four years, she andAnnie Mumolo, her friend and co-writer, slaved over a comedy script commissioned by Judd Apatow about a woman whose best friend is getting married. It was the 38-year-old's first lead in a film, and her first full-length script to be produced. As an indication of how it played with audiences, I watched it on a plane last month with a friend who, during the scene in which the bride squats in the street to relieve herself after a bad kebab, laughed so long and hard a woman passing in the aisle leant over and said, "What are you watching?"
 2 | 
 3 | Wiig smiles when I tell her this. "Proud," she says, of the day they filmed the shitting-in-the-street scene. "A proud moment."
 4 | 
 5 | We are in the tearoom of a fashionable hotel in Tribeca, the Manhattan neighbourhood where Wiig lives. Before Bridesmaids, she was known to US audiences as a long-running cast member of Saturday Night Live and elsewhere for scene-stealing cameos in films such as Ghost Town and Knocked Up.
 6 | 
 7 | That the film, by midsummer, had grossed more than $150m in the US and outstripped not only all of Apatow's other films, but every "R-rated female comedy" in history, puts Wiig in the zone of woman of the moment, although she chafes against this, with its implication that before Bridesmaids she was an ingenue.
 8 | 
 9 | "In most ways my life hasn't changed," she says. "I know that's a boring answer. People want to hear that I bought all gold, fur…" She allows a perfectly timed beat. "I would never wear fur."
10 | 
11 | But hasn't she had to turn down lots of offers?
12 | 
13 | "I mean. Yes, no. It feels weird to say that; you don't want to be like, 'Everyone wants me!' I mean. I guess Bridesmaids was definitely the biggest role I've ever had. And the fact that I co-wrote it and everything. But, um…" Wiig, who is slight, with very straight hair and an eager tilt to her body language, looks mortified. "It's not like I have boxes of scripts arriving at my door."
14 | 
15 | Her understatement is fuelled perhaps by the inevitable and awkward comparisons she has gained with other women in her business, as if the culture can sustain only a couple at a time. Wiig has been getting "the new Tina Fey" quite a lot – Fey was head writer at SNL when Wiig joined – although the comparison is faulty. Wiig is an actor first and a comedian second, and with a film directed by Sean Pennin the pipeline and another, Imogene, in which she stars alongside Annette Bening and Matt Dillon, wants to develop her career away from comedy. "People always call me a comedian. And I don't really see myself like that. I guess I just consider myself an actor who does comedy. But who wants to do other things as well."
16 | 
17 | It took her a long time to get here. After growing up in upstate New York, she went to university in Arizona and studied art before dropping out after the first year and going to LA to try to make it as an actor. Arizona is a notorious party college, but Wiig says all of that was out of her system by the time she left high school, where she had a few shaky years. "I was not that good a student because I was very… social. I cared more about going out with my friends. I didn't quite realise the importance of school. But then when I went to college I took it much more seriously, because I enjoyed it."
18 | 
19 | How social was she? Suspended?
20 | 
21 | "Um. Not for more than a couple of days. There were suspensions." Her expression fixes. "That's the past." Before the spotlight was so firmly on her, Wiig talked publicly about her minor-league acts of teenage hooliganism, including being caught underage drinking at a Grateful Deadgig, skipping school and, what she called the worst of it, smashing pot plants on a neighbour's porch, which she feels terrible about. As she entered her 20s her parents were still worried, she says, and then when she kicked in her degree and told them she wanted to be an actor, "probably the most worried they could be".
22 | 
23 | "Yes. Also, they didn't want me to get disappointed. They would always mention the numbers – do you know how many people are trying to do what you're doing? Your chances are really slim. And they're right. Technically. But when you're 20, you're like, why can't you just support me?! Can't you be proud that I'm trying to go after my dream?" She pulls a whiny face and tilts her head. "But they came around quickly when they saw how happy it made me. They would come and see me in the horrible little shows that I was in."
24 | 
25 | Wiig hadn't any great sense of being funny when she was growing up. Her dad, she says, tells a lot of jokes. Her mother is funny, but "mom funny, where she isn't trying to be funny, but is". Before retiring, her father ran a marina on one of the lakes upstate in New York (the name Wiig is from his Norwegian heritage). Her mother was an artist. Even after all these years, they haven't quite shed the sense of precariousness around their daughter's life; when she tells them she's in a movie, her mother will say tentatively, "Is that something we can see in theatres?" Wiig smiles and says, "They're still getting used to the idea that I'm working and it's OK."
26 | 
27 | With good reason. Wiig was 11 years in LA before she got the call from Saturday Night Live, during which time her income was erratic. She had arrived in the city with no professional contacts and a nagging sense of insurmountable competition. "I was incredibly intimidated and had no experience. I felt very scared and unsure and I didn't have any résumé, and everyone around me was very beautiful and young and I thought, oh, maybe I should work in a store and enjoy the weather. But I started taking improv classes and that's what got me started."
28 | 
29 | Improv was something she had never heard of before. But when she turned up to watch a gig one day at the Groundlings, the famous LA improv troupe with alumni such as Lisa Kudrow, Conan O'Brienand Will Ferrell, something resonated. The idea of standing on stage and making up stuff was, she says, less scary to her than the notion of saying lines, with the lurking fear there was a right and a wrong way to say them. With improvisation, there was no right and wrong: "You can't mess it up and you can't forget your lines."
30 | 
31 | Her enthusiasm wouldn't pay the bills, however, and Wiig worked at a series of day jobs, including at a floral design studio for a couple of years, and as a waitress in the refectory at Universal Studios. Now and then she'll run into someone on a TV show or a movie, and wonder where she knows them from. "And then I'll remember: oh yeah, I used to serve you Cobb salad."
32 | 
33 | There were many long, dark nights of the soul. "Oh my God, every month, yeah, because you don't have a lot of money coming in. When I look back, it was one of the best times of my life, because you're so in it with your friends. But you do have those moments when you're like: have I given it a try, should I stop, should I quit? But, no. You have a family there, you have a space to put shows on. I would rather be doing what I love and living above a garage – which I did – than not."
34 | 
35 | The call came in 2005. Wiig flew to New York for the first of several auditions with the Saturday Night Live creators. The audition format was standup, which she had never done before, and in front of a terrifying panel includingLorne Michaels, the legendary SNL producer, and Tina Fey. Wiig was required to unveil a range of characters of her own creation that might be suitable for the sketch show and, quivering up there alone on stage, she fully expected to be met with silence. When she heard a few laughs, she gathered strength, got through it and was called back for a second audition. After which, nothing. And then the new season started. "So I thought, right, pretty clear – thanks for coming. And then after the third show I got a call saying I was hired, come in…"
36 | 
37 | Wiig joined the show at a time when it was undergoing a cultural transition. Fey was the first female head writer and has written about the formally macho culture of the show – men pissing in jars by their desks, etc, which she put to comic use in 30 Rock. It was tough, she says, walking into a workplace where everyone knew each other: "Kind of like going into someone's living room for a party and they are really comfortable and have their shoes off and are sitting on the couch and I walk in and am a little dressed up and don't know where to stand? They were all very welcoming and nice but I knew I wasn't at that place yet where I could take my shoes off."
38 | 
39 | She was excited to be working with the likes of Fey, Amy Poehlerand Rachel Dratch, although Wiig is reluctant to describe the still testosterone-heavy environment at SNL as off-putting. "I mean, I mean, merely by numbers there are more men that work there, but I don't consider it… I don't even think about it. Men work there, women work there, we have a lot of amazing female writers on staff right now… There are more men, but I don't think anyone really…"
40 | 
41 | Was she a fan of Fey's before she joined the show?
42 | 
43 | "Um. I've watched the show since I was born. I mean I definitely admire all the stuff that she's accomplished, especially coming from SNL and being head writer, and then doing 30 Rock and all these movies and her book, I mean it's definitely something where you go, oh, that can happen. Someone can do that. She's done it. She deserves it."
44 | 
45 | To date, Bridesmaids has earned in the region of $286m worldwide; it doesn't need the qualifier "best female comedy" since it outgrossed Apatow's entire back catalogue, including Anchorman and The 40-Year-Old Virgin. Still, Wiig does not claim feminist dividends for the film – that it allowed women actors to be as gross on screen as men. She says when she and Mumolo were writing the shitting-in-the-street scene ("Can that be the title of the piece?"), it wasn't with an eye on levelling the playing field, nor was there much discussion of whether the market would tolerate that kind of vulgarity from women. No. "I think when you are doing anything creative and you think, 'What are the critics going to think?' instead of what you want to express, it can get a little muddy, and – I'm talking so seriously about this shitting-in-the-street – but with that in particular we were like, oh, this is a fun way to end the scene, and Annie used to do an impression of someone slowly realising they were shitting their pants, kind of slowly going down on to the ground. She would just do it as a joke, and it would always make me laugh really hard. She took it to a whole new level."
46 | 
47 | Apatow had approached Wiig and asked her to write a script for him after they worked together on Knocked Up, in which she played a small pivotal role as Katherine Heigl's bitchy boss. In her five minutes on screen, Wiig managed to communicate brilliantly the gap between what her character was saying and thinking. She and Mumolo first conceived of Bridesmaids not as a wedding movie per se, but as a movie about friendship. "I mean, it's called Bridesmaids, I get that. But it's about women who, when they reach that age, whether it's in their 30s or not, thought they were supposed to be somewhere else. That's where we started from. And the fact that Annie had been to seven weddings in two years. And that she had friends who were marrying money and she'd showed up at the country club for the bridal shower with her wing mirrors duct-taped to her car, and at the end of the night had to crawl through her window because the front door would always swell when it was hot out. But if it's your best friend, you don't want to be complaining…"
48 | 
49 | On paper at least, it didn't look too promising, with the generic title and the number of lame wedding movies in a seemingly exhausted genre. Apatow's name raised suspicions, too, about the use to which certain characters would be put, especially that of Megan, played by Melissa McCarthy, who looked like the inevitable one-fat-girl-in-the-group and the obvious butt of fat-girl jokes. In fact, McCarthy is the other break-out star of the film, and "the character that didn't care what anybody else thought. It was a lesson my character needed to learn. She doesn't care what anyone thinks, she's in her own world, but is generous and sweet. We wanted to have that opposite look on life, the character who seems at first like there was nothing she could say that would help, but…"
50 | 
51 | The writing of the dialogue was relatively easy, says Wiig, compared with figuring out what should happen in each scene, and the film went through countless draft versions, crammed in around other work commitments, so that Mumolo, for example, would fly out to Mexico where Wiig was filming, to work on it for a weekend. In early drafts, the women ended up in Vegas, but that got chucked out when, over the four years of writing, it was used up in other wedding films such as The Hangover.
52 | 
53 | Apart from the fact that it is very funny, Bridesmaids ultimately works because it has a kind of sweet sincerity and the friendship between the two lead characters seems real. It bemuses Wiig that the film has widely been described as "raunchy". It's really not raunchy. "Raunchy means like Porky's," she says and smiles. "Which is my next movie; it's going to be a Porky's prequel."
54 | 
55 | After six years in New York, Wiig is finally at home in the city. It was tough in the early days, she says, and when friends came to visit she would burst into tears as they left. ("I was so embarrassed. I thought, oh my God, they're going to go back and say, 'Kristen's not good. She is noooot coping well.'") If accounts are to be believed, she was briefly married to an actor called Hayes Hargrove and currently lives with her partner, a film-maker called Brian Petsos, but she responds to even the mildest question about her domestic life with a frozen smile. She would, of course, rather talk about acting, and her success in her first lead role – "I felt like I had to do a good job or no one would ever invite me to the party again" – has, despite her scrupulous modesty, been rewarded with the kind of films she always hoped she'd walk into. In the Sean Penn film The Comedian, which is still in the early stages of production, Wiig will co-star with Robert De Niro. It will be the real test of whether she is leading lady material, and whether she can carry a film without jokes. "I don't really think about it," says Wiig. "When you're in it, you're in it."
56 | 
57 | In the meantime, she has sketches to write and shoot as part of the gruelling schedule of Saturday Night Live. After the interview, she is due in at the office for the weekly writing night, when everyone is required to be in at 4pm and stay until the following morning. Wiig is riding so high at the moment that when, as we leave, I ask her to confirm her age, I'm surprised when she grimaces. Yes, she says, she's 38. Why the face? Under her breath, like a dangerous heresy, she says, "I feel like women are asked their age more than men." And she snaps on a smile and leaves the restaurant.


--------------------------------------------------------------------------------