├── images ├── gdelt.png └── article.jpeg ├── .gitignore ├── .travis.yml ├── src ├── main │ ├── resources │ │ ├── com │ │ │ ├── gravity │ │ │ │ └── goose │ │ │ │ │ ├── images │ │ │ │ │ └── known-image-css.txt │ │ │ │ │ ├── statichtml │ │ │ │ │ ├── msn1_result.txt │ │ │ │ │ ├── issue_24_result.txt │ │ │ │ │ ├── issue_24.txt │ │ │ │ │ └── guardian1_result.txt │ │ │ │ │ └── text │ │ │ │ │ └── stopwords-en.txt │ │ │ └── aamend │ │ │ │ └── spark │ │ │ │ └── gdelt │ │ │ │ └── reference │ │ │ │ ├── cameoReligion.txt │ │ │ │ ├── cameoType.txt │ │ │ │ ├── cameoGroup.txt │ │ │ │ ├── cameoCountry.txt │ │ │ │ ├── cameoEthnic.txt │ │ │ │ └── cameoEvent.txt │ │ └── log4j.properties │ └── scala │ │ └── com │ │ ├── gravity │ │ └── goose │ │ │ ├── network │ │ │ ├── MaxBytesException.scala │ │ │ ├── AbstractHtmlFetcher.scala │ │ │ ├── NotHtmlException.scala │ │ │ ├── HttpExceptions.scala │ │ │ └── HtmlFetcher.scala │ │ │ ├── images │ │ │ ├── DepthTraversal.scala │ │ │ ├── SecretGifException.scala │ │ │ ├── Image.scala │ │ │ ├── ImageDetails.scala │ │ │ ├── ImageExtractor.scala │ │ │ ├── ImageSaver.scala │ │ │ └── ImageUtils.scala │ │ │ ├── extractors │ │ │ ├── TagsEvaluator.scala │ │ │ ├── StandardContentExtractor.scala │ │ │ ├── AdditionalDataExtractor.scala │ │ │ ├── Extractor.scala │ │ │ └── PublishDateExtractor.scala │ │ │ ├── cleaners │ │ │ └── StandardDocumentCleaner.scala │ │ │ ├── outputformatters │ │ │ ├── StandardOutputFormatter.scala │ │ │ └── OutputFormatter.scala │ │ │ ├── text │ │ │ ├── HashUtils.scala │ │ │ ├── StringSplitter.scala │ │ │ ├── string.scala │ │ │ ├── StringReplacement.scala │ │ │ ├── WordStats.scala │ │ │ ├── StopWords.scala │ │ │ └── ReplaceSequence.scala │ │ │ ├── utils │ │ │ ├── FileHelper.scala │ │ │ ├── URLHelper.scala │ │ │ └── Logging.scala │ │ │ ├── spark │ │ │ ├── package.scala │ │ │ └── GooseFetcher.scala │ │ │ ├── Goose.scala │ │ │ ├── Article.scala │ │ │ ├── Configuration.scala │ │ │ └── Crawler.scala │ │ └── aamend │ │ └── spark │ │ └── 
gdelt │ │ ├── reference │ │ ├── CountryCodes.scala │ │ ├── GcamCodes.scala │ │ └── CameoCodes.scala │ │ └── ContentFetcher.scala └── test │ ├── scala │ └── com │ │ └── aamend │ │ └── spark │ │ └── gdelt │ │ ├── SparkSpec.scala │ │ ├── ContentFetcherTest.scala │ │ ├── TTest.scala │ │ └── GdeltParserTest.scala │ └── resources │ └── com │ └── aamend │ └── spark │ └── gdelt │ ├── normDaily.csv │ └── normDailyByCountry.csv ├── LICENSE └── pom.xml /images/gdelt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/gdelt.png -------------------------------------------------------------------------------- /images/article.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/article.jpeg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target 4 | derby.log 5 | spark-warehouse 6 | metastore_db 7 | .DS_Store 8 | movejar.sh -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | install: mvn --quiet install -Dmaven.javadoc.skip=true -Dgpg.skip=true 3 | script: mvn test -Dmaven.javadoc.skip=true -DargLine="-Xmx2G" -------------------------------------------------------------------------------- /src/main/resources/com/gravity/goose/images/known-image-css.txt: -------------------------------------------------------------------------------- 1 | latimes.com^thumbnail 2 | cnn.com^storytext|cnn_strycntntlft 3 | foxnews.com^entry-content 4 | msn.com^articleText 5 | go.com^mediaimage 6 | buzznet.com^itembody 7 | time.com^entry-content 
/**
 * Marker exception with no message, cause, or additional state.
 *
 * NOTE(review): presumably raised when a fetched payload exceeds a byte limit;
 * confirm against the fetcher that throws it.
 */
class MaxBytesException extends Exception
/**
 * Jsoup `Evaluator` that accepts an element when its tag name is one of `tags`.
 *
 * @param tags the tag names to accept
 */
class TagsEvaluator(tags: scala.collection.Set[String]) extends Evaluator {

  /**
   * @param root    root of the tree being evaluated (not consulted)
   * @param element candidate element
   * @return `true` when the candidate's tag name is contained in `tags`
   */
  override def matches(root: Element, element: Element): Boolean = {
    val candidateTag = element.tagName()
    tags.contains(candidateTag)
  }
}

object TagsEvaluator {

  /**
   * Builds an evaluator from a variable number of tag names; duplicates collapse into a set.
   */
  def apply(tags: String*): TagsEvaluator = {
    val uniqueTags = tags.toSet
    new TagsEvaluator(uniqueTags)
  }
}
/**
 * Loader for the country-code reference table shipped as a classpath resource.
 */
object CountryCodes {

  /**
   * Reads the tab-separated `countryInfo.txt` resource (resolved relative to this object's
   * class), skips its first line, and converts every remaining line into a [[CountryCode]].
   *
   * The resource is read completely and then closed before the dataset is created. The previous
   * implementation never closed the underlying `Source` and depended on a lazily evaluated
   * sequence keeping the stream alive.
   *
   * @param spark session whose implicits are used to build the dataset
   * @return a dataset with one [[CountryCode]] per data line; columns 0-4 populate `iso`,
   *         `iso3`, `isoNumeric`, `fips` and the lower-cased `country`, each passed through `T`
   * @throws java.io.FileNotFoundException if the resource is not on the classpath
   */
  def load(spark: SparkSession): Dataset[CountryCode] = {
    import spark.implicits._

    val resourceName = "countryInfo.txt"
    val stream = this.getClass.getResourceAsStream(resourceName)
    if (stream == null) {
      // Fail with a descriptive error instead of a NullPointerException deep inside Source.
      throw new java.io.FileNotFoundException("Classpath resource not found: " + resourceName)
    }

    val source = Source.fromInputStream(stream, "UTF-8")
    // Materialise eagerly so the stream can be closed safely before Spark touches the data.
    val dataLines = try {
      source.getLines().drop(1).toList
    } finally {
      source.close()
    }

    dataLines.map(line => {
      val tokens = line.split("\t")
      CountryCode(
        iso = T(()=>tokens(0)),
        iso3 = T(()=>tokens(1)),
        isoNumeric = T(()=>tokens(2)),
        fips = T(()=>tokens(3)),
        country = T(()=>tokens(4).toLowerCase())
      )
    }).toDS()
  }

}
/**
 * Behavioural spec for the `T` helper: it is given a thunk and the assertions below pin
 * which thunks yield `None` and which yield `Some(value)`.
 */
class TTest extends FlatSpec with Matchers {

  // A null result, or a failure while evaluating the thunk, is reported as None.
  "null" should "return None" in {
    T(() => null) shouldBe None
    T(() => null.toString) shouldBe None
  }

  "Integer" should "return Int" in {
    T(() => "1".toInt) shouldBe Some(1)
    T(() => "a".toInt) shouldBe None
  }

  "Long" should "return Long" in {
    T(() => "1".toLong) shouldBe Some(1L)
    T(() => "a".toLong) shouldBe None
  }

  "Float" should "return Float" in {
    T(() => "1.0".toFloat) shouldBe Some(1.0)
    T(() => "a".toFloat) shouldBe None
  }

  // Strings are expected to come back trimmed; blank strings count as missing.
  "String" should "return String" in {
    T(() => "1") shouldBe Some("1")
    T(() => " 1 ") shouldBe Some("1")
    T(() => " ") shouldBe None
    T(() => "") shouldBe None
  }

}
/**
 * Loader for the GCAM code reference table shipped as a classpath resource.
 */
object GcamCodes {

  /**
   * Reads the tab-separated `gcam.txt` resource (resolved relative to this object's class),
   * skips its first line, and converts every remaining line into a [[GcamCode]].
   *
   * The resource is read completely and then closed before the dataset is created. The previous
   * implementation never closed the underlying `Source` and depended on a lazily evaluated
   * sequence keeping the stream alive.
   *
   * @param spark session whose implicits are used to build the dataset
   * @return a dataset with one [[GcamCode]] per data line; columns 0-7 populate the fields in
   *         declaration order, each passed through `T`
   * @throws java.io.FileNotFoundException if the resource is not on the classpath
   */
  def load(spark: SparkSession): Dataset[GcamCode] = {
    import spark.implicits._

    val resourceName = "gcam.txt"
    val stream = this.getClass.getResourceAsStream(resourceName)
    if (stream == null) {
      // Fail with a descriptive error instead of a NullPointerException deep inside Source.
      throw new java.io.FileNotFoundException("Classpath resource not found: " + resourceName)
    }

    val source = Source.fromInputStream(stream, "UTF-8")
    // Materialise eagerly so the stream can be closed safely before Spark touches the data.
    val dataLines = try {
      source.getLines().drop(1).toList
    } finally {
      source.close()
    }

    dataLines.map(line => {
      val tokens = line.split("\t")
      GcamCode(
        gcamCode = T(()=>tokens(0)),
        dictionaryId = T(()=>tokens(1)),
        dimensionId = T(()=>tokens(2)),
        dictionaryType = T(()=>tokens(3)),
        languageCode = T(()=>tokens(4)),
        dictionaryHumanName = T(()=>tokens(5)),
        dimensionHumanName = T(()=>tokens(6)),
        dictionaryCitation = T(()=>tokens(7))
      )
    }).toDS()
  }

}
/**
 * Contract for the component that downloads HTML and exposes the HTTP client used for
 * image requests. Supply your own implementation through the configuration to take full
 * control of network access (the stock implementation is
 * `com.gravity.goose.network.HtmlFetcher`).
 */
trait AbstractHtmlFetcher {

  /**
   * Fetches the HTML served at `url`.
   *
   * @param config overrides and tweaks that apply to the request
   * @param url    address to retrieve content from
   * @return `Some(html)` with the response body, or `None` when no HTML could be retrieved
   */
  def getHtml(config: Configuration, url: String): Option[String]

  /**
   * Shared accessor used when making image calls.
   *
   * @return a fully configured and initialised client intended for shared use
   */
  def getHttpClient: HttpClient
}
/**
 * Marker exception with no message, cause, or additional state.
 *
 * NOTE(review): the name suggests it signals an image that turned out to be a GIF;
 * confirm against the image extraction code that throws it.
 */
class SecretGifException extends Exception
/**
 * Mutable holder for an image reference. Both fields start out as the empty string.
 */
class Image {

  /** Source location of the image. */
  var imageSrc: String = ""

  /** Image content as a Base64 string. */
  var imageBase64: String = ""
}
/**
 * Raised when a URL did not yield HTML.
 *
 * The message is handed to the `Exception` constructor rather than overriding the Java
 * `getMessage` method with a `val`. This avoids an extra field and keeps both
 * `e.getMessage` and `e.getMessage()` valid for callers.
 *
 * @param url the address for which no HTML was returned; included in the message
 */
class NotHtmlException(url: String) extends Exception("No HTML returned for url: " + url)
/**
 * Stock output formatter: a singleton that mixes `OutputFormatter` with `Logging`
 * and adds no members of its own.
 */
object StandardOutputFormatter
  extends OutputFormatter
  with Logging
/**
 * Default extractor for extra key/value data. This base implementation extracts nothing;
 * subclass it and override `extract` to collect anything not already captured by
 * `com.gravity.goose.Article`.
 */
class AdditionalDataExtractor extends Extractor[Map[String, String]] {

  /**
   * @param rootElement element to extract from (ignored by this default implementation)
   * @return always an empty map
   */
  override def extract(rootElement: Element): Map[String, String] = Map.empty
}
/**
 * Hashing helpers.
 */
object HashUtils {

  /**
   * Computes the MD5 digest of a string.
   *
   * The string is encoded as UTF-8 before hashing. The previous implementation used the
   * platform default charset, so the same non-ASCII input could hash differently on
   * different machines.
   *
   * @param s text to hash; must not be null
   * @return the digest as 32 lower-case hexadecimal characters
   */
  def md5(s: String): String = {
    val digest = MessageDigest.getInstance("MD5")
    val hashed = digest.digest(s.getBytes("UTF-8"))

    // Mask each signed byte to 0..255 so it renders as exactly two hex digits.
    hashed.map(b => "%02x".format(b & 0xFF)).mkString
  }

}
/**
 * Splits strings around matches of a regular expression compiled once at construction.
 *
 * The no-argument constructor leaves the pattern unset (null); use the `String`
 * constructor to obtain a usable splitter.
 */
class StringSplitter {

  // Compiled regex; only assigned by the String constructor.
  private var pattern: Pattern = null

  /**
   * @param pattern regular expression to split on
   */
  def this(pattern: String) = {
    this()
    this.pattern = Pattern.compile(pattern)
  }

  /**
   * @param input text to split
   * @return `string.emptyArray` when `input` is null or empty, otherwise the pieces of
   *         `input` around each match of the pattern
   */
  def split(input: String): Array[String] =
    if (string.isNullOrEmpty(input)) string.emptyArray
    else pattern.split(input)
}
/**
 * Helpers for reading files bundled on the classpath.
 */
object FileHelper extends Logging {

  /**
   * Loads a classpath resource as a UTF-8 string on a best-effort basis.
   *
   * Compared with the previous implementation, the stream is now always closed, and a
   * missing resource is logged and reported as an empty string instead of escaping as a
   * `NullPointerException` (which the `IOException` handler never caught).
   *
   * @param filename resource name, resolved by `cls.getResourceAsStream`
   * @param cls      class whose loader and package are used to resolve `filename`
   * @return the resource contents, or `""` when the resource is missing or cannot be read
   */
  def loadResourceFile[A](filename: String, cls: Class[A]): String = {
    val is: InputStream = cls.getResourceAsStream(filename)

    if (is == null) {
      // getResourceAsStream signals "not found" with null; treat it like any other read failure.
      val missing = new IOException("Resource not found: " + filename)
      warn(missing, missing.toString)
      ""
    } else {
      try {
        IOUtils.toString(is, "UTF-8")
      } catch {
        case e: IOException =>
          warn(e, e.toString)
          ""
      } finally {
        // A failure while closing must not mask the result or the original error.
        try {
          is.close()
        } catch {
          case _: IOException => ()
        }
      }
    }
  }
}
/**
 * Small string utilities and shared constants.
 */
object string {

  /** The empty string. */
  val empty: String = ""

  /** A one-element array holding the empty string (not a zero-length array). */
  val emptyArray: Array[String] = Array[String](empty)

  /** Splitter that breaks text on a single space. */
  var SPACE_SPLITTER: StringSplitter = new StringSplitter(" ")

  /**
   * @param input value to test
   * @return `true` when `input` is null or has length zero
   */
  def isNullOrEmpty(input: String): Boolean =
    input == null || input.length == 0

  /**
   * Parses `input` as an `Int`.
   *
   * @param input text to parse
   * @return `Some(value)` on success, `None` if parsing throws any `Exception`
   */
  def tryToInt(input: String): Option[Int] =
    try {
      Some(input.toInt)
    } catch {
      case _: Exception => None
    }
}
24 | * User: robbie 25 | * Date: 5/19/11 26 | * Time: 2:45 PM 27 | */ 28 | /** 29 | * Encapsulates the process of extracting some type T from an article 30 | * @tparam T the type the implementing class will return 31 | */ 32 | trait Extractor[T] { 33 | /** 34 | * Given the specified {@link Element}, extract an instance of T 35 | * 36 | * @param rootElement passed in from the {@link com.jimplush.goose.ContentExtractor} after the article has been parsed 37 | * @return an instance of type T 38 | */ 39 | def extract(rootElement: Element): T 40 | } -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/images/ImageDetails.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 |  */
18 | package com.gravity.goose.images
19 |
20 | /**
21 |  * Created by Jim Plush
22 |  * User: jim
23 |  * Date: 8/18/11
24 |  */
25 |
26 | /**
27 |  * Mutable holder for what was learned by inspecting an image:
28 |  * its width, its height and its mime type.
29 |  * @author Jim Plush
30 |  */
31 | class ImageDetails {
32 |   /** width of the image; 0 until set */
33 |   private var width: Int = 0
34 |   /** height of the image; 0 until set */
35 |   private var height: Int = 0
36 |   /** the mimeType of the image JPEG / PNG; null until set */
37 |   private var mimeType: String = _
38 |
39 |   def getWidth: Int = width
40 |
41 |   def setWidth(width: Int): Unit = {
42 |     this.width = width
43 |   }
44 |
45 |   def getHeight: Int = height
46 |
47 |   def setHeight(height: Int): Unit = {
48 |     this.height = height
49 |   }
50 |
51 |   def getMimeType: String = mimeType
52 |
53 |   def setMimeType(mimeType: String): Unit = {
54 |     this.mimeType = mimeType
55 |   }
56 | }
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StringReplacement.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 |  * Created by IntelliJ IDEA.
23 |  * User: robbie
24 |  * Date: 5/13/11
25 |  * Time: 11:38 AM
26 |  */
27 |
28 | import java.util.regex.Pattern
29 |
30 | object StringReplacement {
31 |   /**
32 |    * Builds a replacement from a regular expression and the text to substitute for each match.
33 |    * Throws IllegalArgumentException when `pattern` is null or empty.
34 |    */
35 |   def compile(pattern: String, replaceWith: String): StringReplacement = {
36 |     if (string.isNullOrEmpty(pattern)) throw new IllegalArgumentException("Patterns must not be null or empty!")
37 |     new StringReplacement(Pattern.compile(pattern), replaceWith)
38 |   }
39 | }
40 |
41 | class StringReplacement {
42 |   private var pattern: Pattern = null
43 |   private var replaceWith: String = null
44 |
45 |   // only the companion's compile() goes through this constructor
46 |   private def this(pattern: Pattern, replaceWith: String) = {
47 |     this()
48 |     this.pattern = pattern
49 |     this.replaceWith = replaceWith
50 |   }
51 |
52 |   /** Replaces every match in `input`; a null or empty `input` yields the empty string. */
53 |   def replaceAll(input: String): String =
54 |     if (string.isNullOrEmpty(input)) string.empty
55 |     else pattern.matcher(input).replaceAll(replaceWith)
56 | }
57 |
58 |
59 |
60 |
-------------------------------------------------------------------------------- /src/test/resources/com/aamend/spark/gdelt/normDaily.csv: -------------------------------------------------------------------------------- 1 | 19200101,87765 2 | 19200102,121054 3 | 19200103,153580 4 | 19200104,110228 5 | 19200105,63362 6 | 19200106,85 7 | 19790101,661 8 | 19790102,976 9 | 19790103,1060 10 | 19790104,950 11 | 19790105,1027 12 | 19790106,644 13 | 19790107,884 14 | 19790108,1291 15 | 19790109,1287 16 | 19790110,1290 17 | 19790111,866 18 | 19790112,1443 19 | 19790113,726 20 | 19790114,851 21 | 19790115,1138 22 | 19790116,1290 23 | 19790117,1279 24 | 19790118,1353 25 | 19790119,1324 26 | 19790120,1165 27 | 19790121,744 28 | 19790122,1065 29 | 19790123,1277 30 | 19790124,1079 31 | 19790125,1156 32 | 19790126,1563 33 | 19790127,780 34 | 19790128,983 35 | 19790129,1285 36 | 19790130,1168 37 | 19790131,925 38 | 19790201,902 39 | 19790202,1625 40 | 19790203,738 41 | 
19790204,602 42 | 19790205,1332 43 | 19790206,1170 44 | 19790207,1218 45 | 19790208,1286 46 | 19790209,1383 47 | 19790210,825 48 | 19790211,714 49 | 19790212,944 50 | 19790213,1056 51 | 19790214,1184 52 | 19790215,1159 53 | 19790216,1473 54 | 19790217,1216 55 | 19790218,1072 56 | 19790219,1581 57 | 19790220,1246 58 | 19790221,1617 59 | 19790222,1165 60 | 19790223,1685 61 | 19790224,875 62 | 19790225,1255 63 | 19790226,1364 64 | 19790227,1247 65 | 19790228,1516 66 | 19790301,789 67 | 19790302,1237 68 | 19790303,507 69 | 19790304,648 70 | 19790305,839 71 | 19790306,864 72 | 19790307,842 73 | 19790308,648 74 | 19790309,1145 75 | 19790310,738 76 | 19790311,719 77 | 19790312,1465 78 | 19790313,969 79 | 19790314,1034 80 | 19790315,1420 81 | 19790316,2019 82 | 19790317,1349 83 | 19790318,1056 84 | 19790319,1312 85 | 19790320,1450 86 | 19790321,1387 87 | 19790322,1354 88 | 19790323,1630 89 | 19790324,1125 90 | 19790325,967 91 | 19790326,1043 92 | 19790327,1297 93 | 19790328,1244 94 | 19790329,1286 95 | 19790330,1661 96 | 19790331,1120 97 | 19790401,1240 98 | 19790402,1038 99 | 19790403,1193 100 | 19790404,1276 101 | -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/network/HttpExceptions.scala: -------------------------------------------------------------------------------- 1 | package com.gravity.goose.network 2 | 3 | /** 4 | * Created by IntelliJ IDEA. 
5 |  * Author: Robbie Coleman
6 |  * Date: 11/2/11
7 |  * Time: 10:25 AM
8 |  */
9 |
10 | /**
11 |  * Base exception whose message is prefixed with the concrete class name and,
12 |  * when an inner exception is supplied, followed by that exception's type and message.
13 |  */
14 | class LoggableException(msg: String, innerEx: Exception = null) extends Exception(msg, innerEx) {
15 |   override lazy val getMessage = {
16 |     val innerDetail = Option(innerEx) match {
17 |       case Some(cause) =>
18 |         "%n\tand inner Exception of type %s:%n\t\tmessage: %s".format(cause.getClass.getName, cause.getMessage)
19 |       case None =>
20 |         ""
21 |     }
22 |     getClass.getName + " ==> " + msg + innerDetail
23 |   }
24 | }
25 |
26 | class NotFoundException(url: String) extends LoggableException("SERVER RETURNED 404 FOR LINK: " + url)
27 | class BadRequestException(url: String) extends LoggableException("Bad Request for URL: " + url)
28 | class NotAuthorizedException(url: String, statusCode: Int = 403) extends LoggableException("Not authorized (statusCode: %d) to access URL: %s".format(statusCode, url))
29 | class ServerErrorException(url: String, statusCode: Int = 500) extends LoggableException("Server Error! Status code returned: %d for URL: %s".format(statusCode, url))
30 | class UnhandledStatusCodeException(url: String, statusCode: Int) extends LoggableException("Received HTTP statusCode: %d from URL: %s and did not know how to handle it!".format(statusCode, url))
31 |
32 | object HttpStatusValidator {
33 |   /**
34 |    * Maps an HTTP status code to Right("OK") for 200, or to a Left holding the matching exception:
35 |    * 400 and 404 have dedicated types, any other 4xx is NotAuthorizedException,
36 |    * 500 and above is ServerErrorException, everything else is UnhandledStatusCodeException.
37 |    */
38 |   def validate(url: String, statusCode: Int): Either[Exception, String] =
39 |     if (statusCode == 200) Right("OK")
40 |     else if (statusCode == 400) Left(new BadRequestException(url))
41 |     else if (statusCode == 404) Left(new NotFoundException(url))
42 |     else if (statusCode > 400 && statusCode < 500) Left(new NotAuthorizedException(url, statusCode))
43 |     else if (statusCode > 499) Left(new ServerErrorException(url, statusCode))
44 |     else Left(new UnhandledStatusCodeException(url, statusCode))
45 | }
46 |
47 | class ImageFetchException(imgSrc: String, ex: Exception = null) extends LoggableException("Failed to fetch image file from imgSrc: " + imgSrc, ex)
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/WordStats.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 |
19 | package com.gravity.goose.text
20 |
21 | import java.util.ArrayList
22 | import java.util.List
23 |
24 | /**
25 |  * User: Jim Plush
26 |  * Date: Oct 29, 2010
27 |  * Time: 3:59:44 PM
28 |  */
29 | object WordStats {
30 |   /** Shared instance with zero counts and an empty stop-word list; it is a mutable, reassignable reference. */
31 |   var EMPTY: WordStats = new WordStats
32 | }
33 |
34 | /** Mutable word statistics: total words, stop-word count and the stop words themselves. */
35 | class WordStats {
36 |
37 |   import WordStats._
38 |
39 |   /** total number of stopwords or good words that we can calculate */
40 |   var stopWordCount: Int = 0
41 |   /** total number of words on a node */
42 |   var wordCount: Int = 0
43 |   /** holds an actual list of the stop words we found */
44 |   var stopWords: List[String] = new ArrayList[String]
45 |
46 |   def getStopWords: List[String] = stopWords
47 |
48 |   def setStopWords(words: List[String]): Unit = {
49 |     stopWords = words
50 |   }
51 |
52 |   def getStopWordCount: Int = stopWordCount
53 |
54 |   def setStopWordCount(wordcount: Int): Unit = {
55 |     stopWordCount = wordcount
56 |   }
57 |
58 |   def getWordCount: Int = wordCount
59 |
60 |   def setWordCount(cnt: Int): Unit = {
61 |     wordCount = cnt
62 |   }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/spark/package.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose
2 |
3 | import java.sql.Date
4 |
5 | import org.apache.commons.lang.StringUtils
6 |
7 | import scala.util.Try
8 |
9 | package object spark {
10 |
11 |   val ANNOTATOR_TITLE = "title"
12 |   val ANNOTATOR_CONTENT = "content"
13 |   val ANNOTATOR_DESCRIPTION = "description"
14 |   val ANNOTATOR_KEYWORDS = "keywords"
15 |   val ANNOTATOR_PUBLISH_DATE = "publishDate"
16 |
17 |   // List of supported annotators
18 |   val ANNOTATORS = Array(
19 |     ANNOTATOR_TITLE,
20 |     ANNOTATOR_CONTENT,
21 |     ANNOTATOR_DESCRIPTION,
22 |     ANNOTATOR_KEYWORDS,
23 |     ANNOTATOR_PUBLISH_DATE
24 |   )
25 |
26 |   def scrapeArticles(it: Iterator[String], goose: Goose): Iterator[GooseArticle] = {
27 |     it.map(url => {
28 |       Try {
29 |         val article = goose.extractContent(url)
30 |         GooseArticle(
31 |           url = url,
32 |           title = if(StringUtils.isNotEmpty(article.title)) Some(article.title) else None,
33 |           content = if(StringUtils.isNotEmpty(article.cleanedArticleText)) Some(article.cleanedArticleText.replaceAll("\\n+", "\n")) else None,
34 |           description = if(StringUtils.isNotEmpty(article.metaDescription)) Some(article.metaDescription) else None,
35 |           keywords = if(StringUtils.isNotEmpty(article.metaKeywords)) article.metaKeywords.split(",").map(_.trim.toUpperCase) else Array.empty[String],
36 |           publishDate = if(article.publishDate != null) Some(new Date(article.publishDate.getTime)) else None,
37 |           image = None
38 |         )
39 |       } getOrElse GooseArticle(url)
40 |     })
41 |   }
42 |
43 |   case class GooseArticle(
44 |     url: String,
45 |     title: Option[String] = None,
46 |     content: Option[String] = None,
47 |     description: Option[String] = None,
48 |     keywords: Array[String] = Array.empty[String],
49 
| publishDate: Option[Date] = None, 50 | image: Option[String] = None 51 | ) 52 | } 53 | -------------------------------------------------------------------------------- /src/test/resources/com/aamend/spark/gdelt/normDailyByCountry.csv: -------------------------------------------------------------------------------- 1 | 19200101,,2396 2 | 19200101,AC,10 3 | 19200101,AE,200 4 | 19200101,AF,699 5 | 19200101,AG,55 6 | 19200101,AJ,48 7 | 19200101,AL,20 8 | 19200101,AM,37 9 | 19200101,AO,76 10 | 19200101,AR,68 11 | 19200101,AS,2650 12 | 19200101,AU,165 13 | 19200101,AY,12 14 | 19200101,BA,72 15 | 19200101,BB,47 16 | 19200101,BC,18 17 | 19200101,BD,9 18 | 19200101,BE,104 19 | 19200101,BF,68 20 | 19200101,BG,456 21 | 19200101,BH,28 22 | 19200101,BK,15 23 | 19200101,BL,64 24 | 19200101,BM,70 25 | 19200101,BN,49 26 | 19200101,BO,67 27 | 19200101,BR,246 28 | 19200101,BT,180 29 | 19200101,BU,41 30 | 19200101,BX,36 31 | 19200101,BY,9 32 | 19200101,CA,1720 33 | 19200101,CB,174 34 | 19200101,CD,43 35 | 19200101,CE,351 36 | 19200101,CF,40 37 | 19200101,CG,11 38 | 19200101,CH,2595 39 | 19200101,CI,93 40 | 19200101,CJ,1 41 | 19200101,CM,90 42 | 19200101,CO,100 43 | 19200101,CS,14 44 | 19200101,CT,12 45 | 19200101,CU,96 46 | 19200101,CW,17 47 | 19200101,CY,306 48 | 19200101,DA,76 49 | 19200101,DJ,19 50 | 19200101,DR,18 51 | 19200101,EC,19 52 | 19200101,EG,350 53 | 19200101,EI,700 54 | 19200101,EK,3 55 | 19200101,EN,25 56 | 19200101,ER,1 57 | 19200101,ES,47 58 | 19200101,ET,145 59 | 19200101,EZ,18 60 | 19200101,FI,50 61 | 19200101,FJ,53 62 | 19200101,FR,974 63 | 19200101,GA,29 64 | 19200101,GB,6 65 | 19200101,GG,19 66 | 19200101,GH,450 67 | 19200101,GI,2 68 | 19200101,GJ,9 69 | 19200101,GK,7 70 | 19200101,GL,7 71 | 19200101,GM,604 72 | 19200101,GQ,11 73 | 19200101,GR,254 74 | 19200101,GT,80 75 | 19200101,GV,72 76 | 19200101,GY,60 77 | 19200101,GZ,8 78 | 19200101,HA,88 79 | 19200101,HK,518 80 | 19200101,HO,36 81 | 19200101,HR,51 82 | 19200101,HU,99 83 | 19200101,IC,23 84 | 
19200101,ID,349 85 | 19200101,IN,5068 86 | 19200101,IR,3040 87 | 19200101,IS,1931 88 | 19200101,IT,421 89 | 19200101,IV,45 90 | 19200101,IZ,3500 91 | 19200101,JA,1203 92 | 19200101,JE,101 93 | 19200101,JM,138 94 | 19200101,JO,146 95 | 19200101,KE,353 96 | 19200101,KG,17 97 | 19200101,KN,776 98 | 19200101,KR,19 99 | 19200101,KS,552 100 | 19200101,KU,249 101 | 
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StopWords.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 |  * Created by Jim Plush
23 |  * User: jim
24 |  * Date: 8/16/11
25 |  */
26 |
27 | import java.util._
28 | import com.gravity.goose.utils.FileHelper
29 |
30 | object StopWords {
31 |
32 |   // the confusing pattern below is basically just match any non-word character excluding white-space.
33 |   private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
34 |
35 |   // Split on "\n" or "\r\n" instead of the JVM's line.separator: the resource's line endings are fixed
36 |   // when the file is packaged and need not match the platform the code runs on. Empty entries are
37 |   // dropped so that a blank line (or an unreadable file) never makes "" count as a stop word.
38 |   val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split("\\r?\\n").filter(_.nonEmpty).toSet
39 |
40 |
41 |   /** Strips every character matched by PUNCTUATION from `str`. */
42 |   def removePunctuation(str: String): String = {
43 |     PUNCTUATION.replaceAll(str)
44 |   }
45 |
46 |   /**
47 |    * Counts the words of `content` and collects those found in STOP_WORDS (compared in lower case).
48 |    *
49 |    * @param content text to analyse
50 |    * @return the shared WordStats.EMPTY instance for null or empty content, otherwise a new WordStats
51 |    *         carrying the word count, the stop-word count and the lower-cased stop words found
52 |    */
53 |   def getStopWordCount(content: String): WordStats = {
54 |
55 |     if (string.isNullOrEmpty(content)) return WordStats.EMPTY
56 |     val ws: WordStats = new WordStats
57 |     val strippedInput: String = removePunctuation(content)
58 |
59 |     val candidateWords: Array[String] = string.SPACE_SPLITTER.split(strippedInput)
60 |
61 |     val overlappingStopWords: List[String] = new ArrayList[String]
62 |
63 |     candidateWords.foreach(w => {
64 |       // lower-case once and reuse it for both the lookup and the stored word
65 |       val lowered = w.toLowerCase
66 |       if (STOP_WORDS.contains(lowered)) overlappingStopWords.add(lowered)
67 |     })
68 |     ws.setWordCount(candidateWords.length)
69 |     ws.setStopWordCount(overlappingStopWords.size)
70 |     ws.setStopWords(overlappingStopWords)
71 |     ws
72 |   }
73 |
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/GdeltParserTest.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt
2 |
3 | import org.scalatest.Matchers
4 |
5 | import scala.io.Source
6 |
7 | class GdeltParserTest extends SparkSpec with Matchers {
8 |
9 |   // I simply test all my dataframes can be loaded, no exception should be thrown
10 |   sparkTest("loading GDELT universe") { spark =>
11 |     import spark.implicits._
12 |     Source.fromInputStream(this.getClass.getResourceAsStream("gkg.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseGkgV2).show()
13 |     Source.fromInputStream(this.getClass.getResourceAsStream("gkgT.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseGkgV2).show()
14 | 
Source.fromInputStream(this.getClass.getResourceAsStream("gkg1.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseGkgV1).show() 15 | Source.fromInputStream(this.getClass.getResourceAsStream("gkg1Count.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseGkgCountV1).show() 16 | Source.fromInputStream(this.getClass.getResourceAsStream("events.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseEventV2).show() 17 | Source.fromInputStream(this.getClass.getResourceAsStream("events1.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseEventV1).show() 18 | Source.fromInputStream(this.getClass.getResourceAsStream("eventsT.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseEventV2).show() 19 | Source.fromInputStream(this.getClass.getResourceAsStream("mentions.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseMentionV2).show() 20 | Source.fromInputStream(this.getClass.getResourceAsStream("mentionsT.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseMentionV2).show() 21 | Source.fromInputStream(this.getClass.getResourceAsStream("normDaily.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseNormDaily).show() 22 | Source.fromInputStream(this.getClass.getResourceAsStream("normDailyByCountry.csv"), "UTF-8").getLines().toSeq.toDS().map(GdeltParser.parseNormDailyByCountry).show() 23 | } 24 | 25 | // I simply test all my dataframes can be loaded, no exception should be thrown 26 | sparkTest("loading GDELT reference data") { spark => 27 | spark.loadCountryCodes.show() 28 | spark.loadGcams.show() 29 | spark.loadCameoEventCodes.show() 30 | spark.loadCameoTypeCodes.show() 31 | spark.loadCameoGroupCodes.show() 32 | spark.loadCameoEthnicCodes.show() 33 | spark.loadCameoReligionCodes.show() 34 | spark.loadCameoCountryCodes.show() 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/Goose.scala: 
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License. You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 |
19 | package com.gravity.goose
20 |
21 | import network.HtmlFetcher
22 | import java.io.File
23 |
24 | /**
25 |  * Created by Jim Plush - Gravity.com
26 |  * Date: 8/14/11
27 |  */
28 | class Goose(config: Configuration = new Configuration) {
29 |
30 |
31 |   initializeEnvironment()
32 |
33 |   /**
34 |    * Main method to extract an article object from a URL, pass in a url and get back a Article
35 |    * @param url The url that you want to extract
36 |    * @param rawHTML HTML handed to the crawl candidate together with the url
37 |    */
38 |   def extractContent(url: String, rawHTML: String): Article =
39 |     sendToActor(CrawlCandidate(config, url, rawHTML))
40 |
41 |   /** Same as the two-argument form, with a null rawHTML in the crawl candidate. */
42 |   def extractContent(url: String): Article =
43 |     sendToActor(CrawlCandidate(config, url, null))
44 |
45 |   /** Shuts down the connection manager of the HtmlFetcher HTTP client. */
46 |   def shutdownNetwork(): Unit = {
47 |     HtmlFetcher.getHttpClient.getConnectionManager.shutdown()
48 |   }
49 |
50 |   /** Crawls the candidate with a fresh Crawler built from this instance's config. */
51 |   def sendToActor(crawlCandidate: CrawlCandidate): Article =
52 |     new Crawler(config).crawl(crawlCandidate)
53 |
54 |   /**
55 |    * Makes sure config.localStoragePath is an existing, writable directory, creating it if needed.
56 |    * Throws an Exception when it is not.
57 |    */
58 |   def initializeEnvironment(): Unit = {
59 |
60 |     val storageDir = new File(config.localStoragePath)
61 |     try {
62 |       if (!storageDir.isDirectory) storageDir.mkdirs()
63 |     } catch {
64 |       // a failed creation is reported by the checks below
65 |       case _: Exception =>
66 |     }
67 |     if (!storageDir.isDirectory)
68 |       throw new Exception(config.localStoragePath + " directory does not seem to exist, you need to set this for image processing downloads")
69 |     if (!storageDir.canWrite)
70 |       throw new Exception(config.localStoragePath + " directory is not writable, you need to set this for image processing downloads")
71 |
72 |     // todo cleanup any jank that may be in the tmp folder currently
73 |   }
74 |
75 | }
76 |
77 | object Goose {
78 |   implicit val config = new Configuration
79 |   val logPrefix = "goose: "
80 | }
-------------------------------------------------------------------------------- /src/main/resources/com/gravity/goose/statichtml/issue_24_result.txt: -------------------------------------------------------------------------------- 1 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. 
For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin. 2 | 3 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader). The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you. 4 | 5 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day. 6 | 7 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. 
The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle. 
--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/CameoCodes.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt.reference
2 |
3 | import com.aamend.spark.gdelt.CameoCode
4 | import com.aamend.spark.gdelt.T
5 | import org.apache.spark.sql.{Dataset, SparkSession}
6 |
7 | import scala.io.Source
8 |
9 | /**
10 |  * Loaders for the CAMEO reference tables bundled as tab-separated resources next to this class.
11 |  * Every loader skips the first line of its file and builds one CameoCode per remaining line,
12 |  * upper-casing the first column (code) and lower-casing the second (value).
13 |  */
14 | object CameoCodes {
15 |
16 |   def loadEventCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoEvent.txt")
17 |
18 |   def loadTypeCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoType.txt")
19 |
20 |   def loadGroupCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoGroup.txt")
21 |
22 |   def loadEthnicCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoEthnic.txt")
23 |
24 |   def loadReligionCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoReligion.txt")
25 |
26 |   def loadCountryCode(spark: SparkSession): Dataset[CameoCode] = loadCodes(spark, "cameoCountry.txt")
27 |
28 |   /** Reads one tab-separated reference resource into a Dataset and closes the underlying stream. */
29 |   private def loadCodes(spark: SparkSession, resource: String): Dataset[CameoCode] = {
30 |     import spark.implicits._
31 |     val source = Source.fromInputStream(this.getClass.getResourceAsStream(resource), "UTF-8")
32 |     try {
33 |       // toList reads every line now, so nothing is pulled from the stream after it is closed
34 |       source.getLines().toList.drop(1).map(line => {
35 |         val tokens = line.split("\t")
36 |         CameoCode(
37 |           cameoCode = T(()=>tokens(0).toUpperCase()),
38 |           cameoValue = T(()=>tokens(1).toLowerCase())
39 |         )
40 |       }).toDS()
41 |     } finally {
42 |       source.close()
43 |     }
44 |   }
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at
9 |  *
10 |  * http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | package com.gravity.goose.extractors
19 |
20 | import java.sql.Date
21 | import javax.xml.datatype.DatatypeFactory
22 |
23 | import com.gravity.goose.utils.Logging
24 | import org.jsoup.nodes.Element
25 |
26 | /**
27 |  * Implement this class to extract the {@link Date} of when this article was published.
28 |  */
29 | /**
30 |  * Created by IntelliJ IDEA.
31 |  * User: robbie
32 |  * Date: 5/19/11
33 |  * Time: 2:50 PM
34 |  */
35 | abstract class PublishDateExtractor extends Extractor[Date] {
36 |   /**
37 |    * Intended to search the DOM and identify the {@link Date} of when this article was published.
38 |    * This will be called by the {@link com.jimplush.goose.ContentExtractor#extractContent(String)} method and will be passed to {@link com.jimplush.goose.Article#setPublishDate(java.sql.Date)}
39 |    *
40 |    * @param rootElement passed in from the {@link com.jimplush.goose.ContentExtractor} after the article has been parsed
41 |    * @return {@link Date} of when this particular article was published or null if no date could be found.
42 |    */
43 |   def extract(rootElement: Element): Date
44 | }
45 |
46 | object PublishDateExtractor extends Logging {
47 |   val logPrefix = "PublishDateExtractor: "
48 |
49 |   lazy val datatypeFactory: DatatypeFactory = DatatypeFactory.newInstance()
50 |
51 |   /**
52 |    * Returns whichever of two non-null dates is earlier; `rhs` when they are equal.
53 |    */
54 |   def minDate(lhs: java.sql.Date, rhs: java.sql.Date): java.sql.Date =
55 |     if (rhs.getTime <= lhs.getTime) rhs else lhs
56 |
57 |   /**
58 |    * Parses an ISO 8601 date/time string without throwing.
59 |    * Null or empty input gives None; text that cannot be parsed is logged at info level and gives None.
60 |    */
61 |   def safeParseISO8601Date(txt: String): Option[java.sql.Date] = {
62 |     if (txt == null || txt.isEmpty) {
63 |       None
64 |     } else {
65 |       try {
66 |         val millis = datatypeFactory.newXMLGregorianCalendar(txt).toGregorianCalendar.getTime.getTime
67 |         Option(new Date(millis))
68 |       } catch {
69 |         case _: Exception =>
70 |           info(s"`$txt` could not be parsed to date as it did not meet the ISO 8601 spec")
71 |           None
72 |       }
73 |     }
74 |   }
75 | }
76 |
77 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to Gravity.com under one
3 |  * or more contributor license agreements. See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership. Gravity.com licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package com.gravity.goose.images 19 | 20 | import org.jsoup.nodes.{Document, Element} 21 | import com.gravity.goose.utils.{CanLog, Logging} 22 | import org.slf4j.Logger 23 | 24 | /** 25 | * Created by Jim Plush 26 | * User: jim 27 | * Date: 8/18/11 28 | */ 29 | 30 |
/**
 * Describes a downloaded image that has been stored on disk.
 * `height` and `width` default to 0 when not supplied.
 */
case class LocallyStoredImage(imgSrc: String,
                              mimeType: String,
                              base64: String,
                              bytes: Long,
                              height: Int = 0,
                              width: Int = 0)

/**
 * Contract for picking the best image of an article.
 *
 * Every logging member required by [[CanLog]] is forwarded to the companion object,
 * so all implementations log through the companion's single logger.
 */
trait ImageExtractor extends CanLog {

  def getBestImage(doc: Document, topNode: Element): Image

  def logPrefix: String = ImageExtractor.loggingPrefix

  def logger: Logger = ImageExtractor.logger

  def trace(msg: String, refs: Any*): Unit = ImageExtractor.trace(msg, refs: _*)

  def trace(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.trace(t, msg, refs: _*)

  def debug(msg: String, refs: Any*): Unit = ImageExtractor.debug(msg, refs: _*)

  def debug(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.debug(t, msg, refs: _*)

  def info(msg: String, refs: Any*): Unit = ImageExtractor.info(msg, refs: _*)

  def info(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.info(t, msg, refs: _*)

  def warn(msg: String, refs: Any*): Unit = ImageExtractor.warn(msg, refs: _*)

  def warn(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.warn(t, msg, refs: _*)

  def critical(msg: String, refs: Any*): Unit = ImageExtractor.critical(msg, refs: _*)

  def critical(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.critical(t, msg, refs: _*)
}

/** Companion that owns the logger and the log prefix used by the trait above. */
object ImageExtractor extends Logging {
  val loggingPrefix = "images: "
}

 -------------------------------------------------------------------------------- /src/main/resources/com/gravity/goose/statichtml/issue_24.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Paragraph Order Test 5 | 6 | 7 | 8 |
9 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. 10 | 11 | Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. 12 | For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin. 13 |

Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. 14 | The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader). 15 | The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.

16 | 17 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. 18 | In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day. 19 |

Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.

20 |
21 | 22 | -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/utils/URLHelper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package com.gravity.goose.utils 20 | 21 | import com.gravity.goose.text.{StringReplacement, HashUtils} 22 | import java.net.{URI, MalformedURLException, URL} 23 | import org.apache.http.client.methods.HttpGet 24 | 25 | /** 26 | * Created by Jim Plush 27 | * User: jim 28 | * Date: 8/14/11 29 | */ 30 | 31 |
/**
 * A URL that has been validated and is ready to be crawled.
 *
 * @param urlString the URL text after "#!" expansion
 * @param linkhash  MD5 hash of `urlString`
 * @param url       parsed form of `urlString`
 */
case class ParsingCandidate(urlString: String, linkhash: String, url: URL)

/**
 * Helpers that turn raw URL strings into URL / URI / HttpGet objects without throwing.
 * Every public method returns None instead of propagating a parsing failure.
 */
object URLHelper extends Logging {

  private val ESCAPED_FRAGMENT_REPLACEMENT: StringReplacement = StringReplacement.compile("#!", "?_escaped_fragment_=")

  /**
   * Rewrites a "#!" fragment into its "?_escaped_fragment_=" form.
   * Null is passed through untouched so each caller's own parsing step rejects it.
   */
  private def expandEscapedFragment(url: String): String =
    if (url != null && url.contains("#!")) ESCAPED_FRAGMENT_REPLACEMENT.replaceAll(url) else url

  /**
   * Validates `urlToCrawl` and pairs it with its MD5 link hash.
   *
   * @param urlToCrawl raw URL text; null is treated as malformed
   * @return the parsing candidate, or None (after logging) when the URL cannot be processed
   */
  def getCleanedUrl(urlToCrawl: String): Option[ParsingCandidate] = {
    val finalURL = expandEscapedFragment(urlToCrawl)

    try {
      val url = new URL(finalURL)
      val linkhash = HashUtils.md5(finalURL)
      Some(ParsingCandidate(finalURL, linkhash, url))
    } catch {
      case _: MalformedURLException =>
        warn("{0} - is a malformed URL and cannot be processed", urlToCrawl)
        None
      case unknown: Exception =>
        // Same text the deprecated RichException.getStackTraceString produced.
        val eol = System.lineSeparator()
        val stack = unknown.getStackTrace.mkString("", eol, eol)
        critical("Unable to process URL: {0} due to an unexpected exception:\n\tException Type: {1}\n\tException Message: {2}\n\tException Stack:\n{3}",
          urlToCrawl,
          unknown.getClass.getCanonicalName,
          unknown.getMessage,
          stack)
        None
    }
  }

  /**
   * @param url raw URL text
   * @return the parsed URL, or None when `url` is null or malformed
   */
  def tryToURL(url: String): Option[URL] =
    try {
      Some(new URL(expandEscapedFragment(url)))
    } catch {
      case _: Exception => None
    }

  /**
   * @param url raw URL text
   * @return the parsed URI, or None when `url` is null or not a valid URI
   */
  def tryToURI(url: String): Option[URI] =
    try {
      Some(URI.create(expandEscapedFragment(url)))
    } catch {
      case _: Exception => None
    }

  /**
   * @param url raw URL text
   * @return an HTTP GET request for the URL, or None when it cannot be parsed as a URI
   */
  def tryToHttpGet(url: String): Option[HttpGet] =
    tryToURI(url).map(uri => new HttpGet(uri))
}
 -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/utils/Logging.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.gravity.goose.utils 20 | 21 | import org.slf4j._ 22 | import java.text.MessageFormat 23 | 24 | /** 25 | * User: chris bissel 26 | * Date: 1/2/11 27 | * Time: 1:47 PM 28 | */ 29 | 30 | /** 31 | * Trait that enables logging. String formatting is based on the Java MessageFormat object, NOT the 32 | * regular String.format.
See this documentation: 33 | * http://download.oracle.com/javase/1.4.2/docs/api/java/text/MessageFormat.html 34 | * 35 | * The code was initially taken from this location at Stack Overflow: 36 | * From http://stackoverflow.com/questions/978252/logging-in-scala/981942#981942 37 | */
trait Logging extends CanLog {

  val logger: Logger = Logging.getLogger(this)

  /**
   * Expands `msg` as a MessageFormat pattern with `refs` as its arguments.
   *
   * A bad pattern (unbalanced braces or quotes) or an argument of the wrong type makes
   * MessageFormat throw IllegalArgumentException. A log statement must never take down
   * its caller, so in that case the raw pattern is returned with the arguments appended.
   */
  private def formatmsg(msg: String, refs: Seq[Any]): String =
    try {
      new MessageFormat(msg).format(refs.toArray)
    } catch {
      case _: IllegalArgumentException => msg + " " + refs.mkString("[", ", ", "]")
    }

  /** Messages logged without arguments are passed through verbatim (no pattern parsing). */
  private def checkFormat(msg: String, refs: Seq[Any]): String =
    if (refs.nonEmpty) formatmsg(msg, refs) else msg

  // Each method checks the level first so the message is only formatted when it will be emitted.

  def trace(msg: String, refs: Any*): Unit =
    if (logger.isTraceEnabled) logger.trace(checkFormat(msg, refs))

  def trace(t: Throwable, msg: String, refs: Any*): Unit =
    if (logger.isTraceEnabled) logger.trace(checkFormat(msg, refs), t)

  def info(msg: String, refs: Any*): Unit =
    if (logger.isInfoEnabled) logger.info(checkFormat(msg, refs))

  def info(t: Throwable, msg: String, refs: Any*): Unit =
    if (logger.isInfoEnabled) logger.info(checkFormat(msg, refs), t)

  def warn(msg: String, refs: Any*): Unit =
    if (logger.isWarnEnabled) logger.warn(checkFormat(msg, refs))

  def warn(t: Throwable, msg: String, refs: Any*): Unit =
    if (logger.isWarnEnabled) logger.warn(checkFormat(msg, refs), t)

  /** "critical" is reported at the slf4j ERROR level. */
  def critical(msg: String, refs: Any*): Unit =
    if (logger.isErrorEnabled) logger.error(checkFormat(msg, refs))

  def critical(t: Throwable, msg: String, refs: Any*): Unit =
    if (logger.isErrorEnabled) logger.error(checkFormat(msg, refs), t)

  def debug(msg: String, refs: Any*): Unit =
    if (logger.isDebugEnabled) logger.debug(checkFormat(msg, refs))

  def debug(t: Throwable, msg: String, refs: Any*): Unit =
    if (logger.isDebugEnabled) logger.debug(checkFormat(msg, refs), t)

}

/**
 * Note: implementation taken from scalax.logging API
 */
object Logging {

  /** Drops the single trailing "$" that a Scala object's runtime class name carries. */
  def loggerNameForClass(className: String): String = className.stripSuffix("$")

  /** Looks up the slf4j logger named after the runtime class of `logging`. */
  def getLogger(logging: AnyRef): Logger =
    LoggerFactory.getLogger(loggerNameForClass(logging.getClass.getName))
}

/**
 * Minimal logging contract: a logger plus message / throwable+message variants
 * for each of the trace, info, warn, critical and debug levels.
 */
trait CanLog {
  def logger: Logger

  def trace(msg: String, refs: Any*): Unit

  def trace(t: Throwable, msg: String, refs: Any*): Unit

  def info(msg: String, refs: Any*): Unit

  def info(t: Throwable, msg: String, refs: Any*): Unit

  def warn(msg: String, refs: Any*): Unit

  def warn(t: Throwable, msg: String, refs: Any*): Unit

  def critical(msg: String, refs: Any*): Unit

  def critical(t: Throwable, msg: String, refs: Any*): Unit

  def debug(msg: String, refs: Any*): Unit

  def debug(t: Throwable, msg: String, refs: Any*): Unit
}
 -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/Article.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | 19 | package com.gravity.goose 20 | 21 | import java.sql.Date 22 | 23 | import images.Image 24 | import org.jsoup.nodes.{Document, Element} 25 | 26 | import scala.collection._ 27 | 28 | /** 29 | * Created by Jim Plush 30 | * User: jim 31 | * Date: 8/14/11 32 | */ 33 | 34 |
/**
 * Mutable result holder for one extracted article. Every field is a plain `var`;
 * reference fields without a meaningful empty value start out as null.
 */
class Article {

  /** Title of the article; null until set. */
  var title: String = _

  /**
   * The article body as plain text: no HTML, no formatting, paragraphs separated by
   * newlines. This is usually the field consumers want.
   */
  var cleanedArticleText: String = ""

  /** Content of the meta description field in the HTML source. */
  var metaDescription: String = ""

  /** Content of the meta keywords field in the HTML source. */
  var metaKeywords: String = ""

  /** Canonical link of the article, when one is present in the metadata. */
  var canonicalLink: String = ""

  /** Domain of the article being parsed. */
  var domain: String = ""

  /** The element considered the best candidate for the main article body; null until set. */
  var topNode: Element = _

  /** The image considered most representative of the article. */
  var topImage: Image = new Image

  /** Tags that may have appeared in the article (distinct from meta keywords); null until set. */
  var tags: Set[String] = _

  /** Embedded movies found on the page, e.g. youtube or vimeo. */
  var movies: List[Element] = Nil

  /**
   * The URL content is actually fetched from; differs from the starting URL when
   * escaped fragments had to be expanded.
   */
  var finalUrl: String = ""

  /** MD5 hash of the URL, used for identification tasks. */
  var linkhash: String = ""

  /** Raw HTML exactly as received from the network connection. */
  var rawHtml: String = ""

  /** The JSoup document for the article; null until set. */
  var doc: Document = _

  /** JSoup document built from the original HTML with no cleaning applied; null until set. */
  var rawDoc: Document = _

  /** Publish date of the article, when it could be determined; null otherwise. */
  var publishDate: Date = _

  /**
   * Property bucket (name to value) in which consumers of goose can store custom
   * extractions. Populated by an implementation of
   * com.gravity.goose.extractors.AdditionalDataExtractor.
   */
  var additionalData: Map[String, String] = Map.empty
}
 -------------------------------------------------------------------------------- /src/main/resources/com/aamend/spark/gdelt/reference/cameoGroup.txt: -------------------------------------------------------------------------------- 1 | CODE LABEL 2 | AAM Al Aqsa Martyrs Brigade 3 | ABD Arab Bank for Economic Development in Africa 4 | ACC Arab Cooperation Council 5 | ADB Asian Development Bank 6 | AEU Arab Economic Unity Council 7 | AFB African Development Bank 8 | ALQ Al Qaeda 9 | AMF Arab Monetary Fund for Economic and Social Development 10 | AML Amal Militia 11 | AMN Amnesty International 12 | AMU Arab Maghreb Union 13 | ANO Abu Nidal Organization 14 | APE Org.
of Arab Petroleum Exporting Countries (OAPEC) 15 | ARL Arab League 16 | ASL South Lebanon Army 17 | ASN Association of Southeast Asian Nations (ASEAN) 18 | ATD Eastern and Southern African Trade and Development Bank 19 | BCA Bank of Central African States (BEAC) 20 | BIS Bank for International Settlements 21 | BTH Baath Party 22 | CEM Common Market for Eastern and Southern Africa 23 | CEM Monetary and Economic Community of Central Africa 24 | CFA Franc Zone Financial Community of Africa 25 | CIS Commonwealth of Independent States 26 | CMN Communist 27 | COE Council of Europe 28 | CPA Cocoa Producer's Alliance 29 | CPC Association of Coffee Producing Countries 30 | CRC International Fed. of Red Cross and Red Crescent (ICRC) 31 | CSS Community of Sahel-Saharan States (CENSAD) 32 | CWN Commonwealth of Nations 33 | DFL Democratic Front for the Lib. of Palestine (DFLP) 34 | EBR European Bank for Reconstruction and Development 35 | ECA Economic Community of Central African States 36 | EEC European Union 37 | EFT European Free Trade Association 38 | ENN Ennahda Movement 39 | FAO United Nations Food and Agriculture Organization 40 | FID International Federation of Human Rights (FIDH) 41 | FIS Islamic Salvation Army 42 | FLN National Liberation Front (FLN) 43 | FTA Fatah 44 | GCC Gulf Cooperation Council 45 | GIA Armed Islamic Group (GIA) 46 | GOE Group of Eight (G-8) (G-7 plus Russia) 47 | GOS Group of Seven (G-7) 48 | GSP Salafist Group 49 | GSS Group of Seventy-Seven (G-77) 50 | HCH UN High Commission for Human Rights 51 | HCR UN High Commission for Refugees 52 | HEZ Hezbullah 53 | HIP Highly Indebted Poor Countries (HIPC) 54 | HMS Hamas 55 | HRW Human Rights Watch 56 | IAC Inter-African Coffee Organization (IACO) 57 | IAD Intergovernmental Authority on Development (IGAD) 58 | IAE International Atomic Energy Agency (IAEA) 59 | IAF Islamic Action Front 60 | ICC International Criminal Court 61 | ICG International Crisis Group 62 | ICJ International Court of Justice (ICJ) 
63 | ICO International Cocoa Organization (ICCO) 64 | IDB Islamic Development Bank 65 | IGC International Grains Council 66 | IHF International Helsinki Federation for Human Rights 67 | ILO International Labor Organization 68 | IMF International Monetary Fund (IMF) 69 | IOM International Organization for Migration 70 | IPU Inter-Parliamentary Union 71 | IRC Red Cross 72 | ISJ Palestinian Islamic Jihad 73 | ITP Interpol 74 | JUR International Commission of Jurists 75 | KDP Kurdish Democratic Party (KDP) 76 | KID United Nations Children's Fund (UNICEF) 77 | LBA Israeli Labor Party 78 | LKD Likud Party 79 | MBR Muslim Brotherhood 80 | MRZ Meretz Party 81 | MSF Medecins Sans Frontieres (Doctors Without Borders) 82 | MSP Movement of the Society for Peace 83 | NAT North Atlantic Treaty Organization (NATO) 84 | NEP New Economic Partnership for Africa's Development 85 | NON Organization of Non-Aligned Countries 86 | OAS Organization of American States 87 | OAU Organization of African Unity (OAU) 88 | OIC Organization of Islamic Conferences (OIC) 89 | OPC Organization of Petroleum Exporting Countries (OPEC) 90 | PAP Pan-African Parliament 91 | PFL People's Front for the Liberation of Palestine (PFLP) 92 | PLF Palestine Liberation Front 93 | PLO Palestine Liberation Organization 94 | PLS Polisario Guerillas 95 | PMD People's Mujahedeen 96 | PRC Paris Club 97 | PSE Occupied Palestinian Territories 98 | RCR Red Crescent 99 | RND Democratic National Rally 100 | SAA South Asian Association 101 | SAD Southern African Development Community 102 | SCE Council of Security and Cooperation in Europe (OSCE) 103 | SHA Shas Party 104 | SOT Southeast Asia Collective Defense Treaty (SEATO) 105 | TAL Taliban 106 | UEM Economic and Monetary Union of West Africa (UEMOA) 107 | UNO United Nations 108 | WAD West Africa Development Bank 109 | WAM West Africa Monetary and Economic Union 110 | WAS Economic Community of West African States (ECOWAS) 111 | WBK World Bank 112 | WCT International War
Crimes Tribunals 113 | WEF World Economic Forum 114 | WFP World Food Program 115 | WHO World Health Organization 116 | WTO World Trade Organization 117 | WTO World Trade Organization (WTO) 118 | XFM Oxfam -------------------------------------------------------------------------------- /src/main/resources/com/aamend/spark/gdelt/reference/cameoCountry.txt: -------------------------------------------------------------------------------- 1 | CODE LABEL 2 | WSB West Bank 3 | BAG Baghdad 4 | GZS Gaza Strip 5 | AFR Africa 6 | ASA Asia 7 | BLK Balkans 8 | CRB Caribbean 9 | CAU Caucasus 10 | CFR Central Africa 11 | CAS Central Asia 12 | CEU Central Europe 13 | EIN East Indies 14 | EAF Eastern Africa 15 | EEU Eastern Europe 16 | EUR Europe 17 | LAM Latin America 18 | MEA Middle East 19 | MDT Mediterranean 20 | NAF North Africa 21 | NMR North America 22 | PGS Persian Gulf 23 | SCN Scandinavia 24 | SAM South America 25 | SAS South Asia 26 | SEA Southeast Asia 27 | SAF Southern Africa 28 | WAF West Africa 29 | WST The West 30 | AFG Afghanistan 31 | ALA Aland Islands 32 | ALB Albania 33 | DZA Algeria 34 | ASM American Samoa 35 | AND Andorra 36 | AGO Angola 37 | AIA Anguilla 38 | ATG Antigua and Barbuda 39 | ARG Argentina 40 | ARM Armenia 41 | ABW Aruba 42 | AUS Australia 43 | AUT Austria 44 | AZE Azerbaijan 45 | BHS Bahamas 46 | BHR Bahrain 47 | BGD Bangladesh 48 | BRB Barbados 49 | BLR Belarus 50 | BEL Belgium 51 | BLZ Belize 52 | BEN Benin 53 | BMU Bermuda 54 | BTN Bhutan 55 | BOL Bolivia 56 | BIH Bosnia and Herzegovina 57 | BWA Botswana 58 | BRA Brazil 59 | VGB British Virgin Islands 60 | BRN Brunei Darussalam 61 | BGR Bulgaria 62 | BFA Burkina Faso 63 | BDI Burundi 64 | KHM Cambodia 65 | CMR Cameroon 66 | CAN Canada 67 | CPV Cape Verde 68 | CYM Cayman Islands 69 | CAF Central African Republic 70 | TCD Chad 71 | CHL Chile 72 | CHN China 73 | COL Colombia 74 | COM Comoros 75 | COD Democratic Republic of the Congo 76 | COG People's Republic of the Congo 77 | COK Cook Islands 78
| CRI Costa Rica 79 | CIV Ivory Coast 80 | HRV Croatia 81 | CUB Cuba 82 | CYP Cyprus 83 | CZE Czech Republic 84 | DNK Denmark 85 | DJI Djibouti 86 | DMA Dominica 87 | DOM Dominican Republic 88 | TMP East Timor 89 | ECU Ecuador 90 | EGY Egypt 91 | SLV El Salvador 92 | GNQ Equatorial Guinea 93 | ERI Eritrea 94 | EST Estonia 95 | ETH Ethiopia 96 | FRO Faeroe Islands 97 | FLK Falkland Islands 98 | FJI Fiji 99 | FIN Finland 100 | FRA France 101 | GUF French Guiana 102 | PYF French Polynesia 103 | GAB Gabon 104 | GMB Gambia 105 | GEO Georgia 106 | DEU Germany 107 | GHA Ghana 108 | GIB Gibraltar 109 | GRC Greece 110 | GRL Greenland 111 | GRD Grenada 112 | GLP Guadeloupe 113 | GUM Guam 114 | GTM Guatemala 115 | GIN Guinea 116 | GNB Guinea-Bissau 117 | GUY Guyana 118 | HTI Haiti 119 | VAT Vatican City 120 | HND Honduras 121 | HKG Hong Kong 122 | HUN Hungary 123 | ISL Iceland 124 | IND India 125 | IDN Indonesia 126 | IRN Iran 127 | IRQ Iraq 128 | IRL Ireland 129 | IMY Isle of Man 130 | ISR Israel 131 | ITA Italy 132 | JAM Jamaica 133 | JPN Japan 134 | JOR Jordan 135 | KAZ Kazakhstan 136 | KEN Kenya 137 | KIR Kiribati 138 | PRK North Korea 139 | KOR South Korea 140 | KWT Kuwait 141 | KGZ Kyrgyzstan 142 | LAO Laos 143 | LVA Latvia 144 | LBN Lebanon 145 | LSO Lesotho 146 | LBR Liberia 147 | LBY Libya 148 | LIE Liechtenstein 149 | LTU Lithuania 150 | LUX Luxembourg 151 | MAC Macao 152 | MKD Macedonia 153 | MDG Madagascar 154 | MWI Malawi 155 | MYS Malaysia 156 | MDV Maldives 157 | MLI Mali 158 | MLT Malta 159 | MHL Marshall Islands 160 | MTQ Martinique 161 | MRT Mauritania 162 | MUS Mauritius 163 | MYT Mayotte 164 | MEX Mexico 165 | FSM Micronesia 166 | MDA Moldova 167 | MCO Monaco 168 | MNG Mongolia 169 | MTN Montenegro 170 | MSR Montserrat 171 | MAR Morocco 172 | MOZ Mozambique 173 | MMR Myanmar 174 | NAM Namibia 175 | NRU Nauru 176 | NPL Nepal 177 | NLD Netherlands 178 | ANT Netherlands Antilles 179 | NCL New Caledonia 180 | NZL New Zealand 181 | NIC Nicaragua 182 | NER Niger 
183 | NGA Nigeria 184 | NIU Niue 185 | NFK Norfolk Island 186 | MNP Northern Mariana Islands 187 | NOR Norway 188 | PSE Occupied Palestinian Territory 189 | OMN Oman 190 | PAK Pakistan 191 | PLW Palau 192 | PAN Panama 193 | PNG Papua New Guinea 194 | PRY Paraguay 195 | PER Peru 196 | PHL Philippines 197 | PCN Pitcairn 198 | POL Poland 199 | PRT Portugal 200 | PRI Puerto Rico 201 | QAT Qatar 202 | REU Reunion 203 | ROM Romania 204 | RUS Russia 205 | RWA Rwanda 206 | SHN Saint Helena 207 | KNA Saint Kitts-Nevis 208 | LCA Saint Lucia 209 | SPM Saint Pierre and Miquelon 210 | VCT Saint Vincent and the Grenadines 211 | WSM Samoa 212 | SMR San Marino 213 | STP Sao Tome and Principe 214 | SAU Saudi Arabia 215 | SEN Senegal 216 | SRB Serbia 217 | SYC Seychelles 218 | SLE Sierra Leone 219 | SGP Singapore 220 | SVK Slovakia 221 | SVN Slovenia 222 | SLB Solomon Islands 223 | SOM Somalia 224 | ZAF South Africa 225 | ESP Spain 226 | LKA Sri Lanka 227 | SDN Sudan 228 | SUR Suriname 229 | SJM Svalbard and Jan Mayen Islands 230 | SWZ Swaziland 231 | SWE Sweden 232 | CHE Switzerland 233 | SYR Syria 234 | TWN Taiwan 235 | TJK Tajikistan 236 | TZA Tanzania 237 | THA Thailand 238 | TGO Togo 239 | TKL Tokelau 240 | TON Tonga 241 | TTO Trinidad and Tobago 242 | TUN Tunisia 243 | TUR Turkey 244 | TKM Turkmenistan 245 | TCA Turks and Caicos Islands 246 | TUV Tuvalu 247 | UGA Uganda 248 | UKR Ukraine 249 | ARE United Arab Emirates 250 | GBR United Kingdom 251 | USA United States 252 | VIR United States Virgin Islands 253 | URY Uruguay 254 | UZB Uzbekistan 255 | VUT Vanuatu 256 | VEN Venezuela 257 | VNM Vietnam 258 | WLF Wallis and Futuna Islands 259 | ESH Western Sahara 260 | YEM Yemen 261 | ZMB Zambia 262 | ZWE Zimbabwe -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/text/ReplaceSequence.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3
| * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.gravity.goose.text 20 | 21 | import org.slf4j.Logger 22 | import org.slf4j.LoggerFactory 23 | import java.util.ArrayList 24 | import java.util.List 25 | import java.util.regex.Matcher 26 | import java.util.regex.Pattern 27 | 28 | /** 29 | * Created by IntelliJ IDEA. 30 | * User: robbie 31 | * Date: 5/13/11 32 | * Time: 12:03 AM 33 | */ 34 | /** 35 | * Wraps the usage of making multiple string replacements in an ordered sequence. 36 | * For Example... instead of doing this over and over:

37 | *
38 | *
 39 | *     String text = "   Some example text     ";
 40 | *     text = text.{@link String#replaceAll(String, String) replaceAll}("e", "E");
 41 | *     text = text.{@link String#replaceAll(String, String) replaceAll}(" ", "_");
 42 | *     text = text.{@link String#replaceAll(String, String) replaceAll}("^\\s+$", "");
 43 | *   
44 | *
45 | * You can use a ReplaceSequence like this:

46 | *
47 | *
 48 | *     static final ReplaceSequence betterReplacements = ReplaceSequence.{@link #create(String, String) create}("e", "E").{@link #append(String, String) append}(" ", "_").{@link #append(String) append}("^\\s+$");
 49 | *
 50 | *     String fixMyString(String text) {
 51 | *       return betterReplacements.{@link #replaceAll(String) replaceAll}(text);
 52 | *     }
 53 | *   
54 | *
55 | * 56 | * Internally, an ordered list of {@link Matcher}s and its associated replacement is built as the {@link #append} method is called.
57 | * Each matcher is {@link Matcher#reset(CharSequence) reset} with the input specified in the {@link #replaceAll(String)} method.

58 | * Use of this class can improve performance if the sequence of replacements is intended to be used repeatedly throughout the life of an application.
59 | * This is due to the fact that each {@link Pattern} is only compiled once and each {@link Matcher} is only generated once. 60 | */
object ReplaceSequence {
  /**
   * Creates a new ReplaceSequence whose first pattern is replaced with an empty String.
   * @param firstPattern The regex {@link Pattern pattern} string for the first replacement
   * @return a new instance
   */
  def create(firstPattern: String): ReplaceSequence =
    create(firstPattern, string.empty)

  /**
   * Creates a new ReplaceSequence whose first pattern is replaced with <code>replaceWith</code>.
   * @param firstPattern The regex {@link Pattern pattern} {@link String} for the first replacement
   * @param replaceWith The {@link String} to replace matches of the specified pattern
   * @return a new instance
   */
  def create(firstPattern: String, replaceWith: String): ReplaceSequence =
    new ReplaceSequence(StringReplacement.compile(firstPattern, replaceWith))
}

class ReplaceSequence {

  /** The replacements in the order they are applied by {@link #replaceAll(String)}. */
  var replacements: List[StringReplacement] = new ArrayList[StringReplacement]

  /** Used by the companion's <code>create</code> methods to seed the sequence with its first replacement. */
  private def this(pair: StringReplacement) = {
    this()
    replacements.add(pair)
  }

  /**
   * Appends a new pattern, replaced with an empty String, to this instance in a builder pattern
   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
   * @return this instance of itself for use in a builder pattern
   */
  def append(pattern: String): ReplaceSequence =
    append(pattern, string.empty)

  /**
   * Appends a new pattern to this instance in a builder pattern
   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
   * @param replaceWith The {@link String} to replace matches of the specified pattern
   * @return this instance of itself for use in a builder pattern
   */
  def append(pattern: String, replaceWith: String): ReplaceSequence = {
    replacements.add(StringReplacement.compile(pattern, replaceWith))
    this
  }

  /**
   * Applies, in insertion order, the replacement given at creation and every one added via {@link #append(String)}
   * @param input the {@link String} to apply all of the replacements to
   * @return the resulting {@link String} after all replacements have been applied; the empty string when input is null or empty
   */
  def replaceAll(input: String): String = {
    if (string.isNullOrEmpty(input)) return string.empty

    // Walk the java.util.List with its own iterator rather than relying on the
    // deprecated implicit scala.collection.JavaConversions wrappers.
    var mutatedString = input
    val remaining = replacements.iterator()
    while (remaining.hasNext) {
      mutatedString = remaining.next().replaceAll(mutatedString)
    }
    mutatedString
  }
}



 -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/spark/GooseFetcher.scala: -------------------------------------------------------------------------------- 1 | package com.gravity.goose.spark 2 | 3 | import com.gravity.goose.{Configuration, Goose} 4 | import org.apache.spark.ml.Transformer 5 | import org.apache.spark.ml.param._ 6 | import org.apache.spark.ml.util._ 7 | import org.apache.spark.sql.types._ 8 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 9 | 10 | trait GooseFetcherParams extends Params with DefaultParamsWritable { 11 | val annotators = new Param[Map[String, String]](this, "annotators", s"The list of annotators [${ANNOTATORS.mkString(",")}]") 12 | val urlColumn = new Param[String](this, "urlColumn", "The input column containing URLs") 13 | val userAgent = new Param[String](this, "userAgent", "User agent that is sent with your web requests to extract URL content") 14 | val socketTimeout = new Param[Int](this, "socketTimeout", "Socket timeout (ms)") 15 | val connectionTimeout = new Param[Int](this, "connectionTimeout", "Connection timeout (ms)") 16 | val
enableImageFetching = new Param[Boolean](this, "enableImageFetching", "(Experimental) Fetching image header as base64")
}

/**
 * Spark ML transformer that scrapes the article behind each URL of an input column and
 * appends one output column per configured annotator.
 *
 * The `annotators` param maps an annotator name (one of ANNOTATORS) to the name of the
 * output column that receives its value.
 *
 * @param uid unique identifier of this pipeline stage
 */
class GooseFetcher(override val uid: String) extends Transformer with GooseFetcherParams {

  /** Creates a fetcher with a random uid prefixed with "goose". */
  def this() = this(Identifiable.randomUID("goose"))

  // By default every annotator is enabled and written to a column named after itself
  setDefault(annotators -> ANNOTATORS.zip(ANNOTATORS).toMap)
  setDefault(userAgent -> "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")
  setDefault(socketTimeout -> 10000)
  setDefault(connectionTimeout -> 10000)
  setDefault(enableImageFetching -> false)
  setDefault(urlColumn -> "url")

  /**
   * Sets the annotator-name to output-column mapping.
   * @param value non-empty map whose keys are all in ANNOTATORS and whose values are distinct
   * @throws IllegalArgumentException when one of those requirements is not met
   */
  def setAnnotators(value: Map[String, String]): this.type = {
    require(value.nonEmpty, "At least one annotator must be provided")
    require(value.values.toSet.size == value.keys.size, "Annotator fields must be unique")
    value.keys.foreach(annotator => require(ANNOTATORS.contains(annotator), s"Annotator [$annotator] is not valid, supported are [${ANNOTATORS.mkString(",")}]"))
    set(annotators, value)
  }

  /** Sets the user agent handed to the Goose configuration. */
  def setUserAgent(value: String): this.type = set(userAgent, value)

  /** Sets the socket timeout (ms) handed to the Goose configuration. */
  def setSocketTimeout(value: Int): this.type = set(socketTimeout, value)

  /** Sets the connection timeout (ms) handed to the Goose configuration. */
  def setConnectionTimeout(value: Int): this.type = set(connectionTimeout, value)

  /** Enables or disables image fetching in the Goose configuration. */
  def setEnableImageFetching(value: Boolean): this.type = set(enableImageFetching, value)

  /** Sets the name of the input column that holds the URLs. */
  def setUrlColumn(value: String): this.type = set(urlColumn, value)

  /**
   * Scrapes every distinct URL of the input once and joins the extracted annotator columns
   * back onto the input on the URL column.
   *
   * @param origDS input dataset; must contain the URL column as a string and must not
   *               already contain any of the annotator output columns
   * @return the annotator columns joined with the input dataset
   * @throws IllegalArgumentException when the schema requirements above are not met
   */
  override def transform(origDS: Dataset[_]): DataFrame = {

    // Resolve every param once, on the driver. Calling $(...) inside the mapPartitions closure
    // would capture `this` and ship the whole transformer to every task.
    val urlCol = $(urlColumn)
    val annotatorMap = $(annotators)
    val annotatorKeys = annotatorMap.keys.toSeq // same iteration order as transformSchema uses
    val fetchImages = $(enableImageFetching)
    val agent = $(userAgent)
    val socketTimeoutMs = $(socketTimeout)
    val connectionTimeoutMs = $(connectionTimeout)

    // Make sure the URL field exist
    require(origDS.schema.exists(s => s.name == urlCol && s.dataType == StringType), "Field [" + urlCol + "] is not valid")

    // Make sure annotators field do not exist
    annotatorMap.values.foreach(annotator => {
      require(!origDS.schema.exists(s => s.name == annotator), s"Annotator field [$annotator] already exist")
    })

    // This intermediate dataset to make sure we don't scrape more than once a same URL
    val urlDF = origDS.select(urlCol).dropDuplicates(urlCol)
    val contentSchema = transformSchema(urlDF.schema)

    // Append URL dataframe with article annotators
    val urlContentRDD = urlDF.rdd.mapPartitions(rows => {

      // Initialize Goose only once for each partition
      val conf = new Configuration()
      conf.setEnableImageFetching(fetchImages)
      conf.setBrowserUserAgent(agent)
      conf.setSocketTimeout(socketTimeoutMs)
      conf.setConnectionTimeout(connectionTimeoutMs)
      val goose = new Goose(conf)

      // Scrape each URL individually
      val articles = scrapeArticles(rows.map(_.getAs[String](urlCol)), goose)

      // Convert articles as Row: the URL first, then one value per annotator in schema order
      articles.map(article => {
        val appended: Seq[Any] = annotatorKeys.map {
          case ANNOTATOR_TITLE => article.title.getOrElse("")
          case ANNOTATOR_DESCRIPTION => article.description.getOrElse("")
          case ANNOTATOR_CONTENT => article.content.getOrElse("")
          case ANNOTATOR_KEYWORDS => article.keywords
          case ANNOTATOR_PUBLISH_DATE => article.publishDate.orNull
        }
        Row.fromSeq(Seq(article.url) ++ appended)
      })
    })

    // Transform RDD of Row to Dataframe
    val contentDF = origDS.sqlContext.createDataFrame(urlContentRDD, contentSchema)

    // Join articles back to any duplicate URL dataset
    contentDF.join(origDS, List(urlCol))
  }

  /**
   * Appends one field per configured annotator to the given schema. Title, description and
   * content are non-nullable strings, keywords a non-nullable array of strings, and the
   * publish date a nullable date.
   */
  override def transformSchema(schema: StructType): StructType = {
    val annotatorFields = $(annotators).map { case (key, value) =>
      key match {
        case ANNOTATOR_TITLE => StructField(value, StringType, nullable = false)
        case ANNOTATOR_DESCRIPTION => StructField(value, StringType, nullable = false)
        case ANNOTATOR_CONTENT => StructField(value, StringType, nullable = false)
        case ANNOTATOR_KEYWORDS => StructField(value, ArrayType(StringType), nullable = false)
        case ANNOTATOR_PUBLISH_DATE => StructField(value, DateType, nullable = true)
      }
    }
    StructType(schema.seq ++ annotatorFields)
  }

  /** Copies this stage together with the extra params, using the default copy mechanism. */
  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }
}

/** Companion that makes a persisted GooseFetcher loadable from a path. */
object GooseFetcher extends DefaultParamsReadable[GooseFetcher] {
  override def load(path: String): GooseFetcher = super.load(path)
}

--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Configuration.scala:
--------------------------------------------------------------------------------
/**
 * Licensed to Gravity.com under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Gravity.com licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package com.gravity.goose

import network.{HtmlFetcher, AbstractHtmlFetcher}
import org.jsoup.nodes.Element
import scala.beans.BeanProperty
import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor}


/**
 * Created by Jim Plush
 * User: jim
 * Date: 8/16/11
 */


/**
 * Mutable settings shared by the crawler and its extractors: storage paths, image limits,
 * network timeouts, user agent and the pluggable extractor / fetcher implementations.
 */
class Configuration {

  /**
   * this is the local storage path used to place images to inspect them, should be writable
   */
  @BeanProperty
  var localStoragePath: String = "/tmp/goose"

  /**
   * Minimum size in bytes for an image to be accepted; smaller images (for example the
   * author's little picture at the beginning of an article) are filtered out
   */
  @BeanProperty
  var minBytesForImages: Int = 4500

  /**
   * set this to false if you don't care about getting images, otherwise you can either use the default
   * image extractor or implement the ImageExtractor interface to build your own
   */
  @BeanProperty
  var enableImageFetching: Boolean = true

  /**
   * path to your imagemagick convert executable, on the mac using mac ports this is the default listed
   */
  @BeanProperty
  var imagemagickConvertPath: String = "/opt/local/bin/convert"

  /**
   * path to your imagemagick identify executable
   */
  @BeanProperty
  var imagemagickIdentifyPath: String = "/opt/local/bin/identify"

  /** connection timeout; GooseFetcher documents the value it passes here as milliseconds */
  @BeanProperty
  var connectionTimeout: Int = 10000

  /** socket timeout; GooseFetcher documents the value it passes here as milliseconds */
  @BeanProperty
  var socketTimeout: Int = 10000

  /**
   * used as the user agent that is sent with your web requests to extract an article
   */
  @BeanProperty
  var browserUserAgent: String = "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8"

  var contentExtractor: ContentExtractor = StandardContentExtractor

  /**
   * Default publish-date extractor: reads the `content` attribute of the article
   * published / modified meta tags.
   */
  var publishDateExtractor: PublishDateExtractor = new PublishDateExtractor {
    import PublishDateExtractor._

    /**
     * Parses the `content` attribute of every element matching the selector.
     * Any exception raised while selecting or parsing yields an empty result.
     */
    def extractCandidate(rootElement: Element, selector: String): Seq[java.sql.Date] = {
      import scala.collection.JavaConversions._

      try {
        rootElement.select(selector).flatMap(item => safeParseISO8601Date(item.attr("content")))
      }
      catch {
        case e: Exception =>
          Nil
      }
    }

    final val pubSelectors = Seq(
      "meta[property~=article:published_time]"
    )

    final val modSelectors = Seq(
      "meta[property~=article:modified_time]",
      "meta[property~=og:updated_time]"
    )

    def extract(rootElement: Element): java.sql.Date = {
      // A few different ways to get a date. Declared as defs so the 'modified' lookup only
      // runs when no 'published' date was found.
      def bestPubDate = pubSelectors.flatMap(extractCandidate(rootElement, _)).reduceOption(minDate)
      def bestModDate = modSelectors.flatMap(extractCandidate(rootElement, _)).reduceOption(minDate)

      // Return the oldest 'published' date, or else the oldest 'modified' date, or null if none.
      bestPubDate.orElse(bestModDate).orNull
    }
  }

  var additionalDataExtractor: AdditionalDataExtractor = new AdditionalDataExtractor

  def getPublishDateExtractor: PublishDateExtractor = {
    publishDateExtractor
  }

  /**
   * Pass in to replace the extractor that locates the article content.
   * @param extractor a concrete instance of {@link ContentExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setContentExtractor(extractor: ContentExtractor): Unit = {
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    contentExtractor = extractor
  }

  /**
   * Pass in to extract article publish dates.
   * @param extractor a concrete instance of {@link PublishDateExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setPublishDateExtractor(extractor: PublishDateExtractor): Unit = {
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    this.publishDateExtractor = extractor
  }

  def getAdditionalDataExtractor: AdditionalDataExtractor = {
    additionalDataExtractor
  }

  /**
   * Pass in to extract any additional data not defined within {@link Article}
   * @param extractor a concrete instance of {@link AdditionalDataExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setAdditionalDataExtractor(extractor: AdditionalDataExtractor): Unit = {
    // Enforce the documented contract, like the sibling setters do: the crawler calls
    // extract on this instance unconditionally, so a null would only fail later, mid-crawl.
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    this.additionalDataExtractor = extractor
  }

  var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher

  /**
   * Pass in to replace the component that downloads the raw HTML.
   * @param fetcher a concrete instance of {@link AbstractHtmlFetcher}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setHtmlFetcher(fetcher: AbstractHtmlFetcher): Unit = {
    require(fetcher != null, "fetcher MUST NOT be null!")
    this.htmlFetcher = fetcher
  }

  def getHtmlFetcher: AbstractHtmlFetcher = htmlFetcher

}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Crawler.scala:
--------------------------------------------------------------------------------
/**
 * Licensed to Gravity.com under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Gravity.com licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.gravity.goose

import cleaners.{StandardDocumentCleaner, DocumentCleaner}
import extractors.ContentExtractor
import images.{Image, UpgradedImageIExtractor, ImageExtractor}
import org.apache.http.client.HttpClient
import org.jsoup.nodes.{Document, Element}
import org.jsoup.Jsoup
import java.io.File
import utils.{ParsingCandidate, URLHelper, Logging}
import com.gravity.goose.outputformatters.{StandardOutputFormatter, OutputFormatter}

/**
 * Created by Jim Plush
 * User: jim
 * Date: 8/18/11
 */

/**
 * One unit of work for the crawler.
 * @param config the configuration travelling with the candidate
 * @param url the url of the article
 * @param rawHTML the page markup when the caller already has it; null means "download it"
 */
case class CrawlCandidate(config: Configuration, url: String, rawHTML: String = null)

/**
 * Turns a CrawlCandidate into an Article: resolves the url, obtains and parses the HTML,
 * runs the configured extractors and formats the article text.
 */
class Crawler(config: Configuration) {

  import Crawler._

  /**
   * Crawls a single candidate.
   *
   * @return the populated article; when the url cannot be cleaned, the HTML cannot be
   *         obtained or the document cannot be parsed, a freshly constructed Article on
   *         which none of the fields below have been set
   */
  def crawl(crawlCandidate: CrawlCandidate): Article = {
    val article = new Article()
    for {
      parseCandidate <- URLHelper.getCleanedUrl(crawlCandidate.url)
      rawHtml <- getHTML(crawlCandidate, parseCandidate)
      doc <- getDocument(parseCandidate.url.toString, rawHtml)
    } {
      trace("Crawling url: " + parseCandidate.url)

      val extractor = getExtractor
      val docCleaner = getDocCleaner
      val outputFormatter = getOutputFormatter

      article.finalUrl = parseCandidate.url.toString
      article.domain = parseCandidate.url.getHost
      article.linkhash = parseCandidate.linkhash
      article.rawHtml = rawHtml
      article.doc = doc
      // keep an untouched copy: article.doc is replaced by the cleaned document below
      article.rawDoc = doc.clone()

      article.title = extractor.getTitle(article)
      article.publishDate = config.publishDateExtractor.extract(doc)
      article.additionalData = config.getAdditionalDataExtractor.extract(doc)
      article.metaDescription = extractor.getMetaDescription(article)
      article.metaKeywords = extractor.getMetaKeywords(article)
      article.canonicalLink = extractor.getCanonicalLink(article)
      article.tags = extractor.extractTags(article)

      // before we do any calcs on the body itself let's clean up the document
      article.doc = docCleaner.clean(article)

      extractor.calculateBestNodeBasedOnClustering(article) match {
        case Some(node: Element) =>
          article.topNode = node
          article.movies = extractor.extractVideos(article.topNode)

          if (config.enableImageFetching) {
            trace(logPrefix + "Image fetching enabled...")
            val imageExtractor = getImageExtractor(article)
            try {
              if (article.rawDoc == null) {
                article.topImage = new Image
              } else {
                article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode)
              }
            } catch {
              // a failing image lookup must not lose the rest of the article
              case e: Exception => warn(e, e.toString)
            }
          }
          article.topNode = extractor.postExtractionCleanup(article.topNode)

          article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)
        case _ => trace("NO ARTICLE FOUND")
      }
      releaseResources(article)
    }

    article
  }

  /**
   * Returns the candidate's own raw HTML when it carries one, otherwise asks the
   * configured fetcher to download the page.
   */
  def getHTML(crawlCandidate: CrawlCandidate, parsingCandidate: ParsingCandidate): Option[String] = {
    if (crawlCandidate.rawHTML != null) {
      Some(crawlCandidate.rawHTML)
    } else {
      config.getHtmlFetcher.getHtml(config, parsingCandidate.url.toString) match {
        case Some(html) =>
          Some(html)
        case _ => None
      }
    }
  }


  /** Builds an image extractor that reuses the fetcher's HTTP client. */
  def getImageExtractor(article: Article): ImageExtractor = {
    val httpClient: HttpClient = config.getHtmlFetcher.getHttpClient
    new UpgradedImageIExtractor(httpClient, article, config)
  }

  def getOutputFormatter: OutputFormatter = {
    StandardOutputFormatter
  }

  def getDocCleaner: DocumentCleaner = {
    new StandardDocumentCleaner
  }

  /**
   * Parses the raw HTML with Jsoup.
   * @return the parsed document, or None when parsing throws
   */
  def getDocument(url: String, rawlHtml: String): Option[Document] = {

    try {
      Some(Jsoup.parse(rawlHtml))
    } catch {
      case e: Exception => {
        trace("Unable to parse " + url + " properly into JSoup Doc")
        None
      }
    }
  }

  def getExtractor: ContentExtractor = {
    config.contentExtractor
  }

  /**
   * cleans up any temp files we have laying around like temp images
   * removes any image in the temp dir that starts with the linkhash of the url we just parsed
   */
  def releaseResources(article: Article): Unit = {
    trace(logPrefix + "STARTING TO RELEASE ALL RESOURCES")

    val dir: File = new File(config.localStoragePath)

    // File.list returns null (not an empty array) when the path is missing, is not a
    // directory or cannot be read; treat that as "nothing to clean" instead of failing the crawl
    val filenames = Option(dir.list).getOrElse(Array.empty[String])

    filenames.foreach(filename => {
      if (filename.startsWith(article.linkhash)) {
        val f: File = new File(dir.getAbsolutePath + "/" + filename)
        if (!f.delete) {
          warn("Unable to remove temp file: " + filename)
        }
      }
    })
  }

}

object Crawler extends Logging {
  val logPrefix = "crawler: "
}
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/text/stopwords-en.txt:
--------------------------------------------------------------------------------
1 | a's 2 | able 3 | about 4 | above 5 | according 6 | accordingly 7 | across 8 | actually 9 | after 10 | afterwards 11 | again 12 | against 13 | ain't 14 | all 15 | allow 16 | allows 17 | almost 18 | alone 19 | along 20 | already 21 | also 22 | although 23 | always 24 | am 25 | among 26 | amongst 27 | an 28 | and 29 | another 30 | any 31 | anybody 32 | anyhow 33 | anyone 34 | anything 35 | anyway 36 | anyways 37 | anywhere 38 | apart 39 | appear 40 | appreciate
41 | appropriate 42 | are 43 | aren't 44 | around 45 | as 46 | aside 47 | ask 48 | asking 49 | associated 50 | at 51 | available 52 | away 53 | awfully 54 | be 55 | became 56 | because 57 | become 58 | becomes 59 | becoming 60 | been 61 | before 62 | beforehand 63 | behind 64 | being 65 | believe 66 | below 67 | beside 68 | besides 69 | best 70 | better 71 | between 72 | beyond 73 | both 74 | brief 75 | but 76 | by 77 | c 78 | c'mon 79 | c's 80 | came 81 | campaign 82 | can 83 | can't 84 | cannot 85 | cant 86 | cause 87 | causes 88 | certain 89 | certainly 90 | changes 91 | clearly 92 | co 93 | com 94 | come 95 | comes 96 | concerning 97 | consequently 98 | consider 99 | considering 100 | contain 101 | containing 102 | contains 103 | corresponding 104 | could 105 | couldn't 106 | course 107 | currently 108 | definitely 109 | described 110 | despite 111 | did 112 | didn't 113 | different 114 | do 115 | does 116 | doesn't 117 | doing 118 | don't 119 | done 120 | down 121 | downwards 122 | during 123 | each 124 | edu 125 | eight 126 | either 127 | else 128 | elsewhere 129 | enough 130 | endorsed 131 | entirely 132 | especially 133 | et 134 | etc 135 | even 136 | ever 137 | every 138 | everybody 139 | everyone 140 | everything 141 | everywhere 142 | ex 143 | exactly 144 | example 145 | except 146 | far 147 | few 148 | fifth 149 | first 150 | financial 151 | five 152 | followed 153 | following 154 | follows 155 | for 156 | former 157 | formerly 158 | forth 159 | four 160 | from 161 | further 162 | furthermore 163 | get 164 | gets 165 | getting 166 | given 167 | gives 168 | go 169 | goes 170 | going 171 | gone 172 | got 173 | gotten 174 | greetings 175 | had 176 | hadn't 177 | happens 178 | hardly 179 | has 180 | hasn't 181 | have 182 | haven't 183 | having 184 | he 185 | he's 186 | hello 187 | help 188 | hence 189 | her 190 | here 191 | here's 192 | hereafter 193 | hereby 194 | herein 195 | hereupon 196 | hers 197 | herself 198 | hi 199 | him 200 | himself 201 | his 202 
| hither 203 | hopefully 204 | how 205 | howbeit 206 | however 207 | i'd 208 | i'll 209 | i'm 210 | i've 211 | if 212 | ignored 213 | immediate 214 | in 215 | inasmuch 216 | inc 217 | indeed 218 | indicate 219 | indicated 220 | indicates 221 | inner 222 | insofar 223 | instead 224 | into 225 | inward 226 | is 227 | isn't 228 | it 229 | it'd 230 | it'll 231 | it's 232 | its 233 | itself 234 | just 235 | keep 236 | keeps 237 | kept 238 | know 239 | knows 240 | known 241 | last 242 | lately 243 | later 244 | latter 245 | latterly 246 | least 247 | less 248 | lest 249 | let 250 | let's 251 | like 252 | liked 253 | likely 254 | little 255 | look 256 | looking 257 | looks 258 | ltd 259 | mainly 260 | many 261 | may 262 | maybe 263 | me 264 | mean 265 | meanwhile 266 | merely 267 | might 268 | more 269 | moreover 270 | most 271 | mostly 272 | much 273 | must 274 | my 275 | myself 276 | name 277 | namely 278 | nd 279 | near 280 | nearly 281 | necessary 282 | need 283 | needs 284 | neither 285 | never 286 | nevertheless 287 | new 288 | next 289 | nine 290 | no 291 | nobody 292 | non 293 | none 294 | noone 295 | nor 296 | normally 297 | not 298 | nothing 299 | novel 300 | now 301 | nowhere 302 | obviously 303 | of 304 | off 305 | often 306 | oh 307 | ok 308 | okay 309 | old 310 | on 311 | once 312 | one 313 | ones 314 | only 315 | onto 316 | or 317 | other 318 | others 319 | otherwise 320 | ought 321 | our 322 | ours 323 | ourselves 324 | out 325 | outside 326 | over 327 | overall 328 | own 329 | particular 330 | particularly 331 | per 332 | perhaps 333 | placed 334 | please 335 | plus 336 | possible 337 | presumably 338 | probably 339 | provides 340 | quite 341 | quote 342 | quarterly 343 | rather 344 | really 345 | reasonably 346 | regarding 347 | regardless 348 | regards 349 | relatively 350 | respectively 351 | right 352 | said 353 | same 354 | saw 355 | say 356 | saying 357 | says 358 | second 359 | secondly 360 | see 361 | seeing 362 | seem 363 | seemed 364 | seeming 
365 | seems 366 | seen 367 | self 368 | selves 369 | sensible 370 | sent 371 | serious 372 | seriously 373 | seven 374 | several 375 | shall 376 | she 377 | should 378 | shouldn't 379 | since 380 | six 381 | so 382 | some 383 | somebody 384 | somehow 385 | someone 386 | something 387 | sometime 388 | sometimes 389 | somewhat 390 | somewhere 391 | soon 392 | sorry 393 | specified 394 | specify 395 | specifying 396 | still 397 | sub 398 | such 399 | sup 400 | sure 401 | t's 402 | take 403 | taken 404 | tell 405 | tends 406 | than 407 | thank 408 | thanks 409 | thanx 410 | that 411 | that's 412 | thats 413 | the 414 | their 415 | theirs 416 | them 417 | themselves 418 | then 419 | thence 420 | there 421 | there's 422 | thereafter 423 | thereby 424 | therefore 425 | therein 426 | theres 427 | thereupon 428 | these 429 | they 430 | they'd 431 | they'll 432 | they're 433 | they've 434 | think 435 | third 436 | this 437 | thorough 438 | thoroughly 439 | those 440 | though 441 | three 442 | through 443 | throughout 444 | thru 445 | thus 446 | to 447 | together 448 | too 449 | took 450 | toward 451 | towards 452 | tried 453 | tries 454 | truly 455 | try 456 | trying 457 | twice 458 | two 459 | under 460 | unfortunately 461 | unless 462 | unlikely 463 | until 464 | unto 465 | up 466 | upon 467 | us 468 | use 469 | used 470 | useful 471 | uses 472 | using 473 | usually 474 | uucp 475 | value 476 | various 477 | very 478 | via 479 | viz 480 | vs 481 | want 482 | wants 483 | was 484 | wasn't 485 | way 486 | we 487 | we'd 488 | we'll 489 | we're 490 | we've 491 | welcome 492 | well 493 | went 494 | were 495 | weren't 496 | what 497 | what's 498 | whatever 499 | when 500 | whence 501 | whenever 502 | where 503 | where's 504 | whereafter 505 | whereas 506 | whereby 507 | wherein 508 | whereupon 509 | wherever 510 | whether 511 | which 512 | while 513 | whither 514 | who 515 | who's 516 | whoever 517 | whole 518 | whom 519 | whose 520 | why 521 | will 522 | willing 523 | wish 524 | 
with 525 | within 526 | without 527 | won't 528 | wonder 529 | would 530 | would 531 | wouldn't 532 | yes 533 | yet 534 | you 535 | you'd 536 | you'll 537 | you're 538 | you've 539 | your 540 | yours 541 | yourself 542 | yourselves 543 | zero 544 | official 545 | sharply 546 | criticized -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/images/ImageSaver.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
*/
package com.gravity.goose.images

/**
 * Created by Jim Plush
 * User: jim
 * Date: 8/18/11
 */

import org.apache.commons.io.IOUtils
import org.apache.http.HttpEntity
import org.apache.http.client.ClientProtocolException
import org.apache.http.client.HttpClient
import org.apache.http.client.methods.HttpGet
import org.apache.http.client.protocol.ClientContext
import org.apache.http.protocol.BasicHttpContext
import org.apache.http.protocol.HttpContext
import java.io._
import java.util.Random
import com.gravity.goose.utils.Logging
import com.gravity.goose.Configuration
import com.gravity.goose.network.HtmlFetcher

/**
 * This class will be responsible for storing images to disk
 *
 * @author Jim Plush
 */
object ImageSaver extends Logging {

  /**
   * Maps the mime type reported for a stored image to a file extension.
   *
   * @return ".jpg" or ".png"; the empty string when the lookup raised a FileNotFoundException
   * @throws SecretGifException when the image is reported as a GIF
   * @throws IOException for any other mime type, or when the lookup itself fails with an IOException
   */
  private def getFileExtension(config: Configuration, fileName: String): String = {
    try {
      val imageDims: ImageDetails = ImageUtils.getImageDimensions(config.imagemagickIdentifyPath, fileName)
      val mimeType: String = imageDims.getMimeType
      mimeType match {
        case "GIF" =>
          if (logger.isDebugEnabled) {
            logger.debug("SNEAKY GIF! " + fileName)
          }
          throw new SecretGifException
        case "JPEG" => ".jpg"
        case "PNG" => ".png"
        case _ =>
          throw new IOException("BAD MIME TYPE: " + mimeType + " FILENAME:" + fileName)
      }
    }
    catch {
      case e: SecretGifException =>
        throw e
      case e: FileNotFoundException =>
        // logged only: the caller sees the empty extension and gives up on this image
        logger.error(e.getMessage)
        ""
      case e: IOException =>
        logger.error(e.getMessage)
        throw e
    }
  }

  /**
   * Issues a GET for the image with an empty cookie store.
   *
   * @return the response entity when the status code is 200, otherwise None
   */
  def fetchEntity(httpClient: HttpClient, imageSrc: String): Option[HttpEntity] = {

    val localContext: HttpContext = new BasicHttpContext
    localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
    val httpget = new HttpGet(imageSrc)
    val response = httpClient.execute(httpget, localContext)
    // compare the numeric code: a substring search for "200" on the whole status line
    // would also accept any line whose reason phrase happens to contain "200"
    if (response.getStatusLine.getStatusCode != 200) {
      // nobody will consume this response, so release its connection right away
      httpget.abort()
      None
    } else {
      try {
        Some(response.getEntity)
      } catch {
        case e: Exception => warn(e, e.toString); None
      } finally {
        // NOTE(review): the request is aborted before the caller reads the entity content;
        // confirm the entity is still readable afterwards. Kept as is to preserve behavior.
        httpget.abort()
      }
    }
  }


  /**
   * Writes the entity content to `localStoragePath/linkhash_random`, then appends the
   * extension matching the detected image type.
   *
   * @return the final path of the stored image, or null when the extension is unknown, the
   *         file is smaller than `config.minBytesForImages`, or the file cannot be renamed
   * @throws SecretGifException when the stored image turns out to be a GIF
   */
  def copyInputStreamToLocalImage(entity: HttpEntity, linkhash: String, config: Configuration): String = {
    val generator: Random = new Random
    val randInt: Int = generator.nextInt
    val localSrcPath = config.localStoragePath + "/" + linkhash + "_" + randInt

    // Copy first and close both streams before inspecting or renaming the file. Each stream
    // has its own finally, so a failure to open or close one can no longer leak the other.
    val instream: InputStream = entity.getContent
    try {
      val outstream: OutputStream = new FileOutputStream(localSrcPath)
      try {
        trace("Storing image locally: " + localSrcPath)
        IOUtils.copy(instream, outstream)
      } finally {
        outstream.close()
      }
    } finally {
      instream.close()
    }

    val fileExtension = ImageSaver.getFileExtension(config, localSrcPath)
    val f: File = new File(localSrcPath)
    if (fileExtension == "" || fileExtension == null) {
      trace("EMPTY FILE EXTENSION: " + localSrcPath)
      null
    } else if (f.length < config.minBytesForImages) {
      if (logger.isDebugEnabled) {
        logger.debug("TOO SMALL AN IMAGE: " + localSrcPath + " bytes: " + f.length)
      }
      null
    } else {
      val newFilename = localSrcPath + fileExtension
      if (f.renameTo(new File(newFilename))) {
        trace("Image successfully Written to Disk")
        newFilename
      } else {
        // never hand back a path that does not exist
        warn("Unable to rename temp image " + localSrcPath + " to " + newFilename)
        null
      }
    }
  }

  /**
   * stores an image to disk and returns the path where the file was written
   *
   * @return the local path of the stored image, or null when the image could not be
   *         downloaded or stored (the failure is logged, never thrown)
   */
  def storeTempImage(httpClient: HttpClient, linkhash: String, imageSrcMaster: String, config: Configuration): String = {
    try {
      val imageSrc = imageSrcMaster.replace(" ", "%20")
      trace("Starting to download image: " + imageSrc)

      fetchEntity(httpClient, imageSrc) match {
        case Some(entity) =>
          try {
            copyInputStreamToLocalImage(entity, linkhash, config)
          }
          catch {
            case e: SecretGifException =>
              throw e
            case e: Exception =>
              logger.error(e.getMessage)
              null
          }
        case None =>
          trace("Unable to get entity for: " + imageSrc)
          null
      }
    }
    catch {
      case e: IllegalArgumentException =>
        logger.warn(e.getMessage)
        null
      case e: SecretGifException =>
        raise(e)
        null
      case e: ClientProtocolException =>
        logger.error(e.toString)
        null
      case e: IOException =>
        logger.error(e.toString)
        null
      case e: Exception =>
        e.printStackTrace()
        logger.error(e.toString)
        null
    }
  }

  /**
   * Intentionally empty: despite its name, a GIF detected while storing an image is dropped
   * silently and storeTempImage returns null.
   * NOTE(review): confirm callers do not expect SecretGifException to propagate.
   */
  private def raise(e: SecretGifException): Unit = {
  }


}


--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala:
--------------------------------------------------------------------------------
/**
 * Licensed to Gravity.com under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Gravity.com licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.gravity.goose.outputformatters

import org.jsoup.nodes._
import org.apache.commons.lang.StringEscapeUtils
import org.jsoup.select.Elements
import com.gravity.goose.text.StopWords
import scala.collection.JavaConversions._
import org.slf4j.Logger

/**
 * Created by Jim Plush
 * User: jim
 * Date: 8/17/11
 */

/**
 * Turns the extracted top node of an article into plain text: low-scored nodes are dropped,
 * links and inline formatting tags are flattened to text, and paragraphs with very few stop
 * words are removed. Implementors only have to supply a logger.
 */
trait OutputFormatter {
  val logPrefix = "outformat: "

  // used to remove tags within tags
  val tagReplace = "<[^>]+>".r

  def logger: Logger

  /** Null-safe `select`: a null top node yields an empty element list. */
  private def selectElements(query: String, topNode: Element): Elements = topNode match {
    case null => new Elements(List.empty[Element])
    case n => n.select(query)
  }

  /**
   * Deprecated: use {@link #getFormattedText(Element)}.
   *
   * Applies the same clean-up steps as getFormattedText but returns the mutated element
   * instead of its text.
   *
   * @param topNode the top most node to format
   * @return the prepared Element (the same instance that was passed in)
   */
  @Deprecated def getFormattedElement(topNode: Element): Element = {
    removeNodesWithNegativeScores(topNode)
    convertLinksToText(topNode)
    replaceTagsWithText(topNode)
    removeParagraphsWithFewWords(topNode)
    topNode
  }

  /**
   * Removes all unnecessary elements and formats the selected text nodes.
   *
   * @param topNode the top most node to format; it is modified in place
   * @return a formatted string with all HTML removed
   */
  def getFormattedText(topNode: Element): String = {
    removeNodesWithNegativeScores(topNode)
    convertLinksToText(topNode)
    replaceTagsWithText(topNode)
    removeParagraphsWithFewWords(topNode)
    convertToText(topNode)
  }

  /**
   * Joins the HTML-unescaped, trimmed text of every direct child of the node with a blank line
   * ("\n\n"). Callers normally want {@link #getFormattedText(Element)}, which cleans the node first.
   *
   * @return the joined text, or an empty string for a null node
   */
  def convertToText(topNode: Element): String = topNode match {
    case null => ""
    case node =>
      node.children()
        .map((e: Element) => StringEscapeUtils.unescapeHtml(e.text).trim)
        .toList
        .mkString("\n\n")
  }

  /**
   * Replaces every link that does not wrap an image with a plain text node holding the link text.
   */
  private def convertLinksToText(topNode: Element): Unit = {
    if (topNode != null) {
      logger.trace(logPrefix + "Turning links to text")
      val baseUri = topNode.baseUri()

      val links = topNode.getElementsByTag("a")
      for (item <- links) {
        // links that contain an image are kept as they are
        if (item.getElementsByTag("img").isEmpty) {
          val tn = new TextNode(item.text, baseUri)
          item.replaceWith(tn)
        }
      }
    }
  }

  /**
   * if there are elements inside our top node that have a gravity score below 1 (a score that
   * cannot be parsed counts as 0), let's give em the boot
   */
  private def removeNodesWithNegativeScores(topNode: Element): Unit = {
    def tryInt(text: String): Int = try {
      Integer.parseInt(text)
    } catch {
      case _: Exception => 0
    }

    val gravityItems = selectElements("*[gravityScore]", topNode)
    for (item <- gravityItems) {
      val score = tryInt(item.attr("gravityScore"))
      if (score < 1) {
        item.remove()
      }
    }
  }

  /**
   * replace common tags with just text so we don't have any crazy formatting issues
   * so replace <b>, <strong> and <i> with whatever text is inside them
   */
  private def replaceTagsWithText(topNode: Element): Unit = {
    if (topNode != null) {
      val baseUri = topNode.baseUri()
      // tags are handled one after another, each lookup seeing the replacements made for the previous tag
      for (tag <- Seq("b", "strong", "i")) {
        for (item <- topNode.getElementsByTag(tag)) {
          val tn = new TextNode(getTagCleanedText(item), baseUri)
          item.replaceWith(tn)
        }
      }
    }
  }

  /**
   * Concatenates the text of the node's children (child elements contribute their outer HTML)
   * and then strips anything that looks like a tag from the result.
   */
  private def getTagCleanedText(item: Node): String = {
    val sb = new StringBuilder()

    item.childNodes().foreach {
      case childText: TextNode =>
        sb.append(childText.getWholeText)
      case childElement: Element =>
        sb.append(childElement.outerHtml())
      case _ =>
    }

    tagReplace.replaceAllIn(sb.toString(), "")
  }

  /**
   * remove paragraphs that have less than x number of words, would indicate that it's some sort of link
   */
  private def removeParagraphsWithFewWords(topNode: Element): Unit = {
    if (topNode != null) {
      if (logger.isDebugEnabled) {
        logger.debug("removeParagraphsWithFewWords starting...")
      }

      val allNodes = topNode.getAllElements

      for (el <- allNodes) {
        try {
          val stopWords = StopWords.getStopWordCount(el.text)
          // elements holding embedded media are kept regardless of their word count
          if (stopWords.getStopWordCount < 3 && el.getElementsByTag("object").size == 0 && el.getElementsByTag("embed").size == 0) {
            if (logger.isDebugEnabled) {
              logger.debug("removeParagraphsWithFewWords - swcnt: %d removing text: %s".format(stopWords.getStopWordCount, el.text()))
            }
            el.remove()
          }
        }
        catch {
          case e: IllegalArgumentException =>
            logger.error(e.getMessage)
        }
      }

      Option(topNode.getElementsByTag("p").first()).foreach { firstModdedNode =>
        // check for open parens as the first paragraph, e.g. businessweek4.txt (IT)
        val trimmed = firstModdedNode.text().trim()
        if (trimmed.startsWith("(") && trimmed.endsWith(")")) {
          logger.trace("Removing parenthesis paragraph that is first paragraph")
          firstModdedNode.remove()
        }
      }
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/ContentFetcher.scala:
--------------------------------------------------------------------------------
package com.aamend.spark.gdelt

import java.io.File

import com.gravity.goose.{Configuration, Goose}
import org.apache.commons.lang.StringUtils
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.util.Try

/**
 * Parameters of the [[ContentFetcher]] transformer. An output column whose name is left empty
 * is not produced.
 */
trait ContentFetcherParams extends Params with DefaultParamsWritable {
  val inputColumn = new Param[String](this, "inputColumn", "(MANDATORY) The input column containing URLs")
  val outputContentColumn = new Param[String](this, "outputContentColumn", "(OPTIONAL) Field that will contain HTML content")
  val outputTitleColumn = new Param[String](this, "outputTitleColumn", "(OPTIONAL) Field that will contain HTML title")
  val outputDescriptionColumn = new Param[String](this, "outputDescriptionColumn", "(OPTIONAL) Field that will contain HTML description")
  val outputKeywordsColumn = new Param[String](this, "outputKeywordsColumn", "(OPTIONAL) Field that will contain HTML keywords")
  val outputPublishDateColumn = new Param[String](this, "outputPublishDateColumn", "(OPTIONAL) Field that will contain HTML publishDate")
  val outputImageUrlColumn = new Param[String](this, "outputImageUrlColumn", "(OPTIONAL) Field that will contain HTML image header URL")
  val outputImageBase64Column = new Param[String](this, "outputImageBase64Column", "(OPTIONAL) Field that will contain HTML image header in Base64")
  val userAgent = new Param[String](this, "userAgent", "(OPTIONAL) User agent that is sent with your web requests to extract URL content")
  val socketTimeout = new Param[Int](this, "socketTimeout", "(OPTIONAL) Socket timeout (ms)")
  val connectionTimeout = new Param[Int](this, "connectionTimeout", "(OPTIONAL) Connection timeout (ms)")
  val imagemagickConvert = new Param[String](this, "imagemagickConvert", "(OPTIONAL) imagemagick convert executable")
  val imagemagickIdentify = new Param[String](this, "imagemagickIdentify", "(OPTIONAL) imagemagick identify executable")
}

/**
 * Spark ML transformer that scrapes the URLs found in the input column with Goose and appends
 * the requested article attributes (title, content, keywords, ...) as new columns.
 */
class ContentFetcher(override val uid: String) extends Transformer with ContentFetcherParams {

  def setImagemagickConvert(value: String): this.type = set(imagemagickConvert, value)

  setDefault(imagemagickConvert -> "/usr/local/bin/convert")

  def setImagemagickIdentify(value: String): this.type = set(imagemagickIdentify, value)

  setDefault(imagemagickIdentify -> "/usr/local/bin/identify")

  def setUserAgent(value: String): this.type = set(userAgent, value)

  setDefault(userAgent -> "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")

  def setSocketTimeout(value: Int): this.type = set(socketTimeout, value)

  setDefault(socketTimeout -> 10000)

  def setConnectionTimeout(value: Int): this.type = set(connectionTimeout, value)

  setDefault(connectionTimeout -> 10000)

  def setInputCol(value: String): this.type = set(inputColumn, value)

  setDefault(inputColumn -> "sourceURL")

  def setOutputContentCol(value: String): this.type = set(outputContentColumn, value)

  setDefault(outputContentColumn -> "")

  def setOutputTitleCol(value: String): this.type = set(outputTitleColumn, value)

  setDefault(outputTitleColumn -> "")

  def setOutputDescriptionCol(value: String): this.type = set(outputDescriptionColumn, value)

  setDefault(outputDescriptionColumn -> "")

  def setOutputKeywordsCol(value: String): this.type = set(outputKeywordsColumn, value)

  setDefault(outputKeywordsColumn -> "")

  def setOutputPublishDateCol(value: String): this.type = set(outputPublishDateColumn, value)

  setDefault(outputPublishDateColumn -> "")

  def setOutputImageUrlCol(value: String): this.type = set(outputImageUrlColumn, value)

  setDefault(outputImageUrlColumn -> "")

  def setOutputImageBase64Col(value: String): this.type = set(outputImageBase64Column, value)

  setDefault(outputImageBase64Column -> "")

  def this() = this(Identifiable.randomUID("com/gravity/goose"))

  /**
   * Scrapes every distinct URL of the input column once and joins the resulting article
   * attributes back onto the original dataset.
   *
   * @param origDS dataset holding a string column named after `inputColumn`
   * @return the original rows joined with one new column per configured output field
   * @throws IllegalArgumentException when the input column is missing or not a string, when no
   *                                  output column is configured, when an output column already
   *                                  exists, or when image output is requested with an empty
   *                                  imagemagick path
   */
  override def transform(origDS: Dataset[_]): DataFrame = {

    val outputFields = loadOutputFields()

    // Every parameter is read here, on the driver, into a plain local value so that the
    // mapPartitions closure below does not have to go through `$(...)` (and hence through this
    // transformer instance) on the executors.
    val inputCol = $(inputColumn)
    val fetchImages = StringUtils.isNotEmpty($(outputImageUrlColumn)) || StringUtils.isNotEmpty($(outputImageBase64Column))
    val convertPath = $(imagemagickConvert)
    val identifyPath = $(imagemagickIdentify)
    val browserUserAgent = $(userAgent)
    val socketTimeoutMs = $(socketTimeout)
    val connectionTimeoutMs = $(connectionTimeout)

    // Make sure the URL field exist
    require(origDS.schema.exists(s => s.name == inputCol && s.dataType == StringType), "Field [" + inputCol + "] is not valid")

    // Make sure at least one output field is specified
    require(outputFields.nonEmpty, "At least one output field should be specified")

    // Make sure each specified output field does not exist
    outputFields.values.foreach(outputField => require(!origDS.schema.exists(_.name == outputField), s"Field [$outputField] already exist"))

    // This intermediate dataset to make sure we don't scrape more than once a same URL
    val urlDF = origDS.select(inputCol).dropDuplicates(inputCol)

    // If image fetching is enabled, we need the paths to imagemagick identify and convert.
    // Only non-emptiness is checked: the former `Try(new File(path)).isSuccess` could never fail
    // for a non-empty string, and the paths are handed to Goose inside mapPartitions, so a
    // driver-side existence check would not necessarily be meaningful.
    if (fetchImages) {
      require(StringUtils.isNotEmpty(convertPath), "imagemagick convert executable needs to be specified for Image fetching")
      require(StringUtils.isNotEmpty(identifyPath), "imagemagick identify executable needs to be specified for Image fetching")
    }

    // Append URL dataframe with article annotators
    val urlContentRDD = urlDF.rdd.mapPartitions(rows => {

      // Initialize Goose only once for each partition
      val conf = new Configuration()
      if (fetchImages) {
        conf.setEnableImageFetching(true)
        conf.setImagemagickConvertPath(convertPath)
        conf.setImagemagickIdentifyPath(identifyPath)
      } else {
        conf.setEnableImageFetching(false)
      }
      conf.setBrowserUserAgent(browserUserAgent)
      conf.setSocketTimeout(socketTimeoutMs)
      conf.setConnectionTimeout(connectionTimeoutMs)
      val goose = new Goose(conf)

      // Scrape each URL individually
      val articles = scrapeContent(rows.map(_.getAs[String](inputCol)), goose)

      // Convert articles as Row; the value order follows the iteration order of outputFields,
      // which is also what transformSchema uses to lay out the columns
      articles.map(article => {
        val appended: Seq[Any] = outputFields.map { case (key, _) =>
          key match {
            case ANNOTATOR_TITLE => article.title.getOrElse("")
            case ANNOTATOR_DESCRIPTION => article.description.getOrElse("")
            case ANNOTATOR_CONTENT => article.content.getOrElse("")
            case ANNOTATOR_KEYWORDS => article.keywords
            case ANNOTATOR_PUBLISH_DATE => article.publishDate.orNull
            case ANNOTATOR_IMAGE_URL => article.imageURL.getOrElse("")
            case ANNOTATOR_IMAGE_BASE64 => article.imageBase64.getOrElse("")
          }
        }.toSeq
        Row.fromSeq(Seq(article.url) ++ appended)
      })
    })

    // Transform RDD of Row to Dataframe
    val contentDF = origDS.sqlContext.createDataFrame(urlContentRDD, transformSchema(urlDF.schema))

    // Join articles back to any duplicate URL dataset
    contentDF.join(origDS, List(inputCol))

  }

  /**
   * Maps each annotator key to its configured output column name, keeping only the annotators
   * for which a non-empty column name was set.
   */
  private def loadOutputFields(): Map[String, String] = {
    Map(
      ANNOTATOR_TITLE -> $(outputTitleColumn),
      ANNOTATOR_DESCRIPTION -> $(outputDescriptionColumn),
      ANNOTATOR_CONTENT -> $(outputContentColumn),
      ANNOTATOR_KEYWORDS -> $(outputKeywordsColumn),
      ANNOTATOR_PUBLISH_DATE -> $(outputPublishDateColumn),
      ANNOTATOR_IMAGE_BASE64 -> $(outputImageBase64Column),
      ANNOTATOR_IMAGE_URL -> $(outputImageUrlColumn)
    ).filter(s => StringUtils.isNotEmpty(s._2))
  }

  /**
   * Appends one field per configured output column to the given schema.
   */
  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.seq ++ loadOutputFields().map { case (key, value) =>
        key match {
          case ANNOTATOR_TITLE => StructField(value, StringType, nullable = false)
          case ANNOTATOR_DESCRIPTION => StructField(value, StringType, nullable = false)
          case ANNOTATOR_CONTENT => StructField(value, StringType, nullable = false)
          case ANNOTATOR_KEYWORDS => StructField(value, ArrayType.apply(StringType), nullable = false)
          case ANNOTATOR_PUBLISH_DATE => StructField(value, DateType, nullable = true)
          case ANNOTATOR_IMAGE_URL => StructField(value, StringType, nullable = true)
          case ANNOTATOR_IMAGE_BASE64 => StructField(value, StringType, nullable = true)
        }
      }
    )
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }
}

object ContentFetcher extends DefaultParamsReadable[ContentFetcher] {
  override def load(path: String): ContentFetcher = super.load(path)
}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageUtils.scala:
--------------------------------------------------------------------------------
/**
 * Licensed to Gravity.com under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Gravity.com licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.gravity.goose.images

/**
 * Created by Jim Plush
 * User: jim
 * Date: 8/18/11
 */

import javax.imageio.ImageIO
import java.awt.color.CMMException
import java.awt.image.BufferedImage

import com.gravity.goose.utils.{Logging, URLHelper}
import org.apache.http.client.HttpClient
import org.apache.http.HttpEntity
import org.apache.http.protocol.{BasicHttpContext, HttpContext}
import org.apache.http.client.protocol.ClientContext
import java.util.Base64
import java.io._
import java.util

import com.gravity.goose.Configuration
import com.gravity.goose.text.{HashUtils, string}
import org.apache.http.util.EntityUtils
import org.apache.commons.io.IOUtils
import com.gravity.goose.network.{HtmlFetcher, ImageFetchException}

import scala.util.Try
import scala.util.matching.Regex

/**
 * Helpers for downloading article images to local storage and reading their basic properties
 * (mime type, dimensions, Base64 content).
 */
object ImageUtils extends Logging {
  val spaceRegex: Regex = " ".r
  val xRegex: Regex = "x".r

  /**
   * Gets the image dimensions for an image file by running the given ImageMagick `identify`
   * program on it, because the built-in Java imaging APIs are unreliable for mime types and
   * file info of jpg and png files.
   *
   * Only the first output line is used; it is split on spaces, the second token is taken as
   * the mime type and the third as `<width>x<height>`. Missing or unparsable dimensions are
   * reported as 0.
   *
   * @param identifyProgram path of the identify executable
   * @param filePath        path of the image whose dimensions are wanted
   * @return the mime type, width and height that could be parsed
   * @throws IOException when the program produced no output or reported a missing decode delegate
   */
  def getImageDimensions(identifyProgram: String, filePath: String): ImageDetails = {
    val imageInfo = execToString(Array(identifyProgram, filePath))
    val imageDetails: ImageDetails = new ImageDetails
    if (imageInfo == null || imageInfo.contains("no decode delegate for this image format")) {
      throw new IOException("Unable to get Image Information (no decode delegate) for: " + filePath + "\n\tcommand '" + identifyProgram + " " + filePath + "' returned: " + imageInfo)
    }
    val infoParts = spaceRegex.split(imageInfo)
    val mimeType = infoParts.lift(1).getOrElse(string.empty)
    val (width, height) = infoParts.lift(2) match {
      case Some(dimensions) =>
        val pair = xRegex.split(dimensions)
        if (pair.length > 1) {
          (string.tryToInt(pair(0)).getOrElse(0), string.tryToInt(pair(1)).getOrElse(0))
        } else {
          (0, 0)
        }
      case None => (0, 0)
    }
    imageDetails.setMimeType(mimeType)
    imageDetails.setWidth(width)
    imageDetails.setHeight(height)
    imageDetails
  }

  /**
   * Reads the whole file and returns its content Base64-encoded.
   *
   * The stream is read until end-of-file (a single `read` call is not guaranteed to fill the
   * buffer) and is always closed.
   */
  def readImageBase64(file: File): String = {
    val input = new FileInputStream(file)
    try {
      Base64.getEncoder.encodeToString(IOUtils.toByteArray(input))
    } finally {
      input.close()
    }
  }

  /**
   * gets the image dimensions for an image file, pass in the path to the image who's dimensions you want to get, uses the built in java commands
   *
   * @return a map with the keys "height" and "width"
   * @throws IOException when the file cannot be read or decoded as an image
   */
  def getImageDimensionsJava(filePath: String): util.HashMap[String, Integer] = {
    var image: BufferedImage = null
    try {
      val f: File = new File(filePath)
      image = ImageIO.read(f)
      // ImageIO.read returns null when no registered reader can decode the file
      if (image == null) {
        throw new IOException("Unable to read file: " + filePath)
      }
      val results: util.HashMap[String, Integer] = new util.HashMap[String, Integer]
      results.put("height", image.getHeight)
      results.put("width", image.getWidth)
      results
    }
    catch {
      case e: CMMException =>
        logger.error("ERROR READING FILE: " + filePath + " \n", e)
        throw new IOException("Unable to read file: " + filePath)
    }
    finally {
      if (image != null) {
        try {
          image.flush()
        }
        catch {
          case _: Exception =>
        }
      }
    }
  }

  /**
   * Runs the command, reads the first line of its standard output, waits for the process to
   * finish and returns that line.
   *
   * @param command the program followed by its arguments
   * @return the first output line; null when the process printed nothing or an I/O error
   *         occurred (the error is logged)
   * @throws RuntimeException when the thread is interrupted while waiting for the process
   */
  private def execToString(command: Array[String]): String = {
    var p: Process = null
    var in: BufferedReader = null
    try {
      p = Runtime.getRuntime.exec(command)
      in = new BufferedReader(new InputStreamReader(p.getInputStream))
      val line: String = in.readLine
      p.waitFor
      line
    }
    catch {
      case e: IOException =>
        logger.error(e.toString, e)
        null
      case e: InterruptedException =>
        logger.error(e.toString, e)
        // keep the interrupt visible to callers further up the stack
        Thread.currentThread().interrupt()
        throw new RuntimeException(e)
    }
    finally {
      if (in != null) {
        try {
          in.close()
        }
        catch {
          case _: IOException =>
        }
      }
      if (p != null) {
        p.destroy()
      }
    }
  }

  /**
   * Writes an image src http string to disk as a temporary file and returns the LocallyStoredImage object that has the info you should need
   * on the image. A copy already present on disk is reused. Any failure is logged and reported as None.
   */
  def storeImageToLocalFile(httpClient: HttpClient, linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {

    try {
      // check for a cache hit already on disk
      readExistingFileInfo(linkhash, imageSrc, config) match {
        case Some(locallyStoredImage) =>
          trace("Image already cached on disk: " + imageSrc)
          return Some(locallyStoredImage)
        case None =>
      }

      trace("Not found locally...starting to download image: " + imageSrc)
      fetchEntity(httpClient, imageSrc, config) match {
        case Some(entity) =>
          trace("Got entity for " + imageSrc)
          writeEntityContentsToDisk(entity, linkhash, imageSrc, config) match {
            case Some(locallyStoredImage) => trace("Img Write successfull to disk"); Some(locallyStoredImage)
            case None => trace("Unable to write contents to disk: " + imageSrc); None
          }
        case None => trace("Unable to fetch entity for: " + imageSrc); None
      }
    } catch {
      case e: Exception =>
        info(e, e.toString)
        None
    }

  }


  /**
   * Looks up the locally stored copy of the image and, when it exists, returns its details
   * (mime type, Base64 content, size and dimensions). Returns None when the file is missing or
   * cannot be inspected.
   */
  def readExistingFileInfo(linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {
    val localImageName = getLocalFileName(linkhash, imageSrc, config)
    val imageFile = new File(localImageName)
    if (imageFile.exists()) {
      try {
        trace("Reading image from disk: " + localImageName)
        val imageDetails = getImageDimensions(config.imagemagickIdentifyPath, localImageName)
        val mimeType = imageDetails.getMimeType.toLowerCase
        val base64 = readImageBase64(imageFile)
        Some(LocallyStoredImage(imageSrc, mimeType, base64, imageFile.length(), imageDetails.getHeight, imageDetails.getWidth))
      } catch {
        case e: Exception =>
          trace(e, "Unable to get image file dimensions & extension name!")
          None
      }
    } else {
      None
    }

  }

  /**
   * Copies the entity content to the local file for this image and returns the details of the
   * stored file. Both streams are closed independently of each other, whatever happens.
   */
  def writeEntityContentsToDisk(entity: HttpEntity, linkhash: String, imageSrc: String, config: Configuration): Option[LocallyStoredImage] = {

    val localSrcPath = getLocalFileName(linkhash, imageSrc, config)
    // the content stream is obtained first so that a failure here does not leave an open
    // (and empty) output file behind
    val instream: InputStream = entity.getContent
    try {
      val outstream: OutputStream = new FileOutputStream(localSrcPath)
      trace("Content Length: " + entity.getContentLength)
      try {
        val fileCopyBytes = IOUtils.copy(instream, outstream)
        trace(fileCopyBytes + " bytes copied to disk")
      } finally {
        Try(outstream.flush())
        Try(outstream.close())
      }
    } finally {
      Try(instream.close())
    }
    EntityUtils.consume(entity)
    trace("Content Length: " + entity.getContentLength)
    readExistingFileInfo(linkhash, imageSrc, config)

  }

  /** Local path of an image: `<localStoragePath>/<linkhash>_<md5 of imageSrc>`. */
  def getLocalFileName(linkhash: String, imageSrc: String, config: Configuration): String = {
    val imageHash = HashUtils.md5(imageSrc)
    config.localStoragePath + "/" + linkhash + "_" + imageHash
  }


  def cleanImageSrcString(imgSrc: String): String = spaceRegex.replaceAllIn(imgSrc, "%20")

  /**
   * Requests the image and returns the response entity when the server answers with status 200.
   *
   * NOTE(review): the `httpClient` argument is not used; the request is executed with the client
   * of `config.getHtmlFetcher`. Kept as is for interface compatibility.
   *
   * @return the response entity, or None when the URL cannot be turned into a GET request or the
   *         status is not 200
   * @throws ImageFetchException when executing the request fails
   */
  def fetchEntity(httpClient: HttpClient, imageSrc: String, config: Configuration): Option[HttpEntity] = {

    URLHelper.tryToHttpGet(imageSrc) match {
      case Some(httpget) =>
        val localContext: HttpContext = new BasicHttpContext
        localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
        val response = try {
          config.getHtmlFetcher.getHttpClient.execute(httpget, localContext)
        }
        catch {
          case ex: Exception => throw new ImageFetchException(imageSrc, ex)
        }

        val respStatus = response.getStatusLine.getStatusCode


        if (respStatus != 200) {
          // drain the unwanted body so the underlying connection is released
          Try(EntityUtils.consume(response.getEntity))
          None
        } else {
          try {
            Option(response.getEntity)
          } catch {
            case e: Exception => warn(e, e.toString); httpget.abort(); None
          }
        }
      case None =>
        warn("Unable to parse imageSrc: '" + imageSrc + "' into HttpGet")
        None
    }

  }


}
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | 177 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /src/main/resources/com/aamend/spark/gdelt/reference/cameoEthnic.txt: -------------------------------------------------------------------------------- 1 | CODE LABEL 2 | aar Afar 3 | abk Abkhaz 4 | abr Aboriginal-Australians 5 | ace Acehnese 6 | acg Achang 7 | ach Acholi 8 | ada Ga 9 | adi Adivasi 10 | adj Adjarians 11 | ady Adyghe 12 | afa Black-African 13 | afr Afrikaners 14 | ahm Ahmadis 15 | ain Ainu 16 | aja Aja 17 | aka Akan 18 | aku Aku 19 | ala Alawi 20 | alb Albanian 21 | ale Aleut 22 | alg Algonquian 23 | alt Altay 24 | alu Alur 25 | amb Ambonese 26 | ame Americo-Liberians 27 | amh Amhara 28 | anp Angika speakers 29 | apa Apache 30 | ara Arab 31 | ARB Arab 32 | arg Aragonese 33 | arm Armenian 34 | arn Mapuche 35 | arp Arapaho 36 | arw Arawak 37 | asa Asian 38 | ash Ashkenazi Jews 39 | asm Assamese 40 | ast Asturian 41 | asy Assyrian 42 | ata Atacamenos 43 | atg Argentinians 44 | ath Athabaskan 45 | ats Agnostic/Athiest 46 | aus Australians 47 | auu Austrians 48 | ava Caucasian Avars 49 | awa Awadhi 50 | aym Aymara 51 | aze Azerbaijani 52 | bad Baganda 53 | bah Bahais 54 | bai Bamileke 55 | bak Bashkirs 56 | bal Baloch 57 | bam Bambara 58 | ban Balinese 59 | baq Basque 60 | bar Bari 61 | bas Basoga 62 | bay Gbaya 63 | bda Rakhine 64 | bej Beja 65 | bel Belarusians 66 | bem Bemba 67 | ben Bengali-Hindu 68 | ber Berber 69 | bey Beydan 70 | bho Bhojpuri 71 | bih Bihari 72 | bii Bai 73 | bik Bicolano 74 | bin Edo 75 | bis Urban ni-Vanautu 76 | bke Bateke 77 | bkn Bakongo 78 | bkw Bakweri 79 | bla Siksikawa 80 | blg Blang 81 | blk Balkars 82 | bln Balanta 83 | bmr Bamar 84 | bni Beni-Shugal-Gumez 85 | bnt Bantu 86 | bny Banyarwanda 87 | bod Bodo 88 | bod Tibetan 89 | bol Bolivia 90 | bon Bonan 91 | bos Bosniaks 92 | bou Buyei 93 | bra Brijwasi 94 | brb Bariba 95 | bre Breton 96 | brh Brahui 97 | brk Burakumin 98 | brm Kurichiya 99 | bsh 
Bushmen 100 | bst Baster 101 | bsu Subiya 102 | bte Beti-Pahuin 103 | btk Batak 104 | bua Buryat 105 | bud Buddhist 106 | bug Bugis 107 | bul Bulgarian 108 | byn Bilen 109 | cab Cabindan-Mayombe 110 | cad Caddo 111 | cap Cape Verdean 112 | car Kali'na 113 | cat Catalan 114 | ceb Cebuano 115 | cha Chamorro 116 | chc Chukchi 117 | che Chechen 118 | chg Chagatai 119 | chi Chinese 120 | chk Chuukese 121 | chl Chileans 122 | chm Mari 123 | chn Chinook 124 | cho Choctaw 125 | chp Chipewyan 126 | chr Cherokee 127 | cht Ch'orti' 128 | chv Chuvash 129 | chw Chewa 130 | chy Cheyenne 131 | cir Adyghe 132 | cmc Cham 133 | col Colombian 134 | con Confusian 135 | cop Coptic Christians 136 | cor Cornish 137 | cos Corsican 138 | cot Cotiers 139 | cpe English-Creole 140 | cpf French-Creole 141 | cpp Portuguese-Creole 142 | cre Cree 143 | crh Crimean Tatar 144 | cri Christian 145 | cro Orthodox Christian 146 | crp Creole 147 | csb Kashubian 148 | csr Costa Ricans 149 | cth Catholics 150 | cus Cushitic 151 | cze Czech 152 | dai Dai 153 | dak Sioux 154 | dal Dalit 155 | dam Damara 156 | dan Danes 157 | dao Yao (Asia) 158 | dar Dargwa 159 | dau Daur 160 | day Dayak 161 | del Lenape 162 | den Slavey 163 | dgr Dogrib 164 | din Dinka 165 | div Maldivian 166 | dje Djerma-Songhai 167 | doi Dogras 168 | dom Dominicans 169 | don Dong 170 | dox Dongxiang 171 | dra Dravidian 172 | dru Druze 173 | drz Druze 174 | dsb Lower Sorbian 175 | dua Duala 176 | dut Dutch 177 | dyu Dyula 178 | dzo Ngalop 179 | eat East Timorese 180 | ecu Ecuadorians 181 | efi Efik 182 | ein East Indian 183 | eka Ekajuk 184 | eng English 185 | esh Eshira 186 | est Estonian 187 | eth Ethiopian-Jews 188 | eur Europeans 189 | eve Evenks 190 | ewe Ewe 191 | ewo Ewondo 192 | fan Fang 193 | fao Faroese 194 | fat Fante 195 | fij Fijian 196 | fil Filipino 197 | fin Finns 198 | fiu Finno-Ugric 199 | fon Fon 200 | fre French 201 | fri Santals 202 | frr Frisians 203 | fru Fur 204 | ful Fula 205 | fur Friulan 206 | gar Garifuna 207 | 
gay Gayo 208 | gba Gbaya 209 | gel Gelao 210 | geo Georgian 211 | ger German 212 | gia Gia Rai 213 | gil Kiribati 214 | gin Gin 215 | gio Gio 216 | gla Gaels 217 | gle Irish 218 | glg Galician 219 | glv Manx 220 | gon Gondi 221 | gor Gorontalonese 222 | gra Grassfielders 223 | grb Grebo 224 | gre Greek 225 | grn Guarani 226 | gsw Swiss Germans 227 | gua Guatemalan 228 | guj Gujarati 229 | gun Guan 230 | gwi Gwich'in 231 | had Hadjerai 232 | hai Haida 233 | har Harari 234 | hat Haitian 235 | hau Hausa 236 | haw Hawaiian 237 | haz Hazara 238 | her Herero 239 | hgh Hill Tribes 240 | hil Hiligaynon 241 | him Himachali 242 | hin Hindu 243 | hjw Hasidic 244 | hmn Hmong 245 | hmo Hiri Motu 246 | hni Hani 247 | hoa Hoa 248 | hon Hondurans 249 | hrt Haratin 250 | hrv Croats 251 | hsb Upper Sorbian 252 | hui Hui 253 | hun Hungarian 254 | hup Hupa 255 | hut Hutu 256 | iba Iban 257 | ibo Igbo 258 | ice Icelanders 259 | idg Indigenous 260 | idn Indian 261 | iii Yi 262 | ijo Ijaw 263 | iku Inuit 264 | ilo Ilocano 265 | ind Indonesian 266 | inh Ingush 267 | ipk Inupiat 268 | ira Iranian 269 | iro Iroquois 270 | ita Italian 271 | jan Jain 272 | jav Javanese 273 | jew Jewish 274 | jhw Jehovah's Witnesses 275 | jin Jino 276 | jol Jola 277 | jpn Japanese 278 | kaa Karakalpak 279 | kab Kabyle 280 | kac Kachin 281 | kad Kadazan 282 | kak Kakwa-Nubian 283 | kal Kalaallit 284 | kam Kamba 285 | kan Kannada 286 | kao Kaonde 287 | kar Karen 288 | kas Kashmiri 289 | kau Kanuri 290 | kav Kavango 291 | kaz Kazakhs 292 | kbd Kabarday 293 | kby Kabye 294 | kch Karachays 295 | kha Khasi 296 | khi Khoikhoi 297 | khk Khakas 298 | khm Khmer 299 | khu Khmu 300 | kik Kikuyu 301 | kin Kinyarwanda Speakers 302 | kir Kyrgyz 303 | kis Kisii 304 | klm Kalmyk 305 | kmb North Mbundu 306 | kno Kono 307 | knr Kanuri 308 | kok Kokani 309 | kom Komi 310 | kon Kongo 311 | kor Korean 312 | kos Kosraean 313 | kou Kouyou 314 | kpe Kpelle 315 | krh Krahn 316 | krl Karelians 317 | krm Karamojong 318 | kro Kru 319 |
kru Kurukh 320 | kua Kwanyama 321 | kum Kumyks 322 | kur Kurd 323 | KUR Kurd 324 | kut Ktunaxa 325 | lad Sephardic Jew 326 | lak Lak (Russia) 327 | lam Lamba 328 | lao Lao 329 | lar Lari 330 | lav Latvian 331 | lba Limba 332 | lds Latter Day Saints 333 | len Lenca 334 | lez Lezgian 335 | lgb Lugbara 336 | lhu Lahu 337 | lii Li 338 | lim Limburgian 339 | lin Lingala 340 | lit Lithuanian 341 | lol Mongo 342 | lom Lomwe 343 | lov Lovale 344 | loz Lozi 345 | lsu Lisu 346 | ltk Latoka 347 | ltn Latinos 348 | ltz Luxembourgers 349 | lua Luba-Kasai 350 | lub Luba-Katanga 351 | lug Baganda 352 | luh Luhya 353 | lui Luiseno 354 | lul Lulua 355 | lun Lunda 356 | luo Luo 357 | lus Lusei 358 | mac Macedonian 359 | mad Madurese 360 | maf Mafwe 361 | mag Magahi 362 | mah Marshallese 363 | mai Maithili 364 | mak Makassarese 365 | mal Malayalam 366 | man Mandinka 367 | mao Maori 368 | mar Marathi 369 | mas Maasai 370 | may Malays 371 | mba Mbandja 372 | mbe Mbere 373 | mbk M'Baka 374 | mbo Mbochi 375 | mbu Mbundu-Mestico 376 | mdf Mokshas 377 | mdh Madhesi 378 | mdi Madi 379 | mdr Mandar 380 | men Mende 381 | mia Miao 382 | mic Mi'kmaq 383 | mij Mijikenda 384 | min Minangkabau 385 | miz Mizo 386 | mla Mulatto 387 | mld Mole-Dagbani 388 | mlg Malagasy 389 | mlo Mulao 390 | mlt Maltese 391 | mnc Manchu 392 | mnd Mande 393 | mng Mananja-Nayanja 394 | mnh Minahasa 395 | mni Manipuri 396 | mnj Manjack 397 | mnn Mano 398 | mno Lumad 399 | mns Mon 400 | mny Manyika 401 | moh Mohajirs 402 | moh Mohawk 403 | mok Makonde 404 | mon Maonan 405 | mon Mongol 406 | mos Mossi 407 | mri Mari 408 | mrn Maronites 409 | mro Moro 410 | msk Miskito 411 | msl Muslim 412 | mtn Montenegrins 413 | mtz Mestizo 414 | mun Munda 415 | muo Muong 416 | mus Muscogee 417 | mwl Mirandese 418 | mwr Marwaris 419 | mya Mayangnas 420 | mye Myene 421 | myn Maya 422 | myv Mordvins 423 | nag Naga 424 | nah Nahua 425 | nai Native American 426 | nam Nama 427 | nap Neapolitan 428 | nau Nauruan 429 | nav Navajo 430 | nax 
Nakhi 431 | nba Nuba 432 | nbl South Ndebele 433 | nca Nicaraguan 434 | nde Northern Ndebele 435 | ndo Ndonga 436 | nep Nepali 437 | ner Nuer 438 | new Newars 439 | ngn Ngbandi 440 | ngo Ngoni 441 | nia Niasans 442 | nib Nibolek 443 | nir Niari 444 | niu Niuean 445 | nkm Nkomi 446 | nng Nung 447 | nog Nogais 448 | nor Norwegians 449 | nso Northern Sotho 450 | nub Nubian 451 | nur Nuristani 452 | nuu Nu 453 | nya Chewa 454 | nyk Nyakyusa 455 | nym Nyamwezi 456 | nyn Ankole 457 | nyo Nyoro 458 | nze New Zealanders 459 | nzi Nzema 460 | oci Occitanians 461 | ogo Ogoni 462 | oji Ojibwe 463 | ojw Orthodox/Ultra-Orthodox Jew 464 | oki Okinawan 465 | ori Oriya 466 | orm Oromo 467 | oru Orgunu 468 | osa Osage 469 | oss Ossetians 470 | oto Otomi 471 | ova Ovambo 472 | paa Papuan 473 | pac Pacific Islanders 474 | pag Pangasinan 475 | pal Palestinian 476 | PAL Palestinian 477 | pam Kapampangan 478 | pan Punjabi 479 | pap Papiamento-Creole 480 | par Paraguayan 481 | pau Palauan 482 | per Persian 483 | pgn Animist/Pagan 484 | phu Puthai 485 | pnm Panamanians 486 | pol Poles 487 | pom Pomaks 488 | pon Pehnpeian 489 | por Portuguese 490 | ppl Papel 491 | pro Protestant 492 | pru Peruvian 493 | psh Pashayi 494 | pum Pumi 495 | pus Pashtun 496 | qia Qiang 497 | qiz Qizilbash 498 | que Quechua 499 | raj Rajasthani 500 | ran Pahari Rajput 501 | rap Rapa Nui 502 | rar Cook Islands Maori 503 | rel Unspecified Religion 504 | roh Romansh 505 | rom Romani 506 | rum Romanian 507 | run Rundi 508 | rup Aromanians 509 | rus Russian 510 | sad Sandawe 511 | sag Sango 512 | sah Yakuts 513 | sal Salish 514 | sar Sara 515 | sas Sasak 516 | sat Sudanese 517 | scn Sicilian 518 | sco Scottish 519 | sel Selkup 520 | sen Sena 521 | sfi Sufi 522 | sha Shafi'i 523 | she She 524 | shi Shi'ites 525 | shl Shilluk 526 | shn Shan 527 | shy Shaigiya 528 | sid Sidama 529 | sin Sinhalese 530 | sio Siouan 531 | sla Slavic 532 | slo Slovaks 533 | slr Salar 534 | slv Slovenes 535 | smi Sami 536 | smo Samoans 537 | 
sna Shona 538 | snd Sindhi 539 | snk Soninke 540 | som Somali 541 | son Songhai 542 | sot Sotho 543 | spa Spanish 544 | srd Sardinian 545 | srn Sranan Tongo 546 | srp Serbs 547 | srr Serer 548 | srr Serer 549 | ssw Swazi 550 | sui Sui 551 | suk Sukama 552 | sun Sunni 553 | sus Susu 554 | swa Swahili 555 | swe Swedes 556 | swf Swiss French 557 | swt Swiss Italian 558 | tab Tabasaran 559 | tah Tahitian 560 | tai Tai 561 | tam Tamil 562 | tao Taoist 563 | tat Tatars 564 | taw Tawahka 565 | tay Tay 566 | tel Telugu 567 | tem Temne 568 | ter Terenan 569 | tes Teso 570 | tet Tetum 571 | tgk Tajik 572 | tgl Tagalog 573 | tha Thai 574 | tib Tibetan 575 | tig Tigre 576 | tir Tigray-Tigrinya 577 | tiv Tiv 578 | tkl Tokelauan 579 | tli Tlingit 580 | tmh Tuareg 581 | tms Tama 582 | tog Tonga (Africa) 583 | ton Tonga (Pacific) 584 | tor Tooro 585 | tou Toubou 586 | tpi Tok Pisin 587 | tra Transnistrians 588 | tri Tripuri 589 | trn Ternate 590 | tsi Tsimshian 591 | tsn Tswana 592 | tso Tsonga 593 | tts Tutsi 594 | tuj Tujia 595 | tuk Turkmen 596 | tum Tumbuka 597 | tup Tupi 598 | tur Turkish 599 | tuu Mongour 600 | tvl Tuvaluans 601 | twi Ashanti 602 | twn Taiwanese 603 | tyv Tuvans 604 | udm Udmurt 605 | uig Uyghur 606 | ukr Ukranian 607 | umb Southern Mbundu 608 | und Undetermined 609 | urd Urdu 610 | uzb Uzbeks 611 | vaa Va 612 | vai Vai 613 | ven Venda 614 | vie Vietnamese 615 | vil Vili 616 | vnz Venezuelan 617 | vot Votes 618 | wak Wakashan 619 | wal Welayta 620 | war Waray 621 | was Washoe 622 | wel Welsch 623 | wel Welsh 624 | wen Sorbs 625 | whi Whites 626 | wln Walloons 627 | wol Wolof 628 | xal Kalmyk 629 | xho Xhosa 630 | xib Xibe 631 | xnc Xinca 632 | yao Yao 633 | yap Yapese 634 | yor Yoruba 635 | ypk Yupik 636 | yug Yugur 637 | zag Zaghawa 638 | zap Zapotec 639 | zay Zaidiyya 640 | zen Zenaga 641 | zha Zhuang 642 | znd Azande 643 | zom Zomi 644 | zor Zoroastrians 645 | zul Zulu 646 | zun Zuni 647 | zza Zaza 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 4.0.0 5 | gdelt-spark 6 | com.aamend.spark 7 | spark-gdelt 8 | 3.1-SNAPSHOT 9 | https://github.com/aamend/spark-gdelt 10 | Working with GDELT from Spark 11 | environment 12 | 2015 13 | 14 | 15 | aamend 16 | Antoine Amend 17 | antoine.amend@gmail.com 18 | aamend.com 19 | 0 20 | 21 | big data scientist 22 | 23 | 24 | 25 | lamastex 26 | Raazesh Sainudiin 27 | raazesh.sainudiin@gmail.com 28 | lamastex.org 29 | 0 30 | 31 | Associate Professor of Mathematics with Specialisation in Data Science and Senior Researcher in Data Science 32 | 33 | 34 | 35 | JohannesGraner 36 | Johannes Graner 37 | johannes.graner@hotmail.com 38 | Combient Competence Centre for Data Engineering Sciences, Department of Mathematics, Uppsala University 39 | 0 40 | 41 | Summer Internship in Data Science 42 | 43 | 44 | 45 | AlbertNilsson 46 | Albert Nilsson 47 | albertnilsson1998@gmail.com 48 | Combient Competence Centre for Data Engineering Sciences, Department of Mathematics, Uppsala University 49 | 0 50 | 51 | Summer Internship in Data Science 52 | 53 | 54 | 55 | 56 | aamend.com 57 | 58 | 59 | 60 | Apache License, version 2.0 61 | http://www.apache.org/licenses/LICENSE-2.0 62 | 63 | 64 | 65 | https://github.com/aamend/gdelt-spark 66 | 67 | scm:git:git@github.com:aamend/spark-gdelt.git 68 | gdelt 69 | 70 | 71 | 72 | ossrh 73 | 74 | https://oss.sonatype.org/content/repositories/snapshots 75 | 76 | 77 | ossrh 78 | 79 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 80 | 81 | 82 | 83 | 84 | UTF-8 85 | 2.12.8 86 | 2.12 87 | 3.0.0 88 | 1.8 89 | 90 | 91 | 92 | 93 | org.scala-lang 94 | scala-library 95 | ${scala.version} 96 | provided 97 | 98 | 99 | org.scala-lang 100 | scala-reflect 101 | ${scala.version} 102 | provided 103 | 104 | 105 | 106 | org.apache.spark 107 | spark-core_${scala.binary.version} 108 
| ${spark.version} 109 | provided 110 | 111 | 112 | org.apache.spark 113 | spark-sql_${scala.binary.version} 114 | ${spark.version} 115 | provided 116 | 117 | 118 | org.apache.spark 119 | spark-mllib_${scala.binary.version} 120 | ${spark.version} 121 | provided 122 | 123 | 124 | 125 | joda-time 126 | joda-time 127 | 2.9.9 128 | 129 | 130 | commons-lang 131 | commons-lang 132 | 2.6 133 | 134 | 135 | com.typesafe.scala-logging 136 | 137 | scala-logging_${scala.binary.version} 138 | 3.7.1 139 | 140 | 141 | 142 | org.jsoup 143 | jsoup 144 | 1.14.2 145 | 146 | 147 | org.apache.httpcomponents 148 | httpclient 149 | 4.5.13 150 | 151 | 152 | commons-io 153 | commons-io 154 | 2.7 155 | 156 | 157 | 158 | org.scalatest 159 | scalatest_${scala.binary.version} 160 | 3.1.1 161 | test 162 | 163 | 164 | junit 165 | junit 166 | 4.13.1 167 | test 168 | 169 | 170 | 171 | 172 | 173 | org.apache.maven.plugins 174 | maven-compiler-plugin 175 | 2.5.1 176 | 177 | ${java.version} 178 | ${java.version} 179 | 180 | 181 | 182 | org.apache.maven.plugins 183 | maven-surefire-plugin 184 | 2.7 185 | 186 | true 187 | 188 | 189 | 190 | org.sonatype.plugins 191 | nexus-staging-maven-plugin 192 | 1.6.7 193 | true 194 | 195 | ossrh 196 | https://oss.sonatype.org/ 197 | true 198 | 199 | 200 | 201 | org.apache.maven.plugins 202 | maven-source-plugin 203 | 2.2.1 204 | 205 | 206 | attach-sources 207 | 208 | jar-no-fork 209 | 210 | 211 | 212 | 213 | 214 | net.alchim31.maven 215 | scala-maven-plugin 216 | 4.3.1 217 | 218 | 219 | 220 | compile 221 | testCompile 222 | 223 | 224 | 225 | attach-javadocs 226 | 227 | doc-jar 228 | 229 | 230 | 231 | 232 | 233 | org.scalatest 234 | scalatest-maven-plugin 235 | 1.0 236 | 237 | 238 | ${project.build.directory}/surefire-reports 239 | . 
240 | once 241 | WDF TestSuite.txt 242 | 243 | 244 | 245 | test 246 | 247 | test 248 | 249 | 250 | 251 | 252 | 253 | 254 | org.apache.maven.plugins 255 | maven-shade-plugin 256 | 3.2.2 257 | 258 | 259 | package 260 | 261 | shade 262 | 263 | 264 | 265 | 266 | org.slf4j:slf4j-api 267 | 268 | org.apache.httpcomponents:httpclient 269 | 270 | org.apache.httpcomponents:httpcore 271 | 272 | commons-logging:commons-logging 273 | commons-codec:commons-codec 274 | commons-io:commons-io 275 | 276 | joda-time:joda-time 277 | commons-lang:commons-lang 278 | 279 | com.typesafe.scala-logging:scala-logging_2.12 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | *:* 293 | 294 | META-INF/maven/** 295 | META-INF/*.SF 296 | META-INF/*.DSA 297 | META-INF/*.RSA 298 | 299 | 300 | 301 | 302 | 303 | org 304 | repackaged.org 305 | 306 | org.jsoup.** 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | org.apache.maven.plugins 316 | maven-release-plugin 317 | 2.5.3 318 | 319 | true 320 | false 321 | release 322 | deploy 323 | 324 | 325 | 326 | org.apache.maven.scm 327 | maven-scm-provider-gitexe 328 | 1.8.1 329 | 330 | 331 | 332 | 333 | org.apache.maven.plugins 334 | maven-gpg-plugin 335 | 1.5 336 | 337 | 338 | sign-artifacts 339 | verify 340 | 341 | sign 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | -------------------------------------------------------------------------------- /src/main/resources/com/aamend/spark/gdelt/reference/cameoEvent.txt: -------------------------------------------------------------------------------- 1 | CAMEOEVENTCODE EVENTDESCRIPTION 2 | 01 MAKE PUBLIC STATEMENT 3 | 010 Make statement, not specified below 4 | 011 Decline comment 5 | 012 Make pessimistic comment 6 | 013 Make optimistic comment 7 | 014 Consider policy option 8 | 015 Acknowledge or claim responsibility 9 | 016 Deny responsibility 10 | 017 Engage in symbolic act 11 | 018 Make empathetic comment 12 | 019 Express accord 13 | 02 APPEAL 14 | 020 Appeal, not specified below 
15 | 021 Appeal for material cooperation, not specified below 16 | 0211 Appeal for economic cooperation 17 | 0212 Appeal for military cooperation 18 | 0213 Appeal for judicial cooperation 19 | 0214 Appeal for intelligence 20 | 022 Appeal for diplomatic cooperation, such as policy support 21 | 023 Appeal for aid, not specified below 22 | 0231 Appeal for economic aid 23 | 0232 Appeal for military aid 24 | 0233 Appeal for humanitarian aid 25 | 0234 Appeal for military protection or peacekeeping 26 | 024 Appeal for political reform, not specified below 27 | 0241 Appeal for change in leadership 28 | 0242 Appeal for policy change 29 | 0243 Appeal for rights 30 | 0244 Appeal for change in institutions, regime 31 | 025 Appeal to yield 32 | 0251 Appeal for easing of administrative sanctions 33 | 0252 Appeal for easing of popular dissent 34 | 0253 Appeal for release of persons or property 35 | 0254 Appeal for easing of economic sanctions, boycott, or embargo 36 | 0255 Appeal for target to allow international involvement (non-mediation) 37 | 0256 Appeal for de-escalation of military engagement 38 | 026 Appeal to others to meet or negotiate 39 | 027 Appeal to others to settle dispute 40 | 028 Appeal to others to engage in or accept mediation 41 | 03 EXPRESS INTENT TO COOPERATE 42 | 030 Express intent to cooperate, not specified below 43 | 031 Express intent to engage in material cooperation, not specified below 44 | 0311 Express intent to cooperate economically 45 | 0312 Express intent to cooperate militarily 46 | 0313 Express intent to cooperate on judicial matters 47 | 0314 Express intent to cooperate on intelligence 48 | 032 Express intent to provide diplomatic cooperation such as policy support 49 | 033 Express intent to provide material aid, not specified below 50 | 0331 Express intent to provide economic aid 51 | 0332 Express intent to provide military aid 52 | 0333 Express intent to provide humanitarian aid 53 | 0334 Express intent to provide military protection or
peacekeeping 54 | 034 Express intent to institute political reform, not specified below 55 | 0341 Express intent to change leadership 56 | 0342 Express intent to change policy 57 | 0343 Express intent to provide rights 58 | 0344 Express intent to change institutions, regime 59 | 035 Express intent to yield, not specified below 60 | 0351 Express intent to ease administrative sanctions 61 | 0352 Express intent to ease popular dissent 62 | 0353 Express intent to release persons or property 63 | 0354 Express intent to ease economic sanctions, boycott, or embargo 64 | 0355 Express intent to allow international involvement (not mediation) 65 | 0356 Express intent to de-escalate military engagement 66 | 036 Express intent to meet or negotiate 67 | 037 Express intent to settle dispute 68 | 038 Express intent to accept mediation 69 | 039 Express intent to mediate 70 | 04 CONSULT 71 | 040 Consult, not specified below 72 | 041 Discuss by telephone 73 | 042 Make a visit 74 | 043 Host a visit 75 | 044 Meet at a "third" location 76 | 045 Mediate 77 | 046 Engage in negotiation 78 | 05 ENGAGE IN DIPLOMATIC COOPERATION 79 | 050 Engage in diplomatic cooperation, not specified below 80 | 051 Praise or endorse 81 | 052 Defend verbally 82 | 053 Rally support on behalf of 83 | 054 Grant diplomatic recognition 84 | 055 Apologize 85 | 056 Forgive 86 | 057 Sign formal agreement 87 | 06 ENGAGE IN MATERIAL COOPERATION 88 | 060 Engage in material cooperation, not specified below 89 | 061 Cooperate economically 90 | 062 Cooperate militarily 91 | 063 Engage in judicial cooperation 92 | 064 Share intelligence or information 93 | 07 PROVIDE AID 94 | 070 Provide aid, not specified below 95 | 071 Provide economic aid 96 | 072 Provide military aid 97 | 073 Provide humanitarian aid 98 | 074 Provide military protection or peacekeeping 99 | 075 Grant asylum 100 | 08 YIELD 101 | 080 Yield, not specified below 102 | 081 Ease administrative sanctions, not specified below 103 | 0811 Ease restrictions on
political freedoms 104 | 0812 Ease ban on political parties or politicians 105 | 0813 Ease curfew 106 | 0814 Ease state of emergency or martial law 107 | 082 Ease political dissent 108 | 083 Accede to requests or demands for political reform not specified below 109 | 0831 Accede to demands for change in leadership 110 | 0832 Accede to demands for change in policy 111 | 0833 Accede to demands for rights 112 | 0834 Accede to demands for change in institutions, regime 113 | 084 Return, release, not specified below 114 | 0841 Return, release person(s) 115 | 0842 Return, release property 116 | 085 Ease economic sanctions, boycott, embargo 117 | 086 Allow international involvement not specified below 118 | 0861 Receive deployment of peacekeepers 119 | 0862 Receive inspectors 120 | 0863 Allow delivery of humanitarian aid 121 | 087 De-escalate military engagement 122 | 0871 Declare truce, ceasefire 123 | 0872 Ease military blockade 124 | 0873 Demobilize armed forces 125 | 0874 Retreat or surrender militarily 126 | 09 INVESTIGATE 127 | 090 Investigate, not specified below 128 | 091 Investigate crime, corruption 129 | 092 Investigate human rights abuses 130 | 093 Investigate military action 131 | 094 Investigate war crimes 132 | 10 DEMAND 133 | 100 Demand, not specified below 134 | 101 Demand information, investigation 135 | 1011 Demand economic cooperation 136 | 1012 Demand military cooperation 137 | 1013 Demand judicial cooperation 138 | 1014 Demand intelligence cooperation 139 | 102 Demand policy support 140 | 103 Demand aid, protection, or peacekeeping 141 | 1031 Demand economic aid 142 | 1032 Demand military aid 143 | 1033 Demand humanitarian aid 144 | 1034 Demand military protection or peacekeeping 145 | 104 Demand political reform, not specified below 146 | 1041 Demand change in leadership 147 | 1042 Demand policy change 148 | 1043 Demand rights 149 | 1044 Demand change in institutions, regime 150 | 105 Demand mediation 151 | 1051 Demand easing of administrative 
sanctions 152 | 1052 Demand easing of political dissent 153 | 1053 Demand release of persons or property 154 | 1054 Demand easing of economic sanctions, boycott, or embargo 155 | 1055 Demand that target allows international involvement (non-mediation) 156 | 1056 Demand de-escalation of military engagement106:[-5.0] Demand withdrawal 157 | 107 Demand ceasefire 158 | 108 Demand meeting, negotiation 159 | 11 DISAPPROVE 160 | 110 Disapprove, not specified below 161 | 111 Criticize or denounce 162 | 112 Accuse, not specified below 163 | 1121 Accuse of crime, corruption 164 | 1122 Accuse of human rights abuses 165 | 1123 Accuse of aggression 166 | 1124 Accuse of war crimes 167 | 1125 Accuse of espionage, treason 168 | 113 Rally opposition against 169 | 114 Complain officially 170 | 115 Bring lawsuit against 171 | 116 Find guilty or liable (legally) 172 | 12 REJECT 173 | 120 Reject, not specified below 174 | 121 Reject material cooperation 175 | 1211 Reject economic cooperation 176 | 1212 Reject military cooperation 177 | 122 Reject request or demand for material aid, not specified below 178 | 1221 Reject request for economic aid 179 | 1222 Reject request for military aid 180 | 1223 Reject request for humanitarian aid 181 | 1224 Reject request for military protection or peacekeeping 182 | 123 Reject request or demand for political reform, not specified below 183 | 1231 Reject request for change in leadership 184 | 1232 Reject request for policy change 185 | 1233 Reject request for rights 186 | 1234 Reject request for change in institutions, regime 187 | 124 Refuse to yield, not specified below 188 | 1241 Refuse to ease administrative sanctions 189 | 1242 Refuse to ease popular dissent 190 | 1243 Refuse to release persons or property 191 | 1244 Refuse to ease economic sanctions, boycott, or embargo 192 | 1245 Refuse to allow international involvement (non mediation) 193 | 1246 Refuse to de-escalate military engagement 194 | 125 Reject proposal to meet, discuss, or 
negotiate 195 | 126 Reject mediation 196 | 127 Reject plan, agreement to settle dispute 197 | 128 Defy norms, law 198 | 129 Veto 199 | 13 THREATEN 200 | 130 Threaten, not specified below 201 | 131 Threaten non-force, not specified below 202 | 1311 Threaten to reduce or stop aid 203 | 1312 Threaten to boycott, embargo, or sanction 204 | 1313 Threaten to reduce or break relations 205 | 132 Threaten with administrative sanctions, not specified below 206 | 1321 Threaten to impose restrictions on political freedoms 207 | 1322 Threaten to ban political parties or politicians 208 | 1323 Threaten to impose curfew 209 | 1324 Threaten to impose state of emergency or martial law 210 | 133 Threaten political dissent, protest 211 | 134 Threaten to halt negotiations 212 | 135 Threaten to halt mediation 213 | 136 Threaten to halt international involvement (non-mediation) 214 | 137 Threaten with violent repression 215 | 138 Threaten to use military force, not specified below 216 | 1381 Threaten blockade 217 | 1382 Threaten occupation 218 | 1383 Threaten unconventional violence 219 | 1384 Threaten conventional attack 220 | 1385 Threaten attack with WMD 221 | 139 Give ultimatum 222 | 14 PROTEST 223 | 140 Engage in political dissent, not specified below 224 | 141 Demonstrate or rally 225 | 1411 Demonstrate for leadership change 226 | 1412 Demonstrate for policy change 227 | 1413 Demonstrate for rights 228 | 1414 Demonstrate for change in institutions, regime 229 | 142 Conduct hunger strike, not specified below 230 | 1421 Conduct hunger strike for leadership change 231 | 1422 Conduct hunger strike for policy change 232 | 1423 Conduct hunger strike for rights 233 | 1424 Conduct hunger strike for change in institutions, regime 234 | 143 Conduct strike or boycott, not specified below 235 | 1431 Conduct strike or boycott for leadership change 236 | 1432 Conduct strike or boycott for policy change 237 | 1433 Conduct strike or boycott for rights 238 | 1434 Conduct strike or boycott for 
change in institutions, regime 239 | 144 Obstruct passage, block 240 | 1441 Obstruct passage to demand leadership change 241 | 1442 Obstruct passage to demand policy change 242 | 1443 Obstruct passage to demand rights 243 | 1444 Obstruct passage to demand change in institutions, regime 244 | 145 Protest violently, riot 245 | 1451 Engage in violent protest for leadership change 246 | 1452 Engage in violent protest for policy change 247 | 1453 Engage in violent protest for rights 248 | 1454 Engage in violent protest for change in institutions, regime 249 | 15 EXHIBIT FORCE POSTURE 250 | 150 Demonstrate military or police power, not specified below 251 | 151 Increase police alert status 252 | 152 Increase military alert status 253 | 153 Mobilize or increase police power 254 | 154 Mobilize or increase armed forces 255 | 16 REDUCE RELATIONS 256 | 160 Reduce relations, not specified below 257 | 161 Reduce or break diplomatic relations 258 | 162 Reduce or stop aid, not specified below 259 | 1621 Reduce or stop economic assistance 260 | 1622 Reduce or stop military assistance 261 | 1623 Reduce or stop humanitarian assistance 262 | 163 Impose embargo, boycott, or sanctions 263 | 164 Halt negotiations 264 | 165 Halt mediation 265 | 166 Expel or withdraw, not specified below 266 | 1661 Expel or withdraw peacekeepers 267 | 1662 Expel or withdraw inspectors, observers 268 | 1663 Expel or withdraw aid agencies 269 | 17 COERCE 270 | 170 Coerce, not specified below 271 | 171 Seize or damage property, not specified below 272 | 1711 Confiscate property 273 | 1712 Destroy property 274 | 172 Impose administrative sanctions, not specified below 275 | 1721 Impose restrictions on political freedoms 276 | 1722 Ban political parties or politicians 277 | 1723 Impose curfew 278 | 1724 Impose state of emergency or martial law 279 | 173 Arrest, detain, or charge with legal action 280 | 174 Expel or deport individuals 281 | 175 Use tactics of violent repression 282 | 18 ASSAULT 283 | 180 Use 
unconventional violence, not specified below 284 | 181 Abduct, hijack, or take hostage 285 | 182 Physically assault, not specified below 286 | 1821 Sexually assault 287 | 1822 Torture 288 | 1823 Kill by physical assault 289 | 183 Conduct suicide, car, or other non-military bombing, not specified below 290 | 1831 Carry out suicide bombing 291 | 1832 Carry out car bombing 292 | 1833 Carry out roadside bombing 293 | 184 Use as human shield 294 | 185 Attempt to assassinate 295 | 186 Assassinate 296 | 19 FIGHT 297 | 190 Use conventional military force, not specified below 298 | 191 Impose blockade, restrict movement 299 | 192 Occupy territory 300 | 193 Fight with small arms and light weapons 301 | 194 Fight with artillery and tanks 302 | 195 Employ aerial weapons 303 | 196 Violate ceasefire 304 | 20 USE UNCONVENTIONAL MASS VIOLENCE 305 | 200 Use unconventional mass violence, not specified below 306 | 201 Engage in mass expulsion 307 | 202 Engage in mass killings 308 | 203 Engage in ethnic cleansing 309 | 204 Use weapons of mass destruction, not specified below 310 | 2041 Use chemical, biological, or radiological weapons 311 | 2042 Detonate nuclear weapons -------------------------------------------------------------------------------- /src/main/scala/com/gravity/goose/network/HtmlFetcher.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to Gravity.com under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Gravity.com licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.gravity.goose.network 20 | 21 | import org.apache.http.HttpEntity 22 | import org.apache.http.HttpResponse 23 | import org.apache.http.HttpVersion 24 | import org.apache.http.client.CookieStore 25 | import org.apache.http.client.HttpClient 26 | import org.apache.http.client.methods.HttpGet 27 | import org.apache.http.client.params.CookiePolicy 28 | import org.apache.http.client.protocol.ClientContext 29 | import org.apache.http.conn.scheme.PlainSocketFactory 30 | import org.apache.http.conn.ssl.SSLSocketFactory 31 | import org.apache.http.conn.scheme.Scheme 32 | import org.apache.http.conn.scheme.SchemeRegistry 33 | import org.apache.http.cookie.Cookie 34 | import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager 35 | import org.apache.http.params.BasicHttpParams 36 | import org.apache.http.params.HttpConnectionParams 37 | import org.apache.http.params.HttpParams 38 | import org.apache.http.params.HttpProtocolParams 39 | import org.apache.http.protocol.BasicHttpContext 40 | import org.apache.http.protocol.HttpContext 41 | import org.apache.http.util.EntityUtils 42 | import java.io._ 43 | import java.net.SocketException 44 | import java.net.SocketTimeoutException 45 | import java.net.URLConnection 46 | import java.util.ArrayList 47 | import java.util.Date 48 | import java.util.List 49 | import com.gravity.goose.utils.Logging 50 | import com.gravity.goose.Configuration 51 | import org.apache.http.impl.client.{DefaultHttpRequestRetryHandler, AbstractHttpClient, DefaultHttpClient} 52 | 53 | 54 
/**
 * User: Jim Plush
 * Date: 12/16/10
 *
 * Fetches the body of a URL through one shared Apache HttpClient and decides whether what came
 * back is actually HTML.
 *
 * Goose is meant to pull millions of articles per day, so the legitimacy of the links is in
 * question. Many times mp3, mov, wav, etc. files are mislabeled with HTML content types, and only
 * inspection of the actual content reveals the real type. Spam sites can also serve enormous
 * bodies that are just wasted resources, so a cap is placed on how much content is read before
 * giving up.
 */
object HtmlFetcher extends AbstractHtmlFetcher with Logging {

  /** Cap passed to [[convertStreamToString]] for a single response body (15 * 1024 * 1024). */
  private final val MaxContentBytes = 15728640

  /**
   * Cookie store that ignores every cookie it is given. It is attached to each request context
   * so that no cookies are kept between fetches.
   */
  var emptyCookieStore: CookieStore = null

  /**
   * The HttpClient used for every request made by this object; built once by `initClient()`.
   */
  private var httpClient: HttpClient = null

  // Must run after the two fields above are declared, otherwise their `null` initialisers
  // would overwrite what initClient() assigns.
  initClient()

  /** @return the HttpClient instance created by `initClient()` */
  def getHttpClient: HttpClient = httpClient

  /**
   * Makes an HTTP GET for `url` and returns the response body when it looks like HTML.
   *
   * Any `#fragment` is removed from the URL before the request is made. The status code is
   * checked with `HttpStatusValidator`; a failure reported by it is thrown inside the `try`
   * below and therefore handled by the same catch clauses as every other failure.
   *
   * @param config Goose Configuration (supplies user agent, connection timeout, socket timeout)
   * @param url The web address to fetch
   * @return `Some(html)` when content was read and recognised as HTML/XML; `None` when the fetch
   *         failed with a `LoggableException` or any other `Exception` not listed below
   * @throws MaxBytesException rethrown when the body exceeds the content cap
   * @throws NotHtmlException when nothing was read (including after a socket error or timeout,
   *                          which are only logged) or the content is not recognised as HTML
   */
  def getHtml(config: Configuration, url: String): Option[String] = {
    var httpget: HttpGet = null
    var htmlResult: String = null
    var instream: InputStream = null

    // Apache HttpClient does not drop URL fragments before opening the request to the host
    // more info: http://stackoverflow.com/questions/4251841/400-error-with-httpclient-for-a-link-with-an-anchor
    val cleanUrl = {
      val foundAt = url.indexOf("#")
      if (foundAt >= 0) url.substring(0, foundAt) else url
    }

    try {
      val localContext: HttpContext = new BasicHttpContext
      localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
      httpget = new HttpGet(cleanUrl)

      // NOTE(review): these settings are written to the params of the shared client, not to this
      // request, so concurrent callers with different Configurations may affect each other - confirm.
      val params = httpClient.getParams
      HttpProtocolParams.setUserAgent(params, config.getBrowserUserAgent())
      HttpConnectionParams.setConnectionTimeout(params, config.getConnectionTimeout())
      HttpConnectionParams.setSoTimeout(params, config.getSocketTimeout())

      trace("Setting UserAgent To: " + HttpProtocolParams.getUserAgent(httpClient.getParams))
      val response: HttpResponse = httpClient.execute(httpget, localContext)

      HttpStatusValidator.validate(cleanUrl, response.getStatusLine.getStatusCode) match {
        case Left(ex) => throw ex
        case _ =>
      }

      val entity: HttpEntity = response.getEntity
      if (entity == null) {
        trace("Unable to fetch URL Properly: " + cleanUrl)
      }
      else {
        instream = entity.getContent

        // Use the charset announced by the response, falling back to UTF-8.
        var encodingType: String = "UTF-8"
        try {
          val declared = EntityUtils.getContentCharSet(entity)
          if (declared != null) {
            encodingType = declared
          }
        }
        catch {
          case e: Exception =>
            if (logger.isDebugEnabled) {
              trace("Unable to get charset for: " + cleanUrl)
              trace("Encoding Type is: " + encodingType)
            }
        }

        try {
          // convertStreamToString yields null when reading failed; leave htmlResult null in that
          // case so the emptiness check below reports it, instead of tripping a NullPointerException.
          val raw = convertStreamToString(instream, MaxContentBytes, encodingType)
          if (raw != null) {
            htmlResult = raw.trim
          }
        }
        finally {
          EntityUtils.consume(entity)
        }
      }
    }
    catch {
      case e: NullPointerException =>
        logger.warn(e.toString + " " + e.getMessage + " Caught for URL: " + cleanUrl)
      case e: MaxBytesException =>
        trace("GRVBIGFAIL: " + cleanUrl + " Reached max bytes size")
        throw e
      case e: SocketException =>
        logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
      case e: SocketTimeoutException =>
        trace(e.toString)
      case e: LoggableException =>
        logger.warn(e.getMessage)
        return None
      case e: Exception =>
        trace("FAILURE FOR LINK: " + cleanUrl + " " + e.toString)
        return None
    }
    finally {
      if (instream != null) {
        try {
          instream.close()
        }
        catch {
          case e: Exception =>
            logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
        }
      }
      if (httpget != null) {
        try {
          httpget.abort()
        }
        catch {
          // Best effort only: a failed abort must not mask the outcome of the fetch.
          case e: Exception =>
            trace("Unable to abort request for: " + cleanUrl + " " + e.toString)
        }
      }
    }

    if (logger.isDebugEnabled) {
      logger.debug("starting...")
    }
    if (htmlResult == null || htmlResult.length < 1) {
      if (logger.isDebugEnabled) {
        logger.debug("HTMLRESULT is empty or null")
      }
      throw new NotHtmlException(cleanUrl)
    }

    // Sniff the content itself rather than trusting the Content-Type header.
    try {
      val is: InputStream = new ByteArrayInputStream(htmlResult.getBytes("UTF-8"))
      val mimeType: String = URLConnection.guessContentTypeFromStream(is)
      if (mimeType == null) {
        throw new NotHtmlException(cleanUrl)
      }
      else if (mimeType == "text/html" || mimeType == "application/xml") {
        Some(htmlResult)
      }
      else if (htmlResult.contains("<title>") && htmlResult.contains("<p>")) {
        // Guessed as something else, but it carries both a title and a paragraph tag, so accept it.
        // (The marker has to be non-empty: contains("") is true for every string.)
        Some(htmlResult)
      }
      else {
        trace("GRVBIGFAIL: " + mimeType + " - " + cleanUrl)
        throw new NotHtmlException(cleanUrl)
      }
    }
    catch {
      case e: UnsupportedEncodingException =>
        logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
        None
      case e: IOException =>
        logger.warn(e.getMessage + " Caught for URL: " + cleanUrl)
        None
    }
  }

  /**
   * Builds the shared HttpClient and the no-op cookie store and assigns them to this object's
   * fields: default 10 second connect/socket timeouts, HTTP/1.1, http and https schemes, a
   * pooled connection manager, and a retry handler with a retry count of 0.
   */
  private def initClient(): Unit = {

    trace("Initializing HttpClient")

    val httpParams: HttpParams = new BasicHttpParams
    HttpConnectionParams.setConnectionTimeout(httpParams, 10 * 1000)
    HttpConnectionParams.setSoTimeout(httpParams, 10 * 1000)
    HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1)

    emptyCookieStore = new CookieStore {
      // Always handed back as-is; addCookie never puts anything into it.
      private[network] var emptyList: ArrayList[Cookie] = new ArrayList[Cookie]

      def addCookie(cookie: Cookie): Unit = {
      }

      def getCookies: List[Cookie] = {
        emptyList
      }

      def clearExpired(date: Date): Boolean = {
        false
      }

      def clear(): Unit = {
      }
    }

    httpParams.setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY)
    httpParams.setParameter("http.User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")
    httpParams.setParameter("http.language.Accept-Language", "en-us")
    httpParams.setParameter("http.protocol.content-charset", "UTF-8")
    httpParams.setParameter("Accept", "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5")
    httpParams.setParameter("Cache-Control", "max-age=0")
    httpParams.setParameter("http.connection.stalecheck", false)

    val schemeRegistry: SchemeRegistry = new SchemeRegistry
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory))
    schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory))

    val cm = new ThreadSafeClientConnManager(schemeRegistry)
    cm.setMaxTotal(20000)
    cm.setDefaultMaxPerRoute(500)

    // Keep the concrete type locally so the retry handler can be set without a cast.
    val client = new DefaultHttpClient(cm, httpParams)
    client.setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false))
    client.getParams.setParameter("http.conn-manager.timeout", 120000L)
    client.getParams.setParameter("http.protocol.wait-for-continue", 10000L)
    client.getParams.setParameter("http.tcp.nodelay", true)
    httpClient = client
  }

  /**
   * Reads the stream to its end, decoding it with `encodingType`, and returns the text.
   *
   * The limit is compared against the number of characters actually decoded so far, so a stream
   * that delivers its data in small chunks is not cut off early.
   *
   * @param is the source stream from the response
   * @param maxBytes the most content we are willing to read from the input stream
   * @param encodingType name of the charset used to decode the stream
   * @return the decoded content, or `null` when reading failed with a timeout, an unsupported
   *         encoding or another I/O error (the failure is logged)
   * @throws MaxBytesException when the stream holds more than `maxBytes` of content
   */
  def convertStreamToString(is: InputStream, maxBytes: Int, encodingType: String): String = {
    val buf: Array[Char] = new Array[Char](2048)
    var r: Reader = null
    val s = new StringBuilder
    try {
      r = new InputStreamReader(is, encodingType)
      var charsRead: Int = 0
      var n: Int = r.read(buf)
      while (n >= 0) {
        // read() may return fewer characters than the buffer holds; count what really arrived.
        charsRead += n
        if (charsRead > maxBytes) {
          throw new MaxBytesException
        }
        s.appendAll(buf, 0, n)
        n = r.read(buf)
      }
      s.toString()
    }
    catch {
      case e: SocketTimeoutException =>
        logger.warn(e.toString + " " + e.getMessage)
        null
      case e: UnsupportedEncodingException =>
        logger.warn(e.toString + " Encoding: " + encodingType)
        null
      case e: IOException =>
        logger.warn(e.toString + " " + e.getMessage)
        null
    }
    finally {
      if (r != null) {
        try {
          r.close()
        }
        catch {
          // Best effort only: the content (or the failure) has already been decided above.
          case e: Exception =>
            trace("Unable to close reader: " + e.toString)
        }
      }
    }
  }

}
For four years, she andAnnie Mumolo, her friend and co-writer, slaved over a comedy script commissioned by Judd Apatow about a woman whose best friend is getting married. It was the 38-year-old's first lead in a film, and her first full-length script to be produced. As an indication of how it played with audiences, I watched it on a plane last month with a friend who, during the scene in which the bride squats in the street to relieve herself after a bad kebab, laughed so long and hard a woman passing in the aisle leant over and said, "What are you watching?" 2 | 3 | Wiig smiles when I tell her this. "Proud," she says, of the day they filmed the shitting-in-the-street scene. "A proud moment." 4 | 5 | We are in the tearoom of a fashionable hotel in Tribeca, the Manhattan neighbourhood where Wiig lives. Before Bridesmaids, she was known to US audiences as a long-running cast member of Saturday Night Live and elsewhere for scene-stealing cameos in films such as Ghost Town and Knocked Up. 6 | 7 | That the film, by midsummer, had grossed more than $150m in the US and outstripped not only all of Apatow's other films, but every "R-rated female comedy" in history, puts Wiig in the zone of woman of the moment, although she chafes against this, with its implication that before Bridesmaids she was an ingenue. 8 | 9 | "In most ways my life hasn't changed," she says. "I know that's a boring answer. People want to hear that I bought all gold, fur…" She allows a perfectly timed beat. "I would never wear fur." 10 | 11 | But hasn't she had to turn down lots of offers? 12 | 13 | "I mean. Yes, no. It feels weird to say that; you don't want to be like, 'Everyone wants me!' I mean. I guess Bridesmaids was definitely the biggest role I've ever had. And the fact that I co-wrote it and everything. But, um…" Wiig, who is slight, with very straight hair and an eager tilt to her body language, looks mortified. "It's not like I have boxes of scripts arriving at my door." 
14 | 15 | Her understatement is fuelled perhaps by the inevitable and awkward comparisons she has gained with other women in her business, as if the culture can sustain only a couple at a time. Wiig has been getting "the new Tina Fey" quite a lot – Fey was head writer at SNL when Wiig joined – although the comparison is faulty. Wiig is an actor first and a comedian second, and with a film directed by Sean Pennin the pipeline and another, Imogene, in which she stars alongside Annette Bening and Matt Dillon, wants to develop her career away from comedy. "People always call me a comedian. And I don't really see myself like that. I guess I just consider myself an actor who does comedy. But who wants to do other things as well." 16 | 17 | It took her a long time to get here. After growing up in upstate New York, she went to university in Arizona and studied art before dropping out after the first year and going to LA to try to make it as an actor. Arizona is a notorious party college, but Wiig says all of that was out of her system by the time she left high school, where she had a few shaky years. "I was not that good a student because I was very… social. I cared more about going out with my friends. I didn't quite realise the importance of school. But then when I went to college I took it much more seriously, because I enjoyed it." 18 | 19 | How social was she? Suspended? 20 | 21 | "Um. Not for more than a couple of days. There were suspensions." Her expression fixes. "That's the past." Before the spotlight was so firmly on her, Wiig talked publicly about her minor-league acts of teenage hooliganism, including being caught underage drinking at a Grateful Deadgig, skipping school and, what she called the worst of it, smashing pot plants on a neighbour's porch, which she feels terrible about. 
As she entered her 20s her parents were still worried, she says, and then when she kicked in her degree and told them she wanted to be an actor, "probably the most worried they could be". 22 | 23 | "Yes. Also, they didn't want me to get disappointed. They would always mention the numbers – do you know how many people are trying to do what you're doing? Your chances are really slim. And they're right. Technically. But when you're 20, you're like, why can't you just support me?! Can't you be proud that I'm trying to go after my dream?" She pulls a whiny face and tilts her head. "But they came around quickly when they saw how happy it made me. They would come and see me in the horrible little shows that I was in." 24 | 25 | Wiig hadn't any great sense of being funny when she was growing up. Her dad, she says, tells a lot of jokes. Her mother is funny, but "mom funny, where she isn't trying to be funny, but is". Before retiring, her father ran a marina on one of the lakes upstate in New York (the name Wiig is from his Norwegian heritage). Her mother was an artist. Even after all these years, they haven't quite shed the sense of precariousness around their daughter's life; when she tells them she's in a movie, her mother will say tentatively, "Is that something we can see in theatres?" Wiig smiles and says, "They're still getting used to the idea that I'm working and it's OK." 26 | 27 | With good reason. Wiig was 11 years in LA before she got the call from Saturday Night Live, during which time her income was erratic. She had arrived in the city with no professional contacts and a nagging sense of insurmountable competition. "I was incredibly intimidated and had no experience. I felt very scared and unsure and I didn't have any résumé, and everyone around me was very beautiful and young and I thought, oh, maybe I should work in a store and enjoy the weather. But I started taking improv classes and that's what got me started." 
28 | 29 | Improv was something she had never heard of before. But when she turned up to watch a gig one day at the Groundlings, the famous LA improv troupe with alumni such as Lisa Kudrow, Conan O'Brienand Will Ferrell, something resonated. The idea of standing on stage and making up stuff was, she says, less scary to her than the notion of saying lines, with the lurking fear there was a right and a wrong way to say them. With improvisation, there was no right and wrong: "You can't mess it up and you can't forget your lines." 30 | 31 | Her enthusiasm wouldn't pay the bills, however, and Wiig worked at a series of day jobs, including at a floral design studio for a couple of years, and as a waitress in the refectory at Universal Studios. Now and then she'll run into someone on a TV show or a movie, and wonder where she knows them from. "And then I'll remember: oh yeah, I used to serve you Cobb salad." 32 | 33 | There were many long, dark nights of the soul. "Oh my God, every month, yeah, because you don't have a lot of money coming in. When I look back, it was one of the best times of my life, because you're so in it with your friends. But you do have those moments when you're like: have I given it a try, should I stop, should I quit? But, no. You have a family there, you have a space to put shows on. I would rather be doing what I love and living above a garage – which I did – than not." 34 | 35 | The call came in 2005. Wiig flew to New York for the first of several auditions with the Saturday Night Live creators. The audition format was standup, which she had never done before, and in front of a terrifying panel includingLorne Michaels, the legendary SNL producer, and Tina Fey. Wiig was required to unveil a range of characters of her own creation that might be suitable for the sketch show and, quivering up there alone on stage, she fully expected to be met with silence. 
When she heard a few laughs, she gathered strength, got through it and was called back for a second audition. After which, nothing. And then the new season started. "So I thought, right, pretty clear – thanks for coming. And then after the third show I got a call saying I was hired, come in…" 36 | 37 | Wiig joined the show at a time when it was undergoing a cultural transition. Fey was the first female head writer and has written about the formally macho culture of the show – men pissing in jars by their desks, etc, which she put to comic use in 30 Rock. It was tough, she says, walking into a workplace where everyone knew each other: "Kind of like going into someone's living room for a party and they are really comfortable and have their shoes off and are sitting on the couch and I walk in and am a little dressed up and don't know where to stand? They were all very welcoming and nice but I knew I wasn't at that place yet where I could take my shoes off." 38 | 39 | She was excited to be working with the likes of Fey, Amy Poehlerand Rachel Dratch, although Wiig is reluctant to describe the still testosterone-heavy environment at SNL as off-putting. "I mean, I mean, merely by numbers there are more men that work there, but I don't consider it… I don't even think about it. Men work there, women work there, we have a lot of amazing female writers on staff right now… There are more men, but I don't think anyone really…" 40 | 41 | Was she a fan of Fey's before she joined the show? 42 | 43 | "Um. I've watched the show since I was born. I mean I definitely admire all the stuff that she's accomplished, especially coming from SNL and being head writer, and then doing 30 Rock and all these movies and her book, I mean it's definitely something where you go, oh, that can happen. Someone can do that. She's done it. She deserves it." 
44 | 45 | To date, Bridesmaids has earned in the region of $286m worldwide; it doesn't need the qualifier "best female comedy" since it outgrossed Apatow's entire back catalogue, including Anchorman and The 40-Year-Old Virgin. Still, Wiig does not claim feminist dividends for the film – that it allowed women actors to be as gross on screen as men. She says when she and Mumolo were writing the shitting-in-the-street scene ("Can that be the title of the piece?"), it wasn't with an eye on levelling the playing field, nor was there much discussion of whether the market would tolerate that kind of vulgarity from women. No. "I think when you are doing anything creative and you think, 'What are the critics going to think?' instead of what you want to express, it can get a little muddy, and – I'm talking so seriously about this shitting-in-the-street – but with that in particular we were like, oh, this is a fun way to end the scene, and Annie used to do an impression of someone slowly realising they were shitting their pants, kind of slowly going down on to the ground. She would just do it as a joke, and it would always make me laugh really hard. She took it to a whole new level." 46 | 47 | Apatow had approached Wiig and asked her to write a script for him after they worked together on Knocked Up, in which she played a small pivotal role as Katherine Heigl's bitchy boss. In her five minutes on screen, Wiig managed to communicate brilliantly the gap between what her character was saying and thinking. She and Mumolo first conceived of Bridesmaids not as a wedding movie per se, but as a movie about friendship. "I mean, it's called Bridesmaids, I get that. But it's about women who, when they reach that age, whether it's in their 30s or not, thought they were supposed to be somewhere else. That's where we started from. And the fact that Annie had been to seven weddings in two years. 
And that she had friends who were marrying money and she'd showed up at the country club for the bridal shower with her wing mirrors duct-taped to her car, and at the end of the night had to crawl through her window because the front door would always swell when it was hot out. But if it's your best friend, you don't want to be complaining…" 48 | 49 | On paper at least, it didn't look too promising, with the generic title and the number of lame wedding movies in a seemingly exhausted genre. Apatow's name raised suspicions, too, about the use to which certain characters would be put, especially that of Megan, played by Melissa McCarthy, who looked like the inevitable one-fat-girl-in-the-group and the obvious butt of fat-girl jokes. In fact, McCarthy is the other break-out star of the film, and "the character that didn't care what anybody else thought. It was a lesson my character needed to learn. She doesn't care what anyone thinks, she's in her own world, but is generous and sweet. We wanted to have that opposite look on life, the character who seems at first like there was nothing she could say that would help, but…" 50 | 51 | The writing of the dialogue was relatively easy, says Wiig, compared with figuring out what should happen in each scene, and the film went through countless draft versions, crammed in around other work commitments, so that Mumolo, for example, would fly out to Mexico where Wiig was filming, to work on it for a weekend. In early drafts, the women ended up in Vegas, but that got chucked out when, over the four years of writing, it was used up in other wedding films such as The Hangover. 52 | 53 | Apart from the fact that it is very funny, Bridesmaids ultimately works because it has a kind of sweet sincerity and the friendship between the two lead characters seems real. It bemuses Wiig that the film has widely been described as "raunchy". It's really not raunchy. "Raunchy means like Porky's," she says and smiles. 
"Which is my next movie; it's going to be a Porky's prequel." 54 | 55 | After six years in New York, Wiig is finally at home in the city. It was tough in the early days, she says, and when friends came to visit she would burst into tears as they left. ("I was so embarrassed. I thought, oh my God, they're going to go back and say, 'Kristen's not good. She is noooot coping well.'") If accounts are to be believed, she was briefly married to an actor called Hayes Hargrove and currently lives with her partner, a film-maker called Brian Petsos, but she responds to even the mildest question about her domestic life with a frozen smile. She would, of course, rather talk about acting, and her success in her first lead role – "I felt like I had to do a good job or no one would ever invite me to the party again" – has, despite her scrupulous modesty, been rewarded with the kind of films she always hoped she'd walk into. In the Sean Penn film The Comedian, which is still in the early stages of production, Wiig will co-star with Robert De Niro. It will be the real test of whether she is leading lady material, and whether she can carry a film without jokes. "I don't really think about it," says Wiig. "When you're in it, you're in it." 56 | 57 | In the meantime, she has sketches to write and shoot as part of the gruelling schedule of Saturday Night Live. After the interview, she is due in at the office for the weekly writing night, when everyone is required to be in at 4pm and stay until the following morning. Wiig is riding so high at the moment that when, as we leave, I ask her to confirm her age, I'm surprised when she grimaces. Yes, she says, she's 38. Why the face? Under her breath, like a dangerous heresy, she says, "I feel like women are asked their age more than men." And she snaps on a smile and leaves the restaurant. --------------------------------------------------------------------------------