├── images
├── gdelt.png
└── article.jpeg
├── .gitignore
├── .travis.yml
├── src
├── main
│ ├── resources
│ │ ├── com
│ │ │ ├── gravity
│ │ │ │ └── goose
│ │ │ │ │ ├── images
│ │ │ │ │ └── known-image-css.txt
│ │ │ │ │ ├── statichtml
│ │ │ │ │ ├── msn1_result.txt
│ │ │ │ │ ├── issue_24_result.txt
│ │ │ │ │ ├── issue_24.txt
│ │ │ │ │ └── guardian1_result.txt
│ │ │ │ │ └── text
│ │ │ │ │ └── stopwords-en.txt
│ │ │ └── aamend
│ │ │ │ └── spark
│ │ │ │ └── gdelt
│ │ │ │ └── reference
│ │ │ │ ├── cameoReligion.txt
│ │ │ │ ├── cameoType.txt
│ │ │ │ ├── cameoGroup.txt
│ │ │ │ ├── cameoCountry.txt
│ │ │ │ ├── cameoEthnic.txt
│ │ │ │ └── cameoEvent.txt
│ │ └── log4j.properties
│ └── scala
│ │ └── com
│ │ ├── gravity
│ │ └── goose
│ │ │ ├── network
│ │ │ ├── MaxBytesException.scala
│ │ │ ├── AbstractHtmlFetcher.scala
│ │ │ ├── NotHtmlException.scala
│ │ │ ├── HttpExceptions.scala
│ │ │ └── HtmlFetcher.scala
│ │ │ ├── images
│ │ │ ├── DepthTraversal.scala
│ │ │ ├── SecretGifException.scala
│ │ │ ├── Image.scala
│ │ │ ├── ImageDetails.scala
│ │ │ ├── ImageExtractor.scala
│ │ │ ├── ImageSaver.scala
│ │ │ └── ImageUtils.scala
│ │ │ ├── extractors
│ │ │ ├── TagsEvaluator.scala
│ │ │ ├── StandardContentExtractor.scala
│ │ │ ├── AdditionalDataExtractor.scala
│ │ │ ├── Extractor.scala
│ │ │ └── PublishDateExtractor.scala
│ │ │ ├── cleaners
│ │ │ └── StandardDocumentCleaner.scala
│ │ │ ├── outputformatters
│ │ │ ├── StandardOutputFormatter.scala
│ │ │ └── OutputFormatter.scala
│ │ │ ├── text
│ │ │ ├── HashUtils.scala
│ │ │ ├── StringSplitter.scala
│ │ │ ├── string.scala
│ │ │ ├── StringReplacement.scala
│ │ │ ├── WordStats.scala
│ │ │ ├── StopWords.scala
│ │ │ └── ReplaceSequence.scala
│ │ │ ├── utils
│ │ │ ├── FileHelper.scala
│ │ │ ├── URLHelper.scala
│ │ │ └── Logging.scala
│ │ │ ├── spark
│ │ │ ├── package.scala
│ │ │ └── GooseFetcher.scala
│ │ │ ├── Goose.scala
│ │ │ ├── Article.scala
│ │ │ ├── Configuration.scala
│ │ │ └── Crawler.scala
│ │ └── aamend
│ │ └── spark
│ │ └── gdelt
│ │ ├── reference
│ │ ├── CountryCodes.scala
│ │ ├── GcamCodes.scala
│ │ └── CameoCodes.scala
│ │ └── ContentFetcher.scala
└── test
│ ├── scala
│ └── com
│ │ └── aamend
│ │ └── spark
│ │ └── gdelt
│ │ ├── SparkSpec.scala
│ │ ├── ContentFetcherTest.scala
│ │ ├── TTest.scala
│ │ └── GdeltParserTest.scala
│ └── resources
│ └── com
│ └── aamend
│ └── spark
│ └── gdelt
│ ├── normDaily.csv
│ └── normDailyByCountry.csv
├── LICENSE
└── pom.xml
/images/gdelt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/gdelt.png
--------------------------------------------------------------------------------
/images/article.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aamend/spark-gdelt/HEAD/images/article.jpeg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 | derby.log
5 | spark-warehouse
6 | metastore_db
7 | .DS_Store
8 | movejar.sh
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | install: mvn --quiet install -Dmaven.javadoc.skip=true -Dgpg.skip=true
3 | script: mvn test -Dmaven.javadoc.skip=true -DargLine="-Xmx2G"
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/images/known-image-css.txt:
--------------------------------------------------------------------------------
1 | latimes.com^thumbnail
2 | cnn.com^storytext|cnn_strycntntlft
3 | foxnews.com^entry-content
4 | msn.com^articleText
5 | go.com^mediaimage
6 | buzznet.com^itembody
7 | time.com^entry-content
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/MaxBytesException.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose.network
2 |
3 | /**
4 | * Created by Jim Plush
5 | * User: jim
6 | * Date: 8/14/11
7 | */
8 |
/**
 * Marker exception type in the Goose network package.
 *
 * It carries no message and no cause; the name suggests it signals that a
 * byte limit was exceeded, but the throwing site is not part of this file.
 */
class MaxBytesException extends Exception
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/DepthTraversal.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose.images
2 |
3 | import org.jsoup.nodes.Element
4 |
5 | /**
6 | * Created by Jim Plush
7 | * User: jim
8 | * Date: 8/18/11
9 | */
10 |
/**
 * Immutable record pairing a DOM element with two integer depth values.
 *
 * @param node         the jsoup element this record refers to
 * @param parentDepth  depth value associated with the element's parents
 *                     (exact semantics are defined by callers not shown here)
 * @param siblingDepth depth value associated with the element's siblings
 *                     (exact semantics are defined by callers not shown here)
 */
case class DepthTraversal(
  node: Element,
  parentDepth: Int,
  siblingDepth: Int
)
12 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 | # Direct log messages to stdout
4 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
5 | log4j.appender.stdout.Target=System.out
6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
7 | log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %40.40c:%4L - %m%n
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/TagsEvaluator.scala:
--------------------------------------------------------------------------------
1 | package org.jsoup.select
2 |
3 | import org.jsoup.nodes.Element
4 |
5 | /**
6 | * Created by IntelliJ IDEA.
7 | * Author: Robbie Coleman
8 | * Date: 6/12/12
9 | * Time: 12:04 PM
10 | */
11 |
/**
 * jsoup `Evaluator` that accepts an element when its tag name is a member of
 * a fixed set of tag names.
 *
 * @param tags the tag names that should match
 */
class TagsEvaluator(tags: scala.collection.Set[String]) extends Evaluator {

  /**
   * @param root    root of the evaluation (not consulted by this evaluator)
   * @param element candidate element
   * @return `true` when the candidate's tag name is contained in `tags`
   */
  def matches(root: Element, element: Element) = {
    val tagName = element.tagName()
    tags.contains(tagName)
  }
}

/** Factory for [[TagsEvaluator]] taking the tag names as varargs. */
object TagsEvaluator {

  /** Builds an evaluator from the given tag names; duplicates collapse into the set. */
  def apply(tags: String*): TagsEvaluator = {
    val tagSet = tags.toSet
    new TagsEvaluator(tagSet)
  }
}
19 |
--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoReligion.txt:
--------------------------------------------------------------------------------
1 | CODE LABEL
2 | ADR African Diasporic Religion
3 | ALE Alewi
4 | ATH Agnostic
5 | BAH Bahai Faith
6 | BUD Buddhism
7 | CHR Christianity
8 | CON Confucianism
9 | CPT Coptic
10 | CTH Catholic
11 | DOX Orthodox
12 | DRZ Druze
13 | HIN Hinduism
14 | HSD Hasidic
15 | ITR Indigenous Tribal Religion
16 | JAN Jainism
17 | JEW Judaism
18 | JHW Jehovah's Witness
19 | LDS Latter Day Saints
20 | MOS Muslim
21 | MRN Maronite
22 | NRM New Religious Movement
23 | PAG Pagan
24 | PRO Protestant
25 | SFI Sufi
26 | SHI Shia
27 | SHN Old Shinto School
28 | SIK Sikh
29 | SUN Sunni
30 | TAO Taoist
31 | UDX Ultra-Orthodox
32 | ZRO Zoroastrianism
--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/SparkSpec.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.sql.SparkSession
5 | import org.scalatest.FunSuite
6 |
/**
 * Base trait for Spark-backed ScalaTest suites.
 *
 * Mixing it in turns off log4j output for the "org" and "akka" logger
 * hierarchies and provides [[sparkTest]], which hands a local
 * `SparkSession` to the test body and stops it afterwards.
 */
trait SparkSpec extends FunSuite {

  // Turn off logging for the two logger hierarchies below.
  Seq("org", "akka").foreach { loggerName =>
    Logger.getLogger(loggerName).setLevel(Level.OFF)
  }

  /**
   * Registers a test that runs with a local `SparkSession`.
   *
   * @param name test name, also used as the Spark application name
   * @param f    test body receiving the session
   */
  def sparkTest(name: String)(f: SparkSession => Unit): Unit =
    this.test(name) {
      val builder = SparkSession
        .builder()
        .appName(name)
        .master("local")
        .config("spark.default.parallelism", "1")
      val session = builder.getOrCreate()

      // Always stop the session, even when the test body throws.
      try f(session)
      finally session.stop()
    }
}
--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/CountryCodes.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt.reference
2 |
3 | import com.aamend.spark.gdelt.CountryCode
4 | import com.aamend.spark.gdelt.T
5 | import org.apache.spark.sql.{Dataset, SparkSession}
6 |
7 | import scala.io.Source
8 |
/**
 * Loader for the country-code reference table bundled as the classpath
 * resource `countryInfo.txt` (resolved relative to this object's class).
 */
object CountryCodes {

  /**
   * Reads the tab-separated resource, skips its first (header) line and
   * converts every remaining line into a [[CountryCode]].
   *
   * Columns are read positionally: iso, iso3, isoNumeric, fips, country
   * (the country name is lower-cased). Each field is wrapped with `T`.
   *
   * @param spark session providing the implicit encoders for `toDS()`
   * @return a dataset with one row per data line of the resource
   * @throws IllegalArgumentException if the resource is not on the classpath
   */
  def load(spark: SparkSession): Dataset[CountryCode] = {
    import spark.implicits._

    val resourceName = "countryInfo.txt"
    val stream = this.getClass.getResourceAsStream(resourceName)
    // Fail with a descriptive message instead of an opaque NullPointerException.
    require(stream != null, s"Resource not found on classpath: $resourceName")

    val source = Source.fromInputStream(stream, "UTF-8")
    // Read the lines eagerly so the stream can be closed before they are used.
    val lines =
      try source.getLines().drop(1).toList
      finally source.close()

    lines.map { line =>
      val tokens = line.split("\t")
      CountryCode(
        iso = T(() => tokens(0)),
        iso3 = T(() => tokens(1)),
        isoNumeric = T(() => tokens(2)),
        fips = T(() => tokens(3)),
        country = T(() => tokens(4).toLowerCase())
      )
    }.toDS()
  }

}
26 |
27 |
--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/ContentFetcherTest.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt
2 |
3 | import org.apache.spark.ml.Pipeline
4 | import org.scalatest.Matchers
5 |
/**
 * End-to-end smoke test for `ContentFetcher`.
 *
 * It builds a one-row DataFrame holding an article URL, configures the
 * fetcher's input/output columns and ImageMagick binary paths, runs the
 * transformer and prints the result. It makes no assertions.
 */
class ContentFetcherTest extends SparkSpec with Matchers {

  sparkTest("testing E2E pipeline") { spark =>

    import spark.implicits._

    val articleUrls = List(
      "https://www.theguardian.com/world/2018/jun/01/mariano-rajoy-ousted-as-spain-prime-minister"
    )
    val gdeltDf = articleUrls.toDF("sourceUrl")

    val contentFetcher = new ContentFetcher()
      .setInputCol("sourceUrl")
      .setOutputImageUrlCol("imageUrl")
      .setOutputImageBase64Col("imageBase64")
      .setImagemagickConvert("/usr/local/bin/convert")
      .setImagemagickIdentify("/usr/local/bin/identify")

    // Print the full, untruncated cell contents.
    val contentDF = contentFetcher.transform(gdeltDf)
    contentDF.show(false)
  }

}
25 |
--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/TTest.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt
2 |
3 | import org.scalatest.{FlatSpec, Matchers}
4 |
/**
 * Specification for the `T` helper: a thunk that yields a usable value is
 * expected to come back as `Some(value)`, while nulls, thrown exceptions and
 * blank strings are expected to come back as `None`.
 */
class TTest extends FlatSpec with Matchers {

  "null" should "return None" in {
    T(() => null) shouldBe None
    // Dereferencing null throws inside the thunk.
    T(() => null.toString) shouldBe None
  }

  "Integer" should "return Int" in {
    T(() => "1".toInt) shouldBe Some(1)
    T(() => "a".toInt) shouldBe None
  }

  "Long" should "return Long" in {
    T(() => "1".toLong) shouldBe Some(1L)
    T(() => "a".toLong) shouldBe None
  }

  "Float" should "return Float" in {
    T(() => "1.0".toFloat) shouldBe Some(1.0)
    T(() => "a".toFloat) shouldBe None
  }

  "String" should "return String" in {
    T(() => "1") shouldBe Some("1")
    // Surrounding whitespace is expected to be trimmed away.
    T(() => " 1 ") shouldBe Some("1")
    T(() => " ") shouldBe None
    T(() => "") shouldBe None
  }

}
35 |
--------------------------------------------------------------------------------
/src/main/resources/com/aamend/spark/gdelt/reference/cameoType.txt:
--------------------------------------------------------------------------------
1 | CODE LABEL
2 | COP Police forces
3 | GOV Government
4 | INS Insurgents
5 | JUD Judiciary
6 | MIL Military
7 | OPP Political Opposition
8 | REB Rebels
9 | SEP Separatist Rebels
10 | SPY State Intelligence
11 | UAF Unaligned Armed Forces
12 | AGR Agriculture
13 | BUS Business
14 | CRM Criminal
15 | CVL Civilian
16 | DEV Development
17 | EDU Education
18 | ELI Elites
19 | ENV Environmental
20 | HLH Health
21 | HRI Human Rights
22 | LAB Labor
23 | LEG Legislature
24 | MED Media
25 | REF Refugees
26 | MOD Moderate
27 | RAD Radical
28 | AMN Amnesty International
29 | IRC Red Cross
30 | GRP Greenpeace
31 | UNO United Nations
32 | PKO Peacekeepers
33 | UIS Unidentified State Actor
34 | IGO Inter-Governmental Organization
35 | IMG International Militarized Group
36 | INT International/Transnational Generic
37 | MNC Multinational Corporation
38 | NGM Non-Governmental Movement
39 | NGO Non-Governmental Organization
40 | UIS Unidentified State Actor
41 | SET Settler
--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/GcamCodes.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt.reference
2 |
3 | import com.aamend.spark.gdelt.GcamCode
4 | import com.aamend.spark.gdelt.T
5 | import org.apache.spark.sql.{Dataset, SparkSession}
6 |
7 | import scala.io.Source
8 |
/**
 * Loader for the GCAM reference table bundled as the classpath resource
 * `gcam.txt` (resolved relative to this object's class).
 */
object GcamCodes {

  /**
   * Reads the tab-separated resource, skips its first (header) line and
   * converts every remaining line into a [[GcamCode]].
   *
   * Columns are read positionally: gcamCode, dictionaryId, dimensionId,
   * dictionaryType, languageCode, dictionaryHumanName, dimensionHumanName,
   * dictionaryCitation. Each field is wrapped with `T`.
   *
   * @param spark session providing the implicit encoders for `toDS()`
   * @return a dataset with one row per data line of the resource
   * @throws IllegalArgumentException if the resource is not on the classpath
   */
  def load(spark: SparkSession): Dataset[GcamCode] = {
    import spark.implicits._

    val resourceName = "gcam.txt"
    val stream = this.getClass.getResourceAsStream(resourceName)
    // Fail with a descriptive message instead of an opaque NullPointerException.
    require(stream != null, s"Resource not found on classpath: $resourceName")

    val source = Source.fromInputStream(stream, "UTF-8")
    // Read the lines eagerly so the stream can be closed before they are used.
    val lines =
      try source.getLines().drop(1).toList
      finally source.close()

    lines.map { line =>
      val tokens = line.split("\t")
      GcamCode(
        gcamCode = T(() => tokens(0)),
        dictionaryId = T(() => tokens(1)),
        dimensionId = T(() => tokens(2)),
        dictionaryType = T(() => tokens(3)),
        languageCode = T(() => tokens(4)),
        dictionaryHumanName = T(() => tokens(5)),
        dimensionHumanName = T(() => tokens(6)),
        dictionaryCitation = T(() => tokens(7))
      )
    }.toDS()
  }

}
29 |
30 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/AbstractHtmlFetcher.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose.network
2 |
3 | import com.gravity.goose.Configuration
4 | import org.apache.http.client.HttpClient
5 |
/**
 * Contract for the component that downloads HTML on behalf of Goose.
 *
 * Supply your own implementation in place of
 * `com.gravity.goose.network.HtmlFetcher` through the configuration to take
 * complete control over fetching.
 *
 * Author: Robbie Coleman
 * Date: 10/13/12
 */
trait AbstractHtmlFetcher {

  /**
   * Fetches the HTML served at `url`.
   *
   * @param config overrides and tweaks
   * @param url    the address to access and retrieve content from
   * @return `Some` with the response body, or `None` if the HTML could not be retrieved
   */
  def getHtml(config: Configuration, url: String): Option[String]

  /**
   * Shared HTTP client, also used for making image calls.
   *
   * @return a fully configured and initialized instance for shared use
   */
  def getHttpClient: HttpClient
}
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/SecretGifException.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.images
19 |
20 | /**
21 | * Created by Jim Plush
22 | * User: jim
23 | * Date: 8/18/11
24 | */
/**
 * Marker exception type in the Goose images package.
 *
 * It carries no message and no cause; the throwing site is not part of
 * this file.
 */
class SecretGifException extends Exception
27 |
28 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/cleaners/StandardDocumentCleaner.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.cleaners
19 |
20 | /**
21 | * Created by Jim Plush
22 | * User: jim
23 | * Date: 8/16/11
24 | */
25 |
/**
 * Default document cleaner: inherits everything from `DocumentCleaner`
 * without overriding or adding any member.
 */
class StandardDocumentCleaner extends DocumentCleaner
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/Image.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.images
19 |
20 | /**
21 | * Created by Jim Plush
22 | * User: jim
23 | * Date: 8/18/11
24 | */
25 |
/**
 * Mutable holder for an image reference.
 *
 * Both fields start out as the empty string and are meant to be assigned
 * by whoever populates the instance.
 */
class Image {

  /** Source location of the image; empty until set. */
  var imageSrc = ""

  /** Base64 text for the image; empty until set. */
  var imageBase64 = ""
}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/StandardContentExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.extractors
19 |
20 | import com.gravity.goose.utils.Logging
21 |
22 |
23 | /**
24 | * Created by Jim Plush
25 | * User: jim
26 | * Date: 8/15/11
27 | */
28 |
/**
 * Singleton content extractor that uses `ContentExtractor` exactly as
 * defined, with no overrides.
 */
object StandardContentExtractor extends ContentExtractor {}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/NotHtmlException.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.network
20 |
21 | /**
22 | * Created by Jim Plush
23 | * User: jim
24 | * Date: 8/14/11
25 | */
26 |
/**
 * Exception whose message reports that no HTML came back for a URL.
 *
 * @param url the address that was requested; it is embedded in the message
 */
class NotHtmlException(url: String)
  extends Exception("No HTML returned for url: " + url)
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/outputformatters/StandardOutputFormatter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.outputformatters
20 |
21 | import com.gravity.goose.utils.Logging
22 |
23 | /**
24 | * Created by Jim Plush
25 | * User: jim
26 | * Date: 8/17/11
27 | */
28 |
/**
 * Singleton output formatter that uses `OutputFormatter` exactly as
 * defined, with `Logging` mixed in.
 */
object StandardOutputFormatter extends OutputFormatter with Logging {}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/AdditionalDataExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.extractors
19 |
20 | import org.jsoup.nodes.Element
21 |
/**
 * Extension point for extracting data that the article model does not
 * already capture. Subclass it and override `extract` to return your own
 * key/value pairs; this base implementation extracts nothing.
 */
class AdditionalDataExtractor extends Extractor[Map[String, String]] {

  /**
   * @param rootElement root element of the document being processed
   *                    (not consulted by this base implementation)
   * @return an empty map
   */
  def extract(rootElement: Element): Map[String, String] = Map.empty
}
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/msn1_result.txt:
--------------------------------------------------------------------------------
1 | "Head to the supermarket an hour before closing time. Some stores mark down prepared foods and bakery items then because they can't sell them the following day. You could get a rotisserie chicken or freshly baked cookies for 50 percent off, or nab two sushi meals for the price of one. If you're planning to host a party or some other gathering, it's worth your time to ask the deli or bakery manager for a 5 to 10 percent discount off your catering order. Also, keep an eye out for online coupons: Some grocery stores accept coupons printed out from sites like TheGroceryGame.com, ShopAtHome.com, and CouponMom.com, even though they rarely publicize the fact. (Find out your store's policy at the customer-service counter.) It also pays to check the market's own website. You could find weekly deals there that it doesn't advertise anywhere else, including its in-store flyers.
2 |
3 | "And even though it's convenient to do all your shopping in one place, avoid going to a grocery store for kitchen supplies, like measuring cups and cookie sheets, or seasonal items, like holiday decorations and gift bags. These products will have inflated prices. Buy them at a big-box chain, like Target or Walmart, instead."
4 |
5 | More from Bing and MSN Lifestyle Site Search: Get additional content on saving on your grocery bill
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/HashUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | import java.security.MessageDigest
22 |
23 | /**
24 | * Created by Jim Plush
25 | * User: jim
26 | * Date: 8/14/11
27 | */
28 |
/** Hashing helpers. */
object HashUtils {

  /**
   * Computes the MD5 digest of a string.
   *
   * The text is encoded as UTF-8 before hashing so that the result does not
   * depend on the JVM's platform default charset.
   *
   * @param s the text to hash; must not be null
   * @return the digest as 32 lower-case hexadecimal characters
   */
  def md5(s: String): String = {
    // A freshly obtained MessageDigest is already in its initial state.
    val digest = MessageDigest.getInstance("MD5")
    val bytes = digest.digest(s.getBytes(java.nio.charset.StandardCharsets.UTF_8))

    // Mask to 0..255 so negative bytes do not render as sign-extended ints.
    bytes.map(b => "%02x".format(b & 0xFF)).mkString
  }

}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StringSplitter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 3:53 PM
26 | */
27 |
28 | import java.util.regex.Pattern
29 |
/**
 * Splits strings around matches of a regular expression that is compiled
 * once at construction time.
 *
 * The no-argument constructor leaves the pattern unset (null); only
 * instances built through `this(pattern)` carry a pattern.
 */
class StringSplitter {

  // Stays null unless the auxiliary constructor assigns a compiled pattern.
  private var pattern: Pattern = null

  /**
   * @param pattern regular expression used as the delimiter
   */
  def this(pattern: String) = {
    this()
    this.pattern = Pattern.compile(pattern)
  }

  /**
   * @param input text to split
   * @return `string.emptyArray` when `input` is null or empty, otherwise the
   *         pieces produced by splitting `input` around the pattern
   */
  def split(input: String): Array[String] =
    if (string.isNullOrEmpty(input)) string.emptyArray
    else pattern.split(input)
}
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/utils/FileHelper.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.utils
20 |
21 | import org.apache.commons.io.IOUtils
22 | import java.io.{IOException, InputStream}
23 |
24 |
25 | /**
26 | * Created by Jim Plush
27 | * User: jim
28 | * Date: 8/16/11
29 | */
30 |
/** Helpers for reading classpath resources. */
object FileHelper extends Logging {

  /**
   * Reads a classpath resource fully into a string, decoding it as UTF-8.
   *
   * The resource is resolved with `cls.getResourceAsStream(filename)` and
   * the stream is closed once reading has finished or failed.
   *
   * NOTE(review): when the resource does not exist the stream is null and is
   * handed to `IOUtils.toString` unchanged, as before; that presumably
   * fails with a NullPointerException, which is not caught here. Confirm.
   *
   * @param filename resource name, resolved relative to `cls`
   * @param cls      class whose class loader and package locate the resource
   * @return the resource contents, or the empty string if an `IOException`
   *         occurred while reading (the error is logged as a warning)
   */
  def loadResourceFile[A](filename: String, cls: Class[A]): String = {
    val is: InputStream = cls.getResourceAsStream(filename)
    try {
      IOUtils.toString(is, "UTF-8")
    } catch {
      case e: IOException =>
        warn(e, e.toString)
        ""
    } finally {
      // Release the stream; a failure to close is logged, not propagated.
      if (is != null) {
        try is.close()
        catch {
          case e: IOException => warn(e, e.toString)
        }
      }
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/string.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 12:11 AM
26 | */
27 |
/** Small string utilities and shared constants. */
object string {

  /** The empty string. */
  val empty: String = ""

  /** A one-element array holding the empty string. */
  val emptyArray: Array[String] = Array[String](empty)

  /** Splitter that breaks text on single space characters. */
  var SPACE_SPLITTER: StringSplitter = new StringSplitter(" ")

  /**
   * @param input text to test
   * @return `true` when `input` is null or has length zero
   */
  def isNullOrEmpty(input: String): Boolean =
    input == null || input.length == 0

  /**
   * Parses `input` as an `Int`.
   *
   * @param input text to parse
   * @return `Some` with the parsed value, or `None` when parsing throws any `Exception`
   */
  def tryToInt(input: String): Option[Int] =
    try Some(input.toInt)
    catch {
      case _: Exception => None
    }
}
47 |
48 |
49 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/Extractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.extractors
19 |
20 | import org.jsoup.nodes.Element
21 |
22 | /**
23 | * Created by IntelliJ IDEA.
24 | * User: robbie
25 | * Date: 5/19/11
26 | * Time: 2:45 PM
27 | */
28 | /**
29 | * Encapsulates the process of extracting some type T from an article. This will be called by the {@link com.jimplush.goose.ContentExtractor#extractContent(String)} method and will be passed to {@link com.jimplush.goose.Article#setPublishDate(java.sql.Date)}
30 | * @param T
38 | */
39 | def extract(rootElement: Element): T
40 | }
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageDetails.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.images
19 |
20 | /**
21 | * Created by Jim Plush
22 | * User: jim
23 | * Date: 8/18/11
24 | */
25 |
26 | /**
27 | * holds the details of the result of inspecting an image
28 | * @author Jim Plush
29 | *
30 | */
31 | class ImageDetails {
32 |   /** Backing field for the image width; starts at 0. */
33 |   private var width: Int = 0
34 |
35 |   /** Backing field for the image height; starts at 0. */
36 |   private var height: Int = 0
37 |
38 |   /** Backing field for the image mimeType (JPEG / PNG); starts as null. */
39 |   private var mimeType: String = _
40 |
41 |   /** @return the stored width */
42 |   def getWidth: Int = this.width
43 |
44 |   /** Stores a new width. */
45 |   def setWidth(width: Int): Unit = {
46 |     this.width = width
47 |   }
48 |
49 |   /** @return the stored height */
50 |   def getHeight: Int = this.height
51 |
52 |   /** Stores a new height. */
53 |   def setHeight(height: Int): Unit = {
54 |     this.height = height
55 |   }
56 |
57 |   /** @return the stored mimeType */
58 |   def getMimeType: String = this.mimeType
59 |
60 |   /** Stores a new mimeType. */
61 |   def setMimeType(mimeType: String): Unit = { this.mimeType = mimeType }
62 | }
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StringReplacement.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 | * Created by IntelliJ IDEA.
23 | * User: robbie
24 | * Date: 5/13/11
25 | * Time: 11:38 AM
26 | */
27 |
28 | import java.util.regex.Pattern
29 |
30 | object StringReplacement {
31 |   /**
32 |    * Compiles `pattern` as a regex paired with `replaceWith`; throws IllegalArgumentException when `pattern` is null or empty.
33 |    */
34 |   def compile(pattern: String, replaceWith: String): StringReplacement = {
35 |     if (string.isNullOrEmpty(pattern)) throw new IllegalArgumentException("Patterns must not be null or empty!")
36 |     new StringReplacement(Pattern.compile(pattern), replaceWith)
37 |   }
38 | }
39 |
40 | class StringReplacement {
41 |   private var pattern: Pattern = null
42 |   private var replaceWith: String = null
43 |
44 |   private def this(pattern: Pattern, replaceWith: String) {
45 |     this()
46 |     this.pattern = pattern
47 |     this.replaceWith = replaceWith
48 |   }
49 |
50 |   /** Replaces every match of the pattern in `input`; a null or empty `input` yields the empty string. */
51 |   def replaceAll(input: String): String = if (string.isNullOrEmpty(input)) string.empty else pattern.matcher(input).replaceAll(replaceWith)
52 | }
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/src/test/resources/com/aamend/spark/gdelt/normDaily.csv:
--------------------------------------------------------------------------------
1 | 19200101,87765
2 | 19200102,121054
3 | 19200103,153580
4 | 19200104,110228
5 | 19200105,63362
6 | 19200106,85
7 | 19790101,661
8 | 19790102,976
9 | 19790103,1060
10 | 19790104,950
11 | 19790105,1027
12 | 19790106,644
13 | 19790107,884
14 | 19790108,1291
15 | 19790109,1287
16 | 19790110,1290
17 | 19790111,866
18 | 19790112,1443
19 | 19790113,726
20 | 19790114,851
21 | 19790115,1138
22 | 19790116,1290
23 | 19790117,1279
24 | 19790118,1353
25 | 19790119,1324
26 | 19790120,1165
27 | 19790121,744
28 | 19790122,1065
29 | 19790123,1277
30 | 19790124,1079
31 | 19790125,1156
32 | 19790126,1563
33 | 19790127,780
34 | 19790128,983
35 | 19790129,1285
36 | 19790130,1168
37 | 19790131,925
38 | 19790201,902
39 | 19790202,1625
40 | 19790203,738
41 | 19790204,602
42 | 19790205,1332
43 | 19790206,1170
44 | 19790207,1218
45 | 19790208,1286
46 | 19790209,1383
47 | 19790210,825
48 | 19790211,714
49 | 19790212,944
50 | 19790213,1056
51 | 19790214,1184
52 | 19790215,1159
53 | 19790216,1473
54 | 19790217,1216
55 | 19790218,1072
56 | 19790219,1581
57 | 19790220,1246
58 | 19790221,1617
59 | 19790222,1165
60 | 19790223,1685
61 | 19790224,875
62 | 19790225,1255
63 | 19790226,1364
64 | 19790227,1247
65 | 19790228,1516
66 | 19790301,789
67 | 19790302,1237
68 | 19790303,507
69 | 19790304,648
70 | 19790305,839
71 | 19790306,864
72 | 19790307,842
73 | 19790308,648
74 | 19790309,1145
75 | 19790310,738
76 | 19790311,719
77 | 19790312,1465
78 | 19790313,969
79 | 19790314,1034
80 | 19790315,1420
81 | 19790316,2019
82 | 19790317,1349
83 | 19790318,1056
84 | 19790319,1312
85 | 19790320,1450
86 | 19790321,1387
87 | 19790322,1354
88 | 19790323,1630
89 | 19790324,1125
90 | 19790325,967
91 | 19790326,1043
92 | 19790327,1297
93 | 19790328,1244
94 | 19790329,1286
95 | 19790330,1661
96 | 19790331,1120
97 | 19790401,1240
98 | 19790402,1038
99 | 19790403,1193
100 | 19790404,1276
101 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/network/HttpExceptions.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose.network
2 |
3 | /**
4 | * Created by IntelliJ IDEA.
5 | * Author: Robbie Coleman
6 | * Date: 11/2/11
7 | * Time: 10:25 AM
8 | */
9 |
10 | class LoggableException(msg: String, innerEx: Exception = null) extends Exception(msg, innerEx) {
11 |   /**
12 |    * Message made of this exception's class name, " ==> " and `msg`, followed by the type
13 |    * and message of `innerEx` when one was supplied. Computed once, on first access.
14 |    */
15 |   override lazy val getMessage = {
16 |     val causeDetails = Option(innerEx).map(ex => "%n\tand inner Exception of type %s:%n\t\tmessage: %s".format(ex.getClass.getName, ex.getMessage)).getOrElse("")
17 |     getClass.getName + " ==> " + msg + causeDetails
18 |   }
19 | }
20 |
21 | class NotFoundException(url: String) extends LoggableException("SERVER RETURNED 404 FOR LINK: " + url) // HttpStatusValidator maps status 404 to this
22 | class BadRequestException(url: String) extends LoggableException("Bad Request for URL: " + url) // HttpStatusValidator maps status 400 to this
23 | class NotAuthorizedException(url: String, statusCode: Int = 403) extends LoggableException("Not authorized (statusCode: %d) to access URL: %s".format(statusCode, url)) // HttpStatusValidator uses this for every 401-499 status other than 404, not only 401/403
24 | class ServerErrorException(url: String, statusCode: Int = 500) extends LoggableException("Server Error! Status code returned: %d for URL: %s".format(statusCode, url)) // HttpStatusValidator uses this for any status above 499
25 | class UnhandledStatusCodeException(url: String, statusCode: Int) extends LoggableException("Received HTTP statusCode: %d from URL: %s and did not know how to handle it!".format(statusCode, url)) // HttpStatusValidator's fallback when no other case matches
26 |
27 | object HttpStatusValidator {
28 |   /** Right("OK") for status 200; otherwise a Left holding the exception that describes `statusCode`. */
29 |   def validate(url: String, statusCode: Int): Either[Exception, String] =
30 |     if (statusCode == 200) Right("OK")
31 |     else if (statusCode == 400) Left(new BadRequestException(url))
32 |     else if (statusCode == 404) Left(new NotFoundException(url))
33 |     else if (statusCode > 400 && statusCode < 500) Left(new NotAuthorizedException(url, statusCode))
34 |     else if (statusCode > 499) Left(new ServerErrorException(url, statusCode))
35 |     else Left(new UnhandledStatusCodeException(url, statusCode))
36 | }
37 |
38 | class ImageFetchException(imgSrc: String, ex: Exception = null) extends LoggableException("Failed to fetch image file from imgSrc: " + imgSrc, ex) // `ex`, when given, becomes the LoggableException's inner exception
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/WordStats.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | import java.util.ArrayList
22 | import java.util.List
23 |
24 | /**
25 | * User: Jim Plush
26 | * Date: Oct 29, 2010
27 | * Time: 3:59:44 PM
28 | */
29 | object WordStats {
30 |   /**
31 |    * Shared instance created with zero counts and an empty stop word list.
32 |    * It is mutable and the reference is reassignable, so callers should treat it as read-only.
33 |    */
34 |   var EMPTY: WordStats = new WordStats
35 | }
36 |
37 | /**
38 |  * Mutable holder for the word statistics gathered for a piece of text:
39 |  * how many words it has, how many of them are stop words, and which ones.
40 |  */
41 | class WordStats {
42 |
43 |   /** total number of stopwords or good words that we can calculate */
44 |   var stopWordCount: Int = 0
45 |
46 |   /** total number of words on a node */
47 |   var wordCount: Int = 0
48 |
49 |   /** holds an actual list of the stop words we found */
50 |   var stopWords: List[String] = new ArrayList[String]
51 |
52 |   /** @return the current stop word list */
53 |   def getStopWords: List[String] = stopWords
54 |
55 |   /** Replaces the stop word list with `words`. */
56 |   def setStopWords(words: List[String]): Unit = {
57 |     stopWords = words
58 |   }
59 |
60 |   /** @return the current stop word count */
61 |   def getStopWordCount: Int = stopWordCount
62 |
63 |   /** Stores `wordcount` as the stop word count. */
64 |   def setStopWordCount(wordcount: Int): Unit = {
65 |     stopWordCount = wordcount
66 |   }
67 |
68 |   /** @return the current total word count */
69 |   def getWordCount: Int = wordCount
70 |
71 |   /** Stores `cnt` as the total word count. */
72 |   def setWordCount(cnt: Int): Unit = {
73 |     wordCount = cnt
74 |   }
75 |
76 | }
77 |
78 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/spark/package.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose
2 |
3 | import java.sql.Date
4 |
5 | import org.apache.commons.lang.StringUtils
6 |
7 | import scala.util.Try
8 |
9 | package object spark {
10 |
11 |   val ANNOTATOR_TITLE = "title"
12 |   val ANNOTATOR_CONTENT = "content"
13 |   val ANNOTATOR_DESCRIPTION = "description"
14 |   val ANNOTATOR_KEYWORDS = "keywords"
15 |   val ANNOTATOR_PUBLISH_DATE = "publishDate"
16 |
17 |   // Every supported annotator, in declaration order
18 |   val ANNOTATORS = Array(ANNOTATOR_TITLE, ANNOTATOR_CONTENT, ANNOTATOR_DESCRIPTION, ANNOTATOR_KEYWORDS, ANNOTATOR_PUBLISH_DATE)
19 |
20 |   /**
21 |    * Maps each URL to a GooseArticle built from what `goose` extracts for it (null or empty fields become None).
22 |    * A URL whose extraction throws is still emitted, as a GooseArticle carrying only that URL.
23 |    */
24 |   def scrapeArticles(it: Iterator[String], goose: Goose): Iterator[GooseArticle] = {
25 |     def present(text: String): Option[String] = if (StringUtils.isNotEmpty(text)) Some(text) else None
26 |     it.map { url =>
27 |       Try {
28 |         val article = goose.extractContent(url)
29 |         val upperKeywords = present(article.metaKeywords).map(_.split(",").map(_.trim.toUpperCase)).getOrElse(Array.empty[String])
30 |         GooseArticle(
31 |           url = url,
32 |           title = present(article.title),
33 |           content = present(article.cleanedArticleText).map(_.replaceAll("\\n+", "\n")),
34 |           description = present(article.metaDescription),
35 |           keywords = upperKeywords,
36 |           publishDate = Option(article.publishDate).map(published => new Date(published.getTime)),
37 |           image = None
38 |         )
39 |       }.getOrElse(GooseArticle(url))
40 |     }
41 |   }
42 |
43 |   case class GooseArticle(
44 |     url: String,
45 |     title: Option[String] = None,
46 |     content: Option[String] = None,
47 |     description: Option[String] = None,
48 |     keywords: Array[String] = Array.empty[String],
49 |     publishDate: Option[Date] = None,
50 |     image: Option[String] = None
51 |   )
52 | }
53 |
--------------------------------------------------------------------------------
/src/test/resources/com/aamend/spark/gdelt/normDailyByCountry.csv:
--------------------------------------------------------------------------------
1 | 19200101,,2396
2 | 19200101,AC,10
3 | 19200101,AE,200
4 | 19200101,AF,699
5 | 19200101,AG,55
6 | 19200101,AJ,48
7 | 19200101,AL,20
8 | 19200101,AM,37
9 | 19200101,AO,76
10 | 19200101,AR,68
11 | 19200101,AS,2650
12 | 19200101,AU,165
13 | 19200101,AY,12
14 | 19200101,BA,72
15 | 19200101,BB,47
16 | 19200101,BC,18
17 | 19200101,BD,9
18 | 19200101,BE,104
19 | 19200101,BF,68
20 | 19200101,BG,456
21 | 19200101,BH,28
22 | 19200101,BK,15
23 | 19200101,BL,64
24 | 19200101,BM,70
25 | 19200101,BN,49
26 | 19200101,BO,67
27 | 19200101,BR,246
28 | 19200101,BT,180
29 | 19200101,BU,41
30 | 19200101,BX,36
31 | 19200101,BY,9
32 | 19200101,CA,1720
33 | 19200101,CB,174
34 | 19200101,CD,43
35 | 19200101,CE,351
36 | 19200101,CF,40
37 | 19200101,CG,11
38 | 19200101,CH,2595
39 | 19200101,CI,93
40 | 19200101,CJ,1
41 | 19200101,CM,90
42 | 19200101,CO,100
43 | 19200101,CS,14
44 | 19200101,CT,12
45 | 19200101,CU,96
46 | 19200101,CW,17
47 | 19200101,CY,306
48 | 19200101,DA,76
49 | 19200101,DJ,19
50 | 19200101,DR,18
51 | 19200101,EC,19
52 | 19200101,EG,350
53 | 19200101,EI,700
54 | 19200101,EK,3
55 | 19200101,EN,25
56 | 19200101,ER,1
57 | 19200101,ES,47
58 | 19200101,ET,145
59 | 19200101,EZ,18
60 | 19200101,FI,50
61 | 19200101,FJ,53
62 | 19200101,FR,974
63 | 19200101,GA,29
64 | 19200101,GB,6
65 | 19200101,GG,19
66 | 19200101,GH,450
67 | 19200101,GI,2
68 | 19200101,GJ,9
69 | 19200101,GK,7
70 | 19200101,GL,7
71 | 19200101,GM,604
72 | 19200101,GQ,11
73 | 19200101,GR,254
74 | 19200101,GT,80
75 | 19200101,GV,72
76 | 19200101,GY,60
77 | 19200101,GZ,8
78 | 19200101,HA,88
79 | 19200101,HK,518
80 | 19200101,HO,36
81 | 19200101,HR,51
82 | 19200101,HU,99
83 | 19200101,IC,23
84 | 19200101,ID,349
85 | 19200101,IN,5068
86 | 19200101,IR,3040
87 | 19200101,IS,1931
88 | 19200101,IT,421
89 | 19200101,IV,45
90 | 19200101,IZ,3500
91 | 19200101,JA,1203
92 | 19200101,JE,101
93 | 19200101,JM,138
94 | 19200101,JO,146
95 | 19200101,KE,353
96 | 19200101,KG,17
97 | 19200101,KN,776
98 | 19200101,KR,19
99 | 19200101,KS,552
100 | 19200101,KU,249
101 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/text/StopWords.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.text
20 |
21 | /**
22 | * Created by Jim Plush
23 | * User: jim
24 | * Date: 8/16/11
25 | */
26 |
27 | import java.util._
28 | import com.gravity.goose.utils.FileHelper
29 |
30 | object StopWords {
31 |
32 |   // Matches every character that is not a letter, decimal digit, connector punctuation or white-space.
33 |   private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
34 |
35 |   // Split on LF or CRLF instead of the JVM's `line.separator`: the bundled resource keeps its own
36 |   // line endings whatever platform runs the code, so a platform-dependent separator can fail to split it.
37 |   val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split("\\r?\\n").toSet
38 |
39 |   /** Removes every character matched by the punctuation pattern from `str`. */
40 |   def removePunctuation(str: String): String = PUNCTUATION.replaceAll(str)
41 |
42 |   /**
43 |    * Counts the space-separated words of `content` (punctuation removed first) and collects the stop words among them.
44 |    * Null or empty content yields the shared WordStats.EMPTY instance.
45 |    */
46 |   def getStopWordCount(content: String): WordStats = {
47 |     if (string.isNullOrEmpty(content)) return WordStats.EMPTY
48 |     val candidateWords: Array[String] = string.SPACE_SPLITTER.split(removePunctuation(content))
49 |     val overlappingStopWords: List[String] = new ArrayList[String]
50 |     candidateWords.foreach { w =>
51 |       // Fixed locale: default-locale lower-casing maps "I" to a dotless i under Turkish-style locales
52 |       val lowered = w.toLowerCase(Locale.ENGLISH)
53 |       if (STOP_WORDS.contains(lowered)) overlappingStopWords.add(lowered)
54 |     }
55 |
56 |     val ws: WordStats = new WordStats
57 |     ws.setWordCount(candidateWords.length)
58 |     ws.setStopWordCount(overlappingStopWords.size)
59 |     ws.setStopWords(overlappingStopWords)
60 |     ws
61 |   }
62 | }
63 |
--------------------------------------------------------------------------------
/src/test/scala/com/aamend/spark/gdelt/GdeltParserTest.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt
2 |
3 | import org.scalatest.Matchers
4 |
5 | import scala.io.Source
6 |
7 | class GdeltParserTest extends SparkSpec with Matchers {
8 |   private def linesOf(resource: String): Seq[String] = Source.fromInputStream(this.getClass.getResourceAsStream(resource), "UTF-8").getLines().toSeq // UTF-8 lines of a resource next to this class
9 |
10 |   // Smoke test: every sample file must parse and display without an exception being thrown
11 |   sparkTest("loading GDELT universe") { spark =>
12 |     import spark.implicits._
13 |     linesOf("gkg.csv").toDS().map(GdeltParser.parseGkgV2).show()
14 |     linesOf("gkgT.csv").toDS().map(GdeltParser.parseGkgV2).show()
15 |     linesOf("gkg1.csv").toDS().map(GdeltParser.parseGkgV1).show()
16 |     linesOf("gkg1Count.csv").toDS().map(GdeltParser.parseGkgCountV1).show()
17 |     linesOf("events.csv").toDS().map(GdeltParser.parseEventV2).show()
18 |     linesOf("events1.csv").toDS().map(GdeltParser.parseEventV1).show()
19 |     linesOf("eventsT.csv").toDS().map(GdeltParser.parseEventV2).show()
20 |     linesOf("mentions.csv").toDS().map(GdeltParser.parseMentionV2).show()
21 |     linesOf("mentionsT.csv").toDS().map(GdeltParser.parseMentionV2).show()
22 |     linesOf("normDaily.csv").toDS().map(GdeltParser.parseNormDaily).show()
23 |     linesOf("normDailyByCountry.csv").toDS().map(GdeltParser.parseNormDailyByCountry).show()
24 |   }
25 |
26 |   // Smoke test: every reference dataset must load and display without an exception being thrown
27 |   sparkTest("loading GDELT reference data") { spark =>
28 |     spark.loadCountryCodes.show()
29 |     spark.loadGcams.show()
30 |     spark.loadCameoEventCodes.show()
31 |     spark.loadCameoTypeCodes.show()
32 |     spark.loadCameoGroupCodes.show()
33 |     spark.loadCameoEthnicCodes.show()
34 |     spark.loadCameoReligionCodes.show()
35 |     spark.loadCameoCountryCodes.show()
36 |   }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Goose.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose
20 |
21 | import network.HtmlFetcher
22 | import java.io.File
23 |
24 | /**
25 | * Created by Jim Plush - Gravity.com
26 | * Date: 8/14/11
27 | */
28 | class Goose(config: Configuration = new Configuration) {
29 |
30 |   // Runs at construction time, so an unusable local storage directory fails immediately
31 |   initializeEnvironment()
32 |
33 |   /**
34 |    * Main method to extract an article object for a URL from the supplied raw HTML.
35 |    * @param url The url that you want to extract
36 |    * @param rawHTML The markup handed to the crawl candidate for that url
37 |    */
38 |   def extractContent(url: String, rawHTML: String): Article = sendToActor(CrawlCandidate(config, url, rawHTML))
39 |
40 |   /**
41 |    * Extracts an article object for a URL; the crawl candidate is built with null raw HTML.
42 |    * @param url The url that you want to extract
43 |    */
44 |   def extractContent(url: String): Article = sendToActor(CrawlCandidate(config, url, null))
45 |
46 |   /** Shuts down the connection manager of HtmlFetcher's HTTP client. */
47 |   def shutdownNetwork(): Unit = {
48 |     HtmlFetcher.getHttpClient.getConnectionManager.shutdown()
49 |   }
50 |
51 |   /** Runs `crawlCandidate` through a new Crawler created from this instance's configuration. */
52 |   def sendToActor(crawlCandidate: CrawlCandidate): Article = new Crawler(config).crawl(crawlCandidate)
53 |
54 |   /**
55 |    * Checks that config.localStoragePath is a writable directory, trying to create it first when it is not a directory.
56 |    * Throws an Exception when it still is not a directory, or is not writable.
57 |    */
58 |   def initializeEnvironment(): Unit = {
59 |     val storageDir = new File(config.localStoragePath)
60 |     try {
61 |       if (!storageDir.isDirectory) storageDir.mkdirs()
62 |     } catch {
63 |       case _: Exception => // ignored here; a failed creation is reported by the checks below
64 |     }
65 |
66 |     if (!storageDir.isDirectory)
67 |       throw new Exception(config.localStoragePath + " directory does not seem to exist, you need to set this for image processing downloads")
68 |     if (!storageDir.canWrite)
69 |       throw new Exception(config.localStoragePath + " directory is not writable, you need to set this for image processing downloads")
70 |
71 |     // todo cleanup any jank that may be in the tmp folder currently
72 |   }
73 | }
74 |
75 | object Goose {
76 |   val logPrefix: String = "goose: " // prefix string for goose log output
77 |   implicit val config: Configuration = new Configuration // default configuration, available implicitly
78 | }
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/issue_24_result.txt:
--------------------------------------------------------------------------------
1 | TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. Most of our team is using Eclipse as the main IDE, its incrimental compilation in Java with its tight JUnit integration are great for fast TDD programming. Unfortunately the Eclipse Scala plugin is not there yet, it may hangs the IDE and messes up Java compilation - especially in large (more then 1000 source files) Java/Scala projects. Though the plugin is getting better over time some developers would find the plugin as a majore drag on their productivity. For developers who do not write Scala at all or rather edit Scala with other editors, you can use this alternate path which lets them work on their Java or Scala code without messing with the plugin.
2 |
3 | Paragraph 1 - The Following script is using the Fast Scala Compiler (fsc). The fsc is a compilation server which always run in the background, as in a warm scalac always ready to receive new work. Is will reduce compilation time dramatically. The classpath for compilation is taken from the Eclipse project .classpath file. You may take the source directory from there as well if you wish (exercise to the reader). The params are not passed to the fsc in the command line since in my project's case the line is too long for the OS to handle. The alternative is to put it into a file and let fsc handle it for you.
4 |
5 | TextNode 2 - As you may know, kaChing is an test driven engineering organization. Test driven is not an option, its a must. We move fast and push code to production few dozens of times a day in a five minutes release cycle, so we must have high confidence in our code. In complex systems there is no end to testings, each test system is an another line of defense which eventually gets broken but the more you have, the less chances bugs will reach production. We do not have QA team and do not want to have one, the reasoning is that if a human is involved in testing then there is a higher chance of missing things and you simply can't test all the site dozens of times a day.
6 |
7 | Paragraph 2 - In the next few weeks we are adding a new rule from the "not critical" list every few days. The goal is to have all the rules we think are important without the common "its to noisy, lets ignore it" approche. Only after we're done with that we're going to add the next static analysis tool to build. The good thing about these tools and hudson is that you can run them in parallel to the unit/integration tests, on another machine, so they won't slow down the overall release cycle.
--------------------------------------------------------------------------------
/src/main/scala/com/aamend/spark/gdelt/reference/CameoCodes.scala:
--------------------------------------------------------------------------------
1 | package com.aamend.spark.gdelt.reference
2 |
3 | import com.aamend.spark.gdelt.CameoCode
4 | import com.aamend.spark.gdelt.T
5 | import org.apache.spark.sql.{Dataset, SparkSession}
6 |
7 | import scala.io.Source
8 |
9 | object CameoCodes {
10 |
11 |   /**
12 |    * Loads the CAMEO event codes bundled as `cameoEvent.txt`.
13 |    * @param spark session used to build the dataset
14 |    * @return one CameoCode per line of the file after its first line
15 |    */
16 |   def loadEventCode(spark: SparkSession): Dataset[CameoCode] =
17 |     load(spark, "cameoEvent.txt")
18 |
19 |   /**
20 |    * Loads the CAMEO type codes bundled as `cameoType.txt`.
21 |    * @param spark session used to build the dataset
22 |    * @return one CameoCode per line of the file after its first line
23 |    */
24 |   def loadTypeCode(spark: SparkSession): Dataset[CameoCode] =
25 |     load(spark, "cameoType.txt")
26 |
27 |   /**
28 |    * Loads the CAMEO group codes bundled as `cameoGroup.txt`.
29 |    * @param spark session used to build the dataset
30 |    * @return one CameoCode per line of the file after its first line
31 |    */
32 |   def loadGroupCode(spark: SparkSession): Dataset[CameoCode] =
33 |     load(spark, "cameoGroup.txt")
34 |
35 |   /**
36 |    * Loads the CAMEO ethnic codes bundled as `cameoEthnic.txt`.
37 |    * @param spark session used to build the dataset
38 |    * @return one CameoCode per line of the file after its first line
39 |    */
40 |   def loadEthnicCode(spark: SparkSession): Dataset[CameoCode] =
41 |     load(spark, "cameoEthnic.txt")
42 |
43 |   /**
44 |    * Loads the CAMEO religion codes bundled as `cameoReligion.txt`.
45 |    * @param spark session used to build the dataset
46 |    * @return one CameoCode per line of the file after its first line
47 |    */
48 |   def loadReligionCode(spark: SparkSession): Dataset[CameoCode] =
49 |     load(spark, "cameoReligion.txt")
50 |
51 |   /**
52 |    * Loads the CAMEO country codes bundled as `cameoCountry.txt`.
53 |    * @param spark session used to build the dataset
54 |    * @return one CameoCode per line of the file after its first line
55 |    */
56 |   def loadCountryCode(spark: SparkSession): Dataset[CameoCode] =
57 |     load(spark, "cameoCountry.txt")
58 |
59 |   /**
60 |    * Shared loader: reads the tab-separated `resource` (resolved relative to this object) as UTF-8,
61 |    * skips its first line and maps each remaining line to a CameoCode (upper-cased code, lower-cased value).
62 |    * All lines are read into memory before the source is closed, so the stream is never left open.
63 |    */
64 |   private def load(spark: SparkSession, resource: String): Dataset[CameoCode] = {
65 |     import spark.implicits._
66 |     val source = Source.fromInputStream(this.getClass.getResourceAsStream(resource), "UTF-8")
67 |     val lines = try source.getLines().toList finally source.close()
68 |     lines.drop(1).map { line =>
69 |       val tokens = line.split("\t")
70 |       CameoCode(
71 |         cameoCode = T(() => tokens(0).toUpperCase()),
72 |         cameoValue = T(() => tokens(1).toLowerCase())
73 |       )
74 |     }.toDS()
75 |   }
76 | }
77 |
78 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/extractors/PublishDateExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.extractors
19 |
20 | import java.sql.Date
21 | import javax.xml.datatype.DatatypeFactory
22 |
23 | import com.gravity.goose.utils.Logging
24 | import org.jsoup.nodes.Element
25 |
26 | /**
27 | * Implement this class to extract the {@link Date} of when this article was published.
28 | */
29 | /**
30 | * Created by IntelliJ IDEA.
31 | * User: robbie
32 | * Date: 5/19/11
33 | * Time: 2:50 PM
34 | */
abstract class PublishDateExtractor extends Extractor[Date] {
  /**
   * Intended to search the DOM and identify the {@link Date} of when this article was published.
   *
   * @param rootElement the element whose subtree is searched for a publication date
   * @return the {@link Date} this article was published, or null if no date could be found.
   */
  def extract(rootElement: Element): Date
}
45 |
/**
 * Date helpers shared by [[PublishDateExtractor]] implementations.
 */
object PublishDateExtractor extends Logging {
  val logPrefix = "PublishDateExtractor: "

  // Built on first access and reused by every subsequent parse.
  lazy val datatypeFactory: DatatypeFactory = DatatypeFactory.newInstance()

  /**
   * Picks the earlier of two dates; when both carry the same timestamp `rhs` is returned.
   * Neither argument may be null.
   */
  def minDate(lhs: java.sql.Date, rhs: java.sql.Date): java.sql.Date =
    if (lhs.getTime < rhs.getTime) lhs else rhs

  /**
   * Parses an ISO 8601 date/time string without ever throwing.
   *
   * @param txt the candidate text; may be null or empty
   * @return the parsed date, or None when `txt` is null, empty, or rejected by the parser
   *         (a rejection is logged at info level)
   */
  def safeParseISO8601Date(txt: String): Option[java.sql.Date] = {
    if (txt == null || txt.isEmpty) {
      None
    } else {
      try {
        val calendar = datatypeFactory.newXMLGregorianCalendar(txt).toGregorianCalendar
        Option(new Date(calendar.getTime.getTime))
      } catch {
        case _: Exception =>
          info(s"`$txt` could not be parsed to date as it did not meet the ISO 8601 spec")
          None
      }
    }
  }
}
77 |
78 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageExtractor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.images
19 |
20 | import org.jsoup.nodes.{Document, Element}
21 | import com.gravity.goose.utils.{CanLog, Logging}
22 | import org.slf4j.Logger
23 |
24 | /**
25 | * Created by Jim Plush
26 | * User: jim
27 | * Date: 8/18/11
28 | */
29 |
/**
 * Represents a file stored on disk that we've downloaded.
 *
 * @param imgSrc   the image source string (presumably the URL it was fetched from; confirm with callers)
 * @param mimeType the image MIME type
 * @param base64   presumably the base64-encoded image content; confirm with the code that builds it
 * @param bytes    a byte count for the image
 * @param height   image height, 0 when not supplied
 * @param width    image width, 0 when not supplied
 */
case class LocallyStoredImage(imgSrc: String,
                              mimeType: String,
                              base64: String,
                              bytes: Long,
                              height: Int = 0,
                              width: Int = 0)
39 |
/**
 * Strategy for choosing an article's best image. Every logging member is forwarded to the
 * companion object so that all implementations share its logger and prefix.
 */
trait ImageExtractor extends CanLog {

  /**
   * Selects the image to associate with the article.
   *
   * @param doc     the document to search
   * @param topNode the element identified as the article body
   */
  def getBestImage(doc: Document, topNode: Element): Image

  // --- logger plumbing, shared through the companion object ---

  def logger: Logger = ImageExtractor.logger

  def logPrefix: String = ImageExtractor.loggingPrefix

  // --- one message-only and one throwable overload per level ---

  def trace(msg: String, refs: Any*): Unit = ImageExtractor.trace(msg, refs: _*)

  def trace(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.trace(t, msg, refs: _*)

  def debug(msg: String, refs: Any*): Unit = ImageExtractor.debug(msg, refs: _*)

  def debug(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.debug(t, msg, refs: _*)

  def info(msg: String, refs: Any*): Unit = ImageExtractor.info(msg, refs: _*)

  def info(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.info(t, msg, refs: _*)

  def warn(msg: String, refs: Any*): Unit = ImageExtractor.warn(msg, refs: _*)

  def warn(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.warn(t, msg, refs: _*)

  def critical(msg: String, refs: Any*): Unit = ImageExtractor.critical(msg, refs: _*)

  def critical(t: Throwable, msg: String, refs: Any*): Unit = ImageExtractor.critical(t, msg, refs: _*)
}
88 |
/**
 * Companion that owns the logger used by every [[ImageExtractor]] implementation.
 */
object ImageExtractor extends Logging {
  // Returned by ImageExtractor.logPrefix.
  val loggingPrefix: String = "images: "
}
92 |
93 |
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/statichtml/issue_24.txt:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
38 | *
39 | * String text = " Some example text ";
40 | * text = text.{@link String#replaceAll(String, String) replaceAll}("e", "E");
41 | * text = text.{@link String#replaceAll(String, String) replaceAll}(" ", "_");
42 | * text = text.{@link String#replaceAll(String, String) replaceAll}("^\\s+$", "");
43 | *
44 | *
45 | * You can use a ReplaceSequence like this:
46 | *
47 | *
48 | * static final betterReplacements = ReplaceSequence.{@link #create(String, String) create}("e", "E").{@link #append(String, String) append}(" ", "_").{@link #append(String) append}("^\\s+$");
49 | *
50 | * void fixMyString(String text) {
51 | * return betterReplacements.{@link #replaceAll(String) replaceAll}(text);
52 | * }
53 | *
54 | *
55 | *
56 | * Internally, an ordered list of {@link Matcher}s and its associated replacement is built as the {@link #append} method is called.
60 | */
61 | object ReplaceSequence {
62 | /**
63 | * Creates a new ReplaceSequence with the first pattern to be replaced with an empty String
64 | * @param firstPattern The regex {@link Pattern pattern} string for the first replacement
65 | * @return a new instance
66 | */
def create(firstPattern: String): ReplaceSequence = {
  // Matches of the pattern are replaced with string.empty.
  val noReplacement = string.empty
  create(firstPattern, noReplacement)
}
70 |
/**
 * Builds a ReplaceSequence whose first step replaces matches of `firstPattern` with `replaceWith`.
 * @param firstPattern The regex {@link Pattern pattern} {@link String} for the first replacement
 * @param replaceWith The {@link String} to replace matches of the specified pattern
 * @return a new instance
 */
def create(firstPattern: String, replaceWith: String): ReplaceSequence = {
  val firstStep = StringReplacement.compile(firstPattern, replaceWith)
  new ReplaceSequence(firstStep)
}
81 | }
82 |
/**
 * An ordered, appendable list of compiled replacements that are applied one after another.
 */
class ReplaceSequence {

  // Steps in application order. `List`/`ArrayList` are presumably the java.util types
  // (the import lines are outside this block; `add` is called on the list below).
  var replacements: List[StringReplacement] = new ArrayList[StringReplacement]

  // Seeds a new sequence with its first step.
  private def this(pair: StringReplacement) = {
    this()
    replacements.add(pair)
  }

  /**
   * Adds a step that replaces matches of the pattern with string.empty.
   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
   * @return this instance, so calls can be chained
   */
  def append(pattern: String): ReplaceSequence = append(pattern, string.empty)

  /**
   * Adds a step that replaces matches of the pattern with the given text.
   * @param pattern The regex {@link Pattern pattern} {@link String} for this replacement
   * @param replaceWith The {@link String} to replace matches of the specified pattern
   * @return this instance, so calls can be chained
   */
  def append(pattern: String, replaceWith: String): ReplaceSequence = {
    val step = StringReplacement.compile(pattern, replaceWith)
    replacements.add(step)
    this
  }

  /**
   * Runs every registered step, in order, each on the output of the previous one.
   * @param input the {@link String} to apply all of the replacements to
   * @return the resulting {@link String}; string.empty when the input is null or empty
   */
  def replaceAll(input: String): String = {
    if (string.isNullOrEmpty(input)) {
      string.empty
    } else {
      import scala.collection.JavaConversions._
      replacements.foldLeft(input)((current, step) => step.replaceAll(current))
    }
  }
}
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/spark/GooseFetcher.scala:
--------------------------------------------------------------------------------
1 | package com.gravity.goose.spark
2 |
3 | import com.gravity.goose.{Configuration, Goose}
4 | import org.apache.spark.ml.Transformer
5 | import org.apache.spark.ml.param._
6 | import org.apache.spark.ml.util._
7 | import org.apache.spark.sql.types._
8 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
9 |
/**
 * Parameters shared by [[GooseFetcher]].
 *
 * The numeric and boolean parameters use Spark's specialised `IntParam` / `BooleanParam`
 * (subtypes of `Param[Int]` / `Param[Boolean]`): because this trait mixes in
 * `DefaultParamsWritable`, every parameter has to be JSON-encodable, and the default
 * `Param.jsonEncode` does not support primitive values.
 */
trait GooseFetcherParams extends Params with DefaultParamsWritable {
  // NOTE(review): a Map-valued Param is not covered by the default `Param.jsonEncode` either,
  // so persisting this stage will presumably still fail on `annotators` until it gets a
  // dedicated Param subclass with jsonEncode/jsonDecode — confirm and follow up.
  val annotators = new Param[Map[String, String]](this, "annotators", s"The list of annotators [${ANNOTATORS.mkString(",")}]")
  val urlColumn = new Param[String](this, "urlColumn", "The input column containing URLs")
  val userAgent = new Param[String](this, "userAgent", "User agent that is sent with your web requests to extract URL content")
  val socketTimeout = new IntParam(this, "socketTimeout", "Socket timeout (ms)")
  val connectionTimeout = new IntParam(this, "connectionTimeout", "Connection timeout (ms)")
  val enableImageFetching = new BooleanParam(this, "enableImageFetching", "(Experimental) Fetching image header as base64")
}
18 |
/**
 * Spark ML transformer that scrapes the page behind every URL of an input column with Goose
 * and appends the configured annotator columns.
 *
 * @param uid unique identifier of this pipeline stage
 */
class GooseFetcher(override val uid: String) extends Transformer with GooseFetcherParams {

  def this() = this(Identifiable.randomUID("goose"))

  /**
   * Sets the annotators to extract, as a map of annotator name to output column name.
   * The map must be non-empty, its output column names must be unique and every key must
   * be one of `ANNOTATORS`.
   */
  def setAnnotators(value: Map[String, String]): this.type = {
    require(value.nonEmpty, "At least one annotator must be provided")
    require(value.values.toSet.size == value.keys.size, "Annotator fields must be unique")
    value.keys.foreach(annotator => require(ANNOTATORS.contains(annotator), s"Annotator [$annotator] is not valid, supported are [${ANNOTATORS.mkString(",")}]"))
    set(annotators, value)
  }

  // By default every annotator is written to a column carrying its own name.
  setDefault(annotators -> ANNOTATORS.zip(ANNOTATORS).toMap)

  def setUserAgent(value: String): this.type = set(userAgent, value)

  setDefault(userAgent -> "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8")

  def setSocketTimeout(value: Int): this.type = set(socketTimeout, value)

  setDefault(socketTimeout -> 10000)

  def setConnectionTimeout(value: Int): this.type = set(connectionTimeout, value)

  setDefault(connectionTimeout -> 10000)

  def setEnableImageFetching(value: Boolean): this.type = set(enableImageFetching, value)

  setDefault(enableImageFetching -> false)

  def setUrlColumn(value: String): this.type = set(urlColumn, value)

  setDefault(urlColumn -> "url")

  /**
   * Scrapes each distinct URL once and joins the extracted fields back onto the input.
   *
   * @param origDS input dataset; must contain the URL column as a string and none of the
   *               annotator output columns
   * @return the scraped columns joined with `origDS` on the URL column
   */
  override def transform(origDS: Dataset[_]): DataFrame = {

    // Resolve every parameter once, on the driver. The partition closure below then only
    // captures these plain values instead of `this`, so the whole transformer is no longer
    // serialised with each task, and the annotator order used to build the rows is the very
    // sequence resolved here rather than one re-derived on the executors.
    val urlField = $(urlColumn)
    val annotatorPairs: Seq[(String, String)] = $(annotators).toSeq
    val fetchImages = $(enableImageFetching)
    val agent = $(userAgent)
    val socketMs = $(socketTimeout)
    val connectionMs = $(connectionTimeout)

    // Make sure the URL field exist
    require(origDS.schema.exists(s => s.name == urlField && s.dataType == StringType), "Field [" + urlField + "] is not valid")

    // Make sure annotators field do not exist
    annotatorPairs.foreach { case (_, field) =>
      require(!origDS.schema.exists(s => s.name == field), s"Annotator field [$field] already exist")
    }

    // This intermediate dataset to make sure we don't scrape more than once a same URL
    val urlDF = origDS.select(urlField).dropDuplicates(urlField)

    // Append URL dataframe with article annotators
    val urlContentRDD = urlDF.rdd.mapPartitions { rows =>

      // Initialize Goose only once for each partition
      val conf = new Configuration()
      conf.setEnableImageFetching(fetchImages)
      conf.setBrowserUserAgent(agent)
      conf.setSocketTimeout(socketMs)
      conf.setConnectionTimeout(connectionMs)
      val goose = new Goose(conf)

      // Scrape each URL individually
      val articles = scrapeArticles(rows.map(_.getAs[String](urlField)), goose)

      // Convert articles as Row: the URL first, then one value per annotator
      articles.map { article =>
        val appended: Seq[Any] = annotatorPairs.map { case (key, _) =>
          key match {
            case ANNOTATOR_TITLE => article.title.getOrElse("")
            case ANNOTATOR_DESCRIPTION => article.description.getOrElse("")
            case ANNOTATOR_CONTENT => article.content.getOrElse("")
            case ANNOTATOR_KEYWORDS => article.keywords
            case ANNOTATOR_PUBLISH_DATE => article.publishDate.orNull
          }
        }
        Row.fromSeq(Seq(article.url) ++ appended)
      }
    }

    // Transform RDD of Row to Dataframe
    val contentDF = origDS.sqlContext.createDataFrame(urlContentRDD, transformSchema(urlDF.schema))

    // Join articles back to any duplicate URL dataset
    contentDF.join(origDS, List(urlField))

  }

  /**
   * Appends one field per configured annotator to the given schema. Title, description and
   * content are non-nullable strings, keywords a non-nullable string array and the publish
   * date a nullable date.
   */
  override def transformSchema(schema: StructType): StructType = {
    val annotatorFields: Seq[StructField] = $(annotators).toSeq.map { case (key, value) =>
      key match {
        case ANNOTATOR_TITLE => StructField(value, StringType, nullable = false)
        case ANNOTATOR_DESCRIPTION => StructField(value, StringType, nullable = false)
        case ANNOTATOR_CONTENT => StructField(value, StringType, nullable = false)
        case ANNOTATOR_KEYWORDS => StructField(value, ArrayType.apply(StringType), nullable = false)
        case ANNOTATOR_PUBLISH_DATE => StructField(value, DateType, nullable = true)
      }
    }
    StructType(schema.fields.toSeq ++ annotatorFields)
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }
}
120 |
/**
 * Companion providing the reader used to load a persisted [[GooseFetcher]].
 */
object GooseFetcher extends DefaultParamsReadable[GooseFetcher] {
  // Narrows the inherited `load` to return GooseFetcher.
  override def load(path: String): GooseFetcher = {
    super.load(path)
  }
}
124 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Configuration.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose
20 |
21 | import network.{HtmlFetcher, AbstractHtmlFetcher}
22 | import org.jsoup.nodes.Element
23 | import scala.beans.BeanProperty
24 | import com.gravity.goose.extractors.{StandardContentExtractor, ContentExtractor, AdditionalDataExtractor, PublishDateExtractor}
25 |
26 |
27 | /**
28 | * Created by Jim Plush
29 | * User: jim
30 | * Date: 8/16/11
31 | */
32 |
33 |
/**
 * Mutable settings and pluggable collaborators (content extractor, publish date extractor,
 * additional data extractor, HTML fetcher) consulted while crawling an article.
 */
class Configuration {

  /**
   * this is the local storage path used to place images to inspect them, should be writable
   */
  @BeanProperty
  var localStoragePath: String = "/tmp/goose"
  /**
   * The minimum size in bytes for an image we'd accept; a lot of times we want to filter out
   * the author's little images in the beginning of the article
   */
  @BeanProperty
  var minBytesForImages: Int = 4500
  /**
   * set this guy to false if you don't care about getting images, otherwise you can either use the default
   * image extractor to implement the ImageExtractor interface to build your own
   */
  @BeanProperty
  var enableImageFetching: Boolean = true
  /**
   * path to your imagemagick convert executable, on the mac using mac ports this is the default listed
   */
  @BeanProperty
  var imagemagickConvertPath: String = "/opt/local/bin/convert"
  /**
   * path to your imagemagick identify executable
   */
  @BeanProperty
  var imagemagickIdentifyPath: String = "/opt/local/bin/identify"

  /**
   * connection timeout handed to the HTML fetcher (the Spark integration documents this value as milliseconds)
   */
  @BeanProperty
  var connectionTimeout: Int = 10000

  /**
   * socket timeout handed to the HTML fetcher (the Spark integration documents this value as milliseconds)
   */
  @BeanProperty
  var socketTimeout: Int = 10000

  /**
   * used as the user agent that is sent with your web requests to extract an article
   */
  @BeanProperty
  var browserUserAgent: String = "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8"

  var contentExtractor: ContentExtractor = StandardContentExtractor

  /**
   * Default publish date extractor: reads ISO 8601 dates from the `content` attribute of
   * well-known `meta` tags.
   */
  var publishDateExtractor: PublishDateExtractor = new PublishDateExtractor {
    import PublishDateExtractor._

    /**
     * Collects every parseable date found in the `content` attribute of the elements matched
     * by `selector`. Any failure while selecting yields an empty result instead of an exception.
     */
    def extractCandidate(rootElement: Element, selector: String): Seq[java.sql.Date] = {
      // Explicit converters instead of the deprecated implicit JavaConversions.
      import scala.collection.JavaConverters._

      try {
        rootElement.select(selector).asScala.flatMap(item => safeParseISO8601Date(item.attr("content"))).toList
      }
      catch {
        case _: Exception =>
          Nil
      }
    }

    final val pubSelectors = Seq(
      "meta[property~=article:published_time]"
    )

    final val modSelectors = Seq(
      "meta[property~=article:modified_time]",
      "meta[property~=og:updated_time]"
    )

    def extract(rootElement: Element): java.sql.Date = {
      // Oldest parseable date among all elements matched by the given selectors.
      def oldest(selectors: Seq[String]): Option[java.sql.Date] =
        selectors.flatMap(extractCandidate(rootElement, _)).reduceOption(minDate)

      // Return the oldest 'published' date, or else the oldest 'modified' date, or null if none.
      // `orElse` takes its argument by name, so the modified-date lookup only runs when needed.
      oldest(pubSelectors).orElse(oldest(modSelectors)).orNull
    }
  }

  var additionalDataExtractor: AdditionalDataExtractor = new AdditionalDataExtractor

  def getPublishDateExtractor: PublishDateExtractor = {
    publishDateExtractor
  }

  /**
   * Pass in to replace the content extractor.
   * @param extractor a concrete instance of {@link ContentExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setContentExtractor(extractor: ContentExtractor): Unit = {
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    contentExtractor = extractor
  }

  /**
   * Pass in to extract article publish dates.
   * @param extractor a concrete instance of {@link PublishDateExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setPublishDateExtractor(extractor: PublishDateExtractor): Unit = {
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    this.publishDateExtractor = extractor
  }

  def getAdditionalDataExtractor: AdditionalDataExtractor = {
    additionalDataExtractor
  }

  /**
   * Pass in to extract any additional data not defined within {@link Article}
   * @param extractor a concrete instance of {@link AdditionalDataExtractor}
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setAdditionalDataExtractor(extractor: AdditionalDataExtractor): Unit = {
    // Enforce the documented contract, exactly like the sibling setters do; previously a null
    // was stored silently and only surfaced later as a NullPointerException during a crawl.
    if (extractor == null) throw new IllegalArgumentException("extractor must not be null!")
    this.additionalDataExtractor = extractor
  }

  var htmlFetcher: AbstractHtmlFetcher = HtmlFetcher

  /**
   * Pass in to replace the fetcher used to download pages.
   * @throws IllegalArgumentException if the instance passed in is null
   */
  def setHtmlFetcher(fetcher: AbstractHtmlFetcher): Unit = {
    require(fetcher != null, "fetcher MUST NOT be null!")
    this.htmlFetcher = fetcher
  }

  def getHtmlFetcher: AbstractHtmlFetcher = htmlFetcher

}
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/Crawler.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose
20 |
21 | import cleaners.{StandardDocumentCleaner, DocumentCleaner}
22 | import extractors.ContentExtractor
23 | import images.{Image, UpgradedImageIExtractor, ImageExtractor}
24 | import org.apache.http.client.HttpClient
25 | import org.jsoup.nodes.{Document, Element}
26 | import org.jsoup.Jsoup
27 | import java.io.File
28 | import utils.{ParsingCandidate, URLHelper, Logging}
29 | import com.gravity.goose.outputformatters.{StandardOutputFormatter, OutputFormatter}
30 |
31 | /**
32 | * Created by Jim Plush
33 | * User: jim
34 | * Date: 8/18/11
35 | */
36 |
/**
 * A single crawl request.
 *
 * @param config  the configuration travelling with the request
 * @param url     the address of the article
 * @param rawHTML optional pre-fetched markup; when non-null the crawler uses it instead of downloading the page
 */
case class CrawlCandidate(
  config: Configuration,
  url: String,
  rawHTML: String = null
)
38 |
/**
 * Drives the extraction of a single article: cleans the URL, obtains and parses the HTML,
 * runs the configured extractors and finally removes temporary files.
 *
 * @param config settings and collaborators used for every crawl
 */
class Crawler(config: Configuration) {

  import Crawler._

  /**
   * Crawls one candidate.
   *
   * @return a new Article; it is returned unpopulated when the URL cannot be cleaned, no HTML
   *         can be obtained, or the HTML cannot be parsed
   */
  def crawl(crawlCandidate: CrawlCandidate): Article = {
    val article = new Article()
    for {
      parseCandidate <- URLHelper.getCleanedUrl(crawlCandidate.url)
      rawHtml <- getHTML(crawlCandidate, parseCandidate)
      doc <- getDocument(parseCandidate.url.toString, rawHtml)
    } {
      trace("Crawling url: " + parseCandidate.url)

      val extractor = getExtractor
      val docCleaner = getDocCleaner
      val outputFormatter = getOutputFormatter

      article.finalUrl = parseCandidate.url.toString
      article.domain = parseCandidate.url.getHost
      article.linkhash = parseCandidate.linkhash
      article.rawHtml = rawHtml
      article.doc = doc
      // Keep an untouched copy: `doc` is replaced by its cleaned version further down.
      article.rawDoc = doc.clone()

      article.title = extractor.getTitle(article)
      article.publishDate = config.publishDateExtractor.extract(doc)
      article.additionalData = config.getAdditionalDataExtractor.extract(doc)
      article.metaDescription = extractor.getMetaDescription(article)
      article.metaKeywords = extractor.getMetaKeywords(article)
      article.canonicalLink = extractor.getCanonicalLink(article)
      article.tags = extractor.extractTags(article)

      // before we do any calcs on the body itself let's clean up the document
      article.doc = docCleaner.clean(article)

      extractor.calculateBestNodeBasedOnClustering(article) match {
        case Some(node: Element) =>
          article.topNode = node
          article.movies = extractor.extractVideos(article.topNode)

          if (config.enableImageFetching) {
            trace(logPrefix + "Image fetching enabled...")
            val imageExtractor = getImageExtractor(article)
            try {
              if (article.rawDoc == null) {
                article.topImage = new Image
              } else {
                article.topImage = imageExtractor.getBestImage(article.rawDoc, article.topNode)
              }
            } catch {
              // Image extraction is best effort: log and carry on with the text.
              case e: Exception => warn(e, e.toString)
            }
          }
          article.topNode = extractor.postExtractionCleanup(article.topNode)

          article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)
        case _ => trace("NO ARTICLE FOUND")
      }
      releaseResources(article)
    }

    article
  }

  /**
   * Returns the HTML to parse: the candidate's own `rawHTML` when it is non-null, otherwise
   * whatever the configured fetcher returns for the cleaned URL.
   */
  def getHTML(crawlCandidate: CrawlCandidate, parsingCandidate: ParsingCandidate): Option[String] = {
    if (crawlCandidate.rawHTML != null) {
      Some(crawlCandidate.rawHTML)
    } else {
      // The match is kept so that anything other than Some(html), including a null result,
      // becomes None.
      config.getHtmlFetcher.getHtml(config, parsingCandidate.url.toString) match {
        case found @ Some(_) => found
        case _ => None
      }
    }
  }


  /** Builds the image extractor for this article on top of the fetcher's HTTP client. */
  def getImageExtractor(article: Article): ImageExtractor = {
    val httpClient: HttpClient = config.getHtmlFetcher.getHttpClient
    new UpgradedImageIExtractor(httpClient, article, config)
  }

  def getOutputFormatter: OutputFormatter = {
    StandardOutputFormatter
  }

  def getDocCleaner: DocumentCleaner = {
    new StandardDocumentCleaner
  }

  /**
   * Parses the HTML with Jsoup.
   *
   * @param url      only used in the trace message when parsing fails
   * @param rawlHtml the markup to parse
   * @return the parsed document, or None when parsing throws
   */
  def getDocument(url: String, rawlHtml: String): Option[Document] = {

    try {
      Some(Jsoup.parse(rawlHtml))
    } catch {
      case _: Exception =>
        trace("Unable to parse " + url + " properly into JSoup Doc")
        None
    }
  }

  def getExtractor: ContentExtractor = {
    config.contentExtractor
  }

  /**
   * cleans up any temp files we have laying around like temp images
   * removes any image in the temp dir that starts with the linkhash of the url we just parsed
   */
  def releaseResources(article: Article): Unit = {
    trace(logPrefix + "STARTING TO RELEASE ALL RESOURCES")

    val dir: File = new File(config.localStoragePath)

    // File.list returns null (not an empty array) when the path does not exist, is not a
    // directory or cannot be read; the original then failed with a NullPointerException.
    val storedFiles: Array[String] = Option(dir.list()).getOrElse(Array.empty[String])

    storedFiles.foreach(filename => {
      if (filename.startsWith(article.linkhash)) {
        val f: File = new File(dir.getAbsolutePath + "/" + filename)
        if (!f.delete) {
          warn("Unable to remove temp file: " + filename)
        }
      }
    })
  }

}
167 |
/**
 * Companion holding the logger and message prefix imported by the [[Crawler]] class.
 */
object Crawler extends Logging {
  // Prepended to some of the crawler's trace messages.
  val logPrefix: String = "crawler: "
}
--------------------------------------------------------------------------------
/src/main/resources/com/gravity/goose/text/stopwords-en.txt:
--------------------------------------------------------------------------------
1 | a's
2 | able
3 | about
4 | above
5 | according
6 | accordingly
7 | across
8 | actually
9 | after
10 | afterwards
11 | again
12 | against
13 | ain't
14 | all
15 | allow
16 | allows
17 | almost
18 | alone
19 | along
20 | already
21 | also
22 | although
23 | always
24 | am
25 | among
26 | amongst
27 | an
28 | and
29 | another
30 | any
31 | anybody
32 | anyhow
33 | anyone
34 | anything
35 | anyway
36 | anyways
37 | anywhere
38 | apart
39 | appear
40 | appreciate
41 | appropriate
42 | are
43 | aren't
44 | around
45 | as
46 | aside
47 | ask
48 | asking
49 | associated
50 | at
51 | available
52 | away
53 | awfully
54 | be
55 | became
56 | because
57 | become
58 | becomes
59 | becoming
60 | been
61 | before
62 | beforehand
63 | behind
64 | being
65 | believe
66 | below
67 | beside
68 | besides
69 | best
70 | better
71 | between
72 | beyond
73 | both
74 | brief
75 | but
76 | by
77 | c
78 | c'mon
79 | c's
80 | came
81 | campaign
82 | can
83 | can't
84 | cannot
85 | cant
86 | cause
87 | causes
88 | certain
89 | certainly
90 | changes
91 | clearly
92 | co
93 | com
94 | come
95 | comes
96 | concerning
97 | consequently
98 | consider
99 | considering
100 | contain
101 | containing
102 | contains
103 | corresponding
104 | could
105 | couldn't
106 | course
107 | currently
108 | definitely
109 | described
110 | despite
111 | did
112 | didn't
113 | different
114 | do
115 | does
116 | doesn't
117 | doing
118 | don't
119 | done
120 | down
121 | downwards
122 | during
123 | each
124 | edu
125 | eight
126 | either
127 | else
128 | elsewhere
129 | enough
130 | endorsed
131 | entirely
132 | especially
133 | et
134 | etc
135 | even
136 | ever
137 | every
138 | everybody
139 | everyone
140 | everything
141 | everywhere
142 | ex
143 | exactly
144 | example
145 | except
146 | far
147 | few
148 | fifth
149 | first
150 | financial
151 | five
152 | followed
153 | following
154 | follows
155 | for
156 | former
157 | formerly
158 | forth
159 | four
160 | from
161 | further
162 | furthermore
163 | get
164 | gets
165 | getting
166 | given
167 | gives
168 | go
169 | goes
170 | going
171 | gone
172 | got
173 | gotten
174 | greetings
175 | had
176 | hadn't
177 | happens
178 | hardly
179 | has
180 | hasn't
181 | have
182 | haven't
183 | having
184 | he
185 | he's
186 | hello
187 | help
188 | hence
189 | her
190 | here
191 | here's
192 | hereafter
193 | hereby
194 | herein
195 | hereupon
196 | hers
197 | herself
198 | hi
199 | him
200 | himself
201 | his
202 | hither
203 | hopefully
204 | how
205 | howbeit
206 | however
207 | i'd
208 | i'll
209 | i'm
210 | i've
211 | if
212 | ignored
213 | immediate
214 | in
215 | inasmuch
216 | inc
217 | indeed
218 | indicate
219 | indicated
220 | indicates
221 | inner
222 | insofar
223 | instead
224 | into
225 | inward
226 | is
227 | isn't
228 | it
229 | it'd
230 | it'll
231 | it's
232 | its
233 | itself
234 | just
235 | keep
236 | keeps
237 | kept
238 | know
239 | knows
240 | known
241 | last
242 | lately
243 | later
244 | latter
245 | latterly
246 | least
247 | less
248 | lest
249 | let
250 | let's
251 | like
252 | liked
253 | likely
254 | little
255 | look
256 | looking
257 | looks
258 | ltd
259 | mainly
260 | many
261 | may
262 | maybe
263 | me
264 | mean
265 | meanwhile
266 | merely
267 | might
268 | more
269 | moreover
270 | most
271 | mostly
272 | much
273 | must
274 | my
275 | myself
276 | name
277 | namely
278 | nd
279 | near
280 | nearly
281 | necessary
282 | need
283 | needs
284 | neither
285 | never
286 | nevertheless
287 | new
288 | next
289 | nine
290 | no
291 | nobody
292 | non
293 | none
294 | noone
295 | nor
296 | normally
297 | not
298 | nothing
299 | novel
300 | now
301 | nowhere
302 | obviously
303 | of
304 | off
305 | often
306 | oh
307 | ok
308 | okay
309 | old
310 | on
311 | once
312 | one
313 | ones
314 | only
315 | onto
316 | or
317 | other
318 | others
319 | otherwise
320 | ought
321 | our
322 | ours
323 | ourselves
324 | out
325 | outside
326 | over
327 | overall
328 | own
329 | particular
330 | particularly
331 | per
332 | perhaps
333 | placed
334 | please
335 | plus
336 | possible
337 | presumably
338 | probably
339 | provides
340 | quite
341 | quote
342 | quarterly
343 | rather
344 | really
345 | reasonably
346 | regarding
347 | regardless
348 | regards
349 | relatively
350 | respectively
351 | right
352 | said
353 | same
354 | saw
355 | say
356 | saying
357 | says
358 | second
359 | secondly
360 | see
361 | seeing
362 | seem
363 | seemed
364 | seeming
365 | seems
366 | seen
367 | self
368 | selves
369 | sensible
370 | sent
371 | serious
372 | seriously
373 | seven
374 | several
375 | shall
376 | she
377 | should
378 | shouldn't
379 | since
380 | six
381 | so
382 | some
383 | somebody
384 | somehow
385 | someone
386 | something
387 | sometime
388 | sometimes
389 | somewhat
390 | somewhere
391 | soon
392 | sorry
393 | specified
394 | specify
395 | specifying
396 | still
397 | sub
398 | such
399 | sup
400 | sure
401 | t's
402 | take
403 | taken
404 | tell
405 | tends
406 | than
407 | thank
408 | thanks
409 | thanx
410 | that
411 | that's
412 | thats
413 | the
414 | their
415 | theirs
416 | them
417 | themselves
418 | then
419 | thence
420 | there
421 | there's
422 | thereafter
423 | thereby
424 | therefore
425 | therein
426 | theres
427 | thereupon
428 | these
429 | they
430 | they'd
431 | they'll
432 | they're
433 | they've
434 | think
435 | third
436 | this
437 | thorough
438 | thoroughly
439 | those
440 | though
441 | three
442 | through
443 | throughout
444 | thru
445 | thus
446 | to
447 | together
448 | too
449 | took
450 | toward
451 | towards
452 | tried
453 | tries
454 | truly
455 | try
456 | trying
457 | twice
458 | two
459 | under
460 | unfortunately
461 | unless
462 | unlikely
463 | until
464 | unto
465 | up
466 | upon
467 | us
468 | use
469 | used
470 | useful
471 | uses
472 | using
473 | usually
474 | uucp
475 | value
476 | various
477 | very
478 | via
479 | viz
480 | vs
481 | want
482 | wants
483 | was
484 | wasn't
485 | way
486 | we
487 | we'd
488 | we'll
489 | we're
490 | we've
491 | welcome
492 | well
493 | went
494 | were
495 | weren't
496 | what
497 | what's
498 | whatever
499 | when
500 | whence
501 | whenever
502 | where
503 | where's
504 | whereafter
505 | whereas
506 | whereby
507 | wherein
508 | whereupon
509 | wherever
510 | whether
511 | which
512 | while
513 | whither
514 | who
515 | who's
516 | whoever
517 | whole
518 | whom
519 | whose
520 | why
521 | will
522 | willing
523 | wish
524 | with
525 | within
526 | without
527 | won't
528 | wonder
529 | would
530 | would
531 | wouldn't
532 | yes
533 | yet
534 | you
535 | you'd
536 | you'll
537 | you're
538 | you've
539 | your
540 | yours
541 | yourself
542 | yourselves
543 | zero
544 | official
545 | sharply
546 | criticized
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/images/ImageSaver.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package com.gravity.goose.images
19 |
20 | /**
21 | * Created by Jim Plush
22 | * User: jim
23 | * Date: 8/18/11
24 | */
25 |
26 | import org.apache.commons.io.IOUtils
27 | import org.apache.http.HttpEntity
28 | import org.apache.http.client.ClientProtocolException
29 | import org.apache.http.client.HttpClient
30 | import org.apache.http.client.methods.HttpGet
31 | import org.apache.http.client.protocol.ClientContext
32 | import org.apache.http.protocol.BasicHttpContext
33 | import org.apache.http.protocol.HttpContext
34 | import java.io._
35 | import java.util.Random
36 | import com.gravity.goose.utils.Logging
37 | import com.gravity.goose.Configuration
38 | import com.gravity.goose.network.HtmlFetcher
39 |
40 | /**
41 | * This class will be responsible for storing images to disk
42 | *
43 | * @author Jim Plush
44 | */
45 | object ImageSaver extends Logging {
46 |   /**
47 |    * Derives the file extension for a stored image from the type reported by
48 |    * ImageUtils.getImageDimensions (JPEG -> ".jpg", PNG -> ".png").
49 |    *
50 |    * @param config supplies imagemagickIdentifyPath for the inspection call
51 |    * @param fileName path of the image file on local disk
52 |    * @return the extension including the dot, or "" when the file is not found
53 |    * @throws SecretGifException when the reported type is GIF
54 |    * @throws IOException for any other reported type, or another I/O failure
55 |    */
56 |   private def getFileExtension(config: Configuration, fileName: String): String = {
57 |     try {
58 |       val imageDims: ImageDetails = ImageUtils.getImageDimensions(config.imagemagickIdentifyPath, fileName)
59 |       val mimeType: String = imageDims.getMimeType
60 |       if (mimeType == "GIF") {
61 |         if (logger.isDebugEnabled) logger.debug("SNEAKY GIF! " + fileName)
62 |         throw new SecretGifException
63 |       } else if (mimeType == "JPEG") {
64 |         ".jpg"
65 |       } else if (mimeType == "PNG") {
66 |         ".png"
67 |       } else {
68 |         throw new IOException("BAD MIME TYPE: " + mimeType + " FILENAME:" + fileName)
69 |       }
70 |     } catch {
71 |       case e: SecretGifException => throw e
72 |       case e: FileNotFoundException =>
73 |         // best effort: a missing file is logged and reported as "no extension"
74 |         logger.error(e.getMessage)
75 |         ""
76 |       case e: IOException =>
77 |         logger.error(e.getMessage)
78 |         throw e
79 |     }
80 |   }
81 |
82 |   /**
83 |    * Executes a GET for an image with HtmlFetcher.emptyCookieStore as the cookie store.
84 |    *
85 |    * On success the request is NOT aborted here: the caller still has to read
86 |    * entity.getContent, and closing that stream is what releases the connection.
87 |    * The request is aborted only when no usable entity is returned.
88 |    *
89 |    * @param httpClient client used to execute the request
90 |    * @param imageSrc URL of the image
91 |    * @return Some(entity) for a 200 response carrying a body, otherwise None
92 |    */
93 |   def fetchEntity(httpClient: HttpClient, imageSrc: String): Option[HttpEntity] = {
94 |     val localContext: HttpContext = new BasicHttpContext
95 |     localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
96 |     val httpget = new HttpGet(imageSrc)
97 |     try {
98 |       val response = httpClient.execute(httpget, localContext)
99 |       // compare the numeric code; a substring test on the status line can match other text
100 |       val entity: Option[HttpEntity] =
101 |         if (response.getStatusLine.getStatusCode == 200) Option(response.getEntity) else None
102 |       if (entity.isEmpty) httpget.abort() // nobody will consume this response
103 |       entity
104 |     } catch {
105 |       case e: Exception =>
106 |         httpget.abort()
107 |         throw e
108 |     }
109 |   }
110 |
111 |   /**
112 |    * Writes the entity body to config.localStoragePath as "<linkhash>_<random int>",
113 |    * then renames the file by appending the extension from getFileExtension.
114 |    *
115 |    * Both streams are closed before the file is inspected and renamed, so those steps
116 |    * never run against a file that is still open for writing.
117 |    *
118 |    * @return path of the renamed file, or null when the extension is empty, the file
119 |    *         is smaller than config.minBytesForImages, or the rename fails
120 |    * @throws SecretGifException when getFileExtension reports a GIF
121 |    */
122 |   def copyInputStreamToLocalImage(entity: HttpEntity, linkhash: String, config: Configuration): String = {
123 |     val randInt: Int = new Random().nextInt()
124 |     val localSrcPath = config.localStoragePath + "/" + linkhash + "_" + randInt
125 |     trace("Storing image locally: " + localSrcPath)
126 |     val instream: InputStream = entity.getContent
127 |     try {
128 |       // opened inside the try so instream is still closed if this constructor throws
129 |       val outstream: OutputStream = new FileOutputStream(localSrcPath)
130 |       try IOUtils.copy(instream, outstream) finally outstream.close()
131 |     } finally {
132 |       instream.close()
133 |     }
134 |     val stored: File = new File(localSrcPath)
135 |     val fileExtension = getFileExtension(config, localSrcPath)
136 |     val newFilename = localSrcPath + fileExtension
137 |     if (fileExtension == null || fileExtension == "") {
138 |       trace("EMPTY FILE EXTENSION: " + localSrcPath)
139 |       null
140 |     } else if (stored.length < config.minBytesForImages) {
141 |       if (logger.isDebugEnabled) logger.debug("TOO SMALL AN IMAGE: " + localSrcPath + " bytes: " + stored.length)
142 |       null
143 |     } else if (!stored.renameTo(new File(newFilename))) {
144 |       // renameTo signals failure through its result, so it has to be checked
145 |       logger.error("Unable to rename " + localSrcPath + " to " + newFilename)
146 |       null
147 |     } else {
148 |       trace("Image successfully Written to Disk")
149 |       newFilename
150 |     }
151 |   }
152 |
153 |   /**
154 |    * Downloads an image to local storage and returns the path it was written to.
155 |    * Failures are logged instead of propagated; SecretGifException is handed to raise.
156 |    *
157 |    * @param imageSrcMaster image URL; spaces are replaced with %20 before the request
158 |    * @return the stored file path, or null when the image could not be stored
159 |    */
160 |   def storeTempImage(httpClient: HttpClient, linkhash: String, imageSrcMaster: String, config: Configuration): String = {
161 |     try {
162 |       val imageSrc = imageSrcMaster.replace(" ", "%20")
163 |       trace("Starting to download image: " + imageSrc)
164 |       fetchEntity(httpClient, imageSrc) match {
165 |         case Some(entity) =>
166 |           try {
167 |             copyInputStreamToLocalImage(entity, linkhash, config)
168 |           } catch {
169 |             case e: SecretGifException => throw e // left to the outer handler
170 |             case e: Exception =>
171 |               logger.error(e.getMessage)
172 |               null
173 |           }
174 |         case None =>
175 |           trace("Unable to get entity for: " + imageSrc)
176 |           null
177 |       }
178 |     } catch {
179 |       case e: IllegalArgumentException => logger.warn(e.getMessage); null
180 |       case e: SecretGifException => raise(e); null
181 |       case e: IOException => logger.error(e.toString); null // includes ClientProtocolException
182 |       case e: Exception =>
183 |         e.printStackTrace() // the original printed the trace twice; once is enough
184 |         logger.error(e.toString)
185 |         null
186 |     }
187 |   }
188 |
189 |   /**
190 |    * Records that a GIF was rejected. Despite the name nothing is thrown, so
191 |    * storeTempImage yields null for such images (unchanged from before).
192 |    */
193 |   private def raise(e: SecretGifException): Unit = trace("Skipping image, detected as GIF: " + e)
194 | }
195 |
196 |
197 |
--------------------------------------------------------------------------------
/src/main/scala/com/gravity/goose/outputformatters/OutputFormatter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to Gravity.com under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Gravity.com licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package com.gravity.goose.outputformatters
20 |
21 | import org.jsoup.nodes._
22 | import org.apache.commons.lang.StringEscapeUtils
23 | import org.jsoup.select.Elements
24 | import com.gravity.goose.text.StopWords
25 | import scala.collection.JavaConversions._
26 | import org.slf4j.Logger
27 |
28 | /**
29 | * Created by Jim Plush
30 | * User: jim
31 | * Date: 8/17/11
32 | */
33 |
34 | trait OutputFormatter {
35 | val logPrefix = "outformat: "
36 |
37 | // used to remove tags within tags
38 | val tagReplace = "<[^>]+>".r
39 |
40 | def logger: Logger
41 |
42 | private def selectElements(query: String, topNode: Element): Elements = {
43 |   // a null node yields an empty result set; otherwise the query runs against the node
44 |   if (topNode == null) new Elements(List.empty[Element]) else topNode.select(query)
45 | }
46 |
47 | /**
48 | * Deprecated: use {@link #getFormattedText(Element)} instead.
49 | * @param topNode the top most node to format; it is handed to each cleanup step in turn
50 | * @return the same topNode instance that was passed in
51 | */
52 | @Deprecated def getFormattedElement(topNode: Element): Element = {
53 | removeNodesWithNegativeScores(topNode)
54 | convertLinksToText(topNode)
55 | replaceTagsWithText(topNode)
56 | removeParagraphsWithFewWords(topNode)
57 | topNode
58 | }
59 |
60 | /**
61 | * Removes all unnecessary elements and formats the selected text nodes.
62 | * @param topNode the top most node to format; it is handed to each cleanup step in turn
63 | * @return the result of convertToText on the cleaned node: a formatted string with all HTML removed
64 | */
65 | def getFormattedText(topNode: Element): String = {
66 | removeNodesWithNegativeScores(topNode)
67 | convertLinksToText(topNode)
68 | replaceTagsWithText(topNode)
69 | removeParagraphsWithFewWords(topNode)
70 | convertToText(topNode)
71 | }
72 |
73 | /**
74 | * Renders the direct children of the top node as plain text, one paragraph per
75 | * child, with paragraphs separated by a blank line (\n\n).
76 | * Prefer {@link #getFormattedText(Element)}, which cleans the node before calling this.
77 | *
78 | * @param topNode the node whose children are rendered; may be null
79 | * @return the HTML-unescaped, trimmed child texts joined by "\n\n"; "" for a null node
80 | */
81 | def convertToText(topNode: Element): String = {
82 |   if (topNode == null) return ""
83 |   val paragraphs = topNode.children().map { (child: Element) =>
84 |     StringEscapeUtils.unescapeHtml(child.text).trim
85 |   }
86 |   paragraphs.toList.mkString("\n\n")
87 | }
88 |
89 | /**
90 | * Replaces every anchor under the top node that contains no img element with a
91 | * plain text node holding the anchor's text; anchors wrapping an image are kept.
92 | * Does nothing when the node is null.
93 | *
94 | * @param topNode the node whose links are flattened in place
95 | */
96 | private def convertLinksToText(topNode: Element) {
97 |   if (topNode == null) return
98 |   logger.trace(logPrefix + "Turning links to text")
99 |   val baseUri = topNode.baseUri()
100 |   val anchors = topNode.getElementsByTag("a")
101 |   // the guard runs per anchor, right before that anchor is replaced
102 |   for (anchor <- anchors if anchor.getElementsByTag("img").isEmpty) {
103 |     val plainText = new TextNode(anchor.text, baseUri)
104 |     anchor.replaceWith(plainText)
105 |   }
106 | }
107 |
108 | /**
109 | * Removes every element under the top node whose gravityScore attribute is
110 | * below 1. A score that cannot be parsed as an integer counts as 0, so such
111 | * elements are removed as well.
112 | */
113 | private def removeNodesWithNegativeScores(topNode: Element) {
114 |   // unparsable scores fall back to 0
115 |   def scoreOf(scored: Element): Int =
116 |     try {
117 |       Integer.parseInt(scored.attr("gravityScore"))
118 |     } catch {
119 |       case _: Exception => 0
120 |     }
121 |
122 |   val scoredNodes = selectElements("*[gravityScore]", topNode)
123 |   for (scored <- scoredNodes if scoreOf(scored) < 1) {
124 |     scored.remove()
125 |   }
126 | }
127 |
128 | /**
129 | * replace common tags with just text so we don't have any crazy formatting issues
130 | * so replace ") == true) { 217 | return Some(htmlResult) 218 | } 219 | trace("GRVBIGFAIL: " + mimeType + " - " + cleanUrl) 220 | throw new NotHtmlException(cleanUrl) 221 | } 222 | } 223 | else { 224 | throw new NotHtmlException(cleanUrl) 225 | } 226 | } 227 | catch { 228 | case e: UnsupportedEncodingException => { 229 | logger.warn(e.getMessage + " Caught for URL: " + cleanUrl) 230 | } 231 | case e: IOException => { 232 | logger.warn(e.getMessage + " Caught for URL: " + cleanUrl) 233 | } 234 | } 235 | None 236 | } 237 | 238 | private def initClient() { 239 | 240 | trace("Initializing HttpClient") 241 | 242 | val httpParams: HttpParams = new BasicHttpParams 243 | HttpConnectionParams.setConnectionTimeout(httpParams, 10 * 1000) 244 | HttpConnectionParams.setSoTimeout(httpParams, 10 * 1000) 245 | HttpProtocolParams.setVersion(httpParams, HttpVersion.HTTP_1_1) 246 | emptyCookieStore = new CookieStore { 247 | def addCookie(cookie: Cookie) { 248 | } 249 | 250 | def getCookies: List[Cookie] = { 251 | emptyList 252 | } 253 | 254 | def clearExpired(date: Date): Boolean = { 255 | false 256 | } 257 | 258 | def clear() { 259 | } 260 | 261 | private[network] var emptyList: ArrayList[Cookie] = new ArrayList[Cookie] 262 | } 263 | httpParams.setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY) 264 | httpParams.setParameter("http.User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; de; rv:1.9.2.8) Gecko/20100723 Ubuntu/10.04 (lucid) Firefox/3.6.8") 265 | httpParams.setParameter("http.language.Accept-Language", "en-us") 266 | httpParams.setParameter("http.protocol.content-charset", "UTF-8") 267 | httpParams.setParameter("Accept", "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5") 268 | httpParams.setParameter("Cache-Control", "max-age=0") 269 | httpParams.setParameter("http.connection.stalecheck", false) 270 | val schemeRegistry: SchemeRegistry = new SchemeRegistry 271 | schemeRegistry.register(new 
Scheme("http", 80, PlainSocketFactory.getSocketFactory)) 272 | schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory)) 273 | val cm = new ThreadSafeClientConnManager(schemeRegistry) 274 | cm.setMaxTotal(20000) 275 | cm.setDefaultMaxPerRoute(500) 276 | httpClient = new DefaultHttpClient(cm, httpParams) 277 | httpClient.asInstanceOf[AbstractHttpClient].setHttpRequestRetryHandler(new DefaultHttpRequestRetryHandler(0, false)) 278 | httpClient.getParams.setParameter("http.conn-manager.timeout", 120000L) 279 | httpClient.getParams.setParameter("http.protocol.wait-for-continue", 10000L) 280 | httpClient.getParams.setParameter("http.tcp.nodelay", true) 281 | } 282 | 283 | /** 284 | * reads bytes off the string and returns a string 285 | * 286 | * @param is the source stream from the response 287 | * @param maxBytes The max bytes that we want to read from the input stream 288 | * @return String 289 | */ 290 | def convertStreamToString(is: InputStream, maxBytes: Int, encodingType: String): String = { 291 | val buf: Array[Char] = new Array[Char](2048) 292 | var r: Reader = null 293 | val s = new StringBuilder 294 | try { 295 | r = new InputStreamReader(is, encodingType) 296 | var bytesRead: Int = 2048 297 | var inLoop = true 298 | while (inLoop) { 299 | if (bytesRead >= maxBytes) { 300 | throw new MaxBytesException 301 | } 302 | var n: Int = r.read(buf) 303 | bytesRead += 2048 304 | 305 | if (n < 0) inLoop = false 306 | if (inLoop) s.appendAll(buf, 0, n) 307 | } 308 | return s.toString() 309 | } 310 | catch { 311 | case e: SocketTimeoutException => { 312 | logger.warn(e.toString + " " + e.getMessage) 313 | } 314 | case e: UnsupportedEncodingException => { 315 | logger.warn(e.toString + " Encoding: " + encodingType) 316 | } 317 | case e: IOException => { 318 | logger.warn(e.toString + " " + e.getMessage) 319 | } 320 | } 321 | finally { 322 | if (r != null) { 323 | try { 324 | r.close() 325 | } 326 | catch { 327 | case e: Exception => { 328 | } 329 
| } 330 | } 331 | } 332 | null 333 | } 334 | 335 | 336 | } 337 | 338 | 339 | -------------------------------------------------------------------------------- /src/main/resources/com/gravity/goose/statichtml/guardian1_result.txt: -------------------------------------------------------------------------------- 1 | Kristen Wiig has had the kind of summer one might imagine to be life-changing. For four years, she andAnnie Mumolo, her friend and co-writer, slaved over a comedy script commissioned by Judd Apatow about a woman whose best friend is getting married. It was the 38-year-old's first lead in a film, and her first full-length script to be produced. As an indication of how it played with audiences, I watched it on a plane last month with a friend who, during the scene in which the bride squats in the street to relieve herself after a bad kebab, laughed so long and hard a woman passing in the aisle leant over and said, "What are you watching?" 2 | 3 | Wiig smiles when I tell her this. "Proud," she says, of the day they filmed the shitting-in-the-street scene. "A proud moment." 4 | 5 | We are in the tearoom of a fashionable hotel in Tribeca, the Manhattan neighbourhood where Wiig lives. Before Bridesmaids, she was known to US audiences as a long-running cast member of Saturday Night Live and elsewhere for scene-stealing cameos in films such as Ghost Town and Knocked Up. 6 | 7 | That the film, by midsummer, had grossed more than $150m in the US and outstripped not only all of Apatow's other films, but every "R-rated female comedy" in history, puts Wiig in the zone of woman of the moment, although she chafes against this, with its implication that before Bridesmaids she was an ingenue. 8 | 9 | "In most ways my life hasn't changed," she says. "I know that's a boring answer. People want to hear that I bought all gold, fur…" She allows a perfectly timed beat. "I would never wear fur." 10 | 11 | But hasn't she had to turn down lots of offers? 12 | 13 | "I mean. Yes, no. 
It feels weird to say that; you don't want to be like, 'Everyone wants me!' I mean. I guess Bridesmaids was definitely the biggest role I've ever had. And the fact that I co-wrote it and everything. But, um…" Wiig, who is slight, with very straight hair and an eager tilt to her body language, looks mortified. "It's not like I have boxes of scripts arriving at my door." 14 | 15 | Her understatement is fuelled perhaps by the inevitable and awkward comparisons she has gained with other women in her business, as if the culture can sustain only a couple at a time. Wiig has been getting "the new Tina Fey" quite a lot – Fey was head writer at SNL when Wiig joined – although the comparison is faulty. Wiig is an actor first and a comedian second, and with a film directed by Sean Pennin the pipeline and another, Imogene, in which she stars alongside Annette Bening and Matt Dillon, wants to develop her career away from comedy. "People always call me a comedian. And I don't really see myself like that. I guess I just consider myself an actor who does comedy. But who wants to do other things as well." 16 | 17 | It took her a long time to get here. After growing up in upstate New York, she went to university in Arizona and studied art before dropping out after the first year and going to LA to try to make it as an actor. Arizona is a notorious party college, but Wiig says all of that was out of her system by the time she left high school, where she had a few shaky years. "I was not that good a student because I was very… social. I cared more about going out with my friends. I didn't quite realise the importance of school. But then when I went to college I took it much more seriously, because I enjoyed it." 18 | 19 | How social was she? Suspended? 20 | 21 | "Um. Not for more than a couple of days. There were suspensions." Her expression fixes. "That's the past." 
Before the spotlight was so firmly on her, Wiig talked publicly about her minor-league acts of teenage hooliganism, including being caught underage drinking at a Grateful Deadgig, skipping school and, what she called the worst of it, smashing pot plants on a neighbour's porch, which she feels terrible about. As she entered her 20s her parents were still worried, she says, and then when she kicked in her degree and told them she wanted to be an actor, "probably the most worried they could be". 22 | 23 | "Yes. Also, they didn't want me to get disappointed. They would always mention the numbers – do you know how many people are trying to do what you're doing? Your chances are really slim. And they're right. Technically. But when you're 20, you're like, why can't you just support me?! Can't you be proud that I'm trying to go after my dream?" She pulls a whiny face and tilts her head. "But they came around quickly when they saw how happy it made me. They would come and see me in the horrible little shows that I was in." 24 | 25 | Wiig hadn't any great sense of being funny when she was growing up. Her dad, she says, tells a lot of jokes. Her mother is funny, but "mom funny, where she isn't trying to be funny, but is". Before retiring, her father ran a marina on one of the lakes upstate in New York (the name Wiig is from his Norwegian heritage). Her mother was an artist. Even after all these years, they haven't quite shed the sense of precariousness around their daughter's life; when she tells them she's in a movie, her mother will say tentatively, "Is that something we can see in theatres?" Wiig smiles and says, "They're still getting used to the idea that I'm working and it's OK." 26 | 27 | With good reason. Wiig was 11 years in LA before she got the call from Saturday Night Live, during which time her income was erratic. She had arrived in the city with no professional contacts and a nagging sense of insurmountable competition. 
"I was incredibly intimidated and had no experience. I felt very scared and unsure and I didn't have any résumé, and everyone around me was very beautiful and young and I thought, oh, maybe I should work in a store and enjoy the weather. But I started taking improv classes and that's what got me started." 28 | 29 | Improv was something she had never heard of before. But when she turned up to watch a gig one day at the Groundlings, the famous LA improv troupe with alumni such as Lisa Kudrow, Conan O'Brienand Will Ferrell, something resonated. The idea of standing on stage and making up stuff was, she says, less scary to her than the notion of saying lines, with the lurking fear there was a right and a wrong way to say them. With improvisation, there was no right and wrong: "You can't mess it up and you can't forget your lines." 30 | 31 | Her enthusiasm wouldn't pay the bills, however, and Wiig worked at a series of day jobs, including at a floral design studio for a couple of years, and as a waitress in the refectory at Universal Studios. Now and then she'll run into someone on a TV show or a movie, and wonder where she knows them from. "And then I'll remember: oh yeah, I used to serve you Cobb salad." 32 | 33 | There were many long, dark nights of the soul. "Oh my God, every month, yeah, because you don't have a lot of money coming in. When I look back, it was one of the best times of my life, because you're so in it with your friends. But you do have those moments when you're like: have I given it a try, should I stop, should I quit? But, no. You have a family there, you have a space to put shows on. I would rather be doing what I love and living above a garage – which I did – than not." 34 | 35 | The call came in 2005. Wiig flew to New York for the first of several auditions with the Saturday Night Live creators. 
The audition format was standup, which she had never done before, and in front of a terrifying panel includingLorne Michaels, the legendary SNL producer, and Tina Fey. Wiig was required to unveil a range of characters of her own creation that might be suitable for the sketch show and, quivering up there alone on stage, she fully expected to be met with silence. When she heard a few laughs, she gathered strength, got through it and was called back for a second audition. After which, nothing. And then the new season started. "So I thought, right, pretty clear – thanks for coming. And then after the third show I got a call saying I was hired, come in…" 36 | 37 | Wiig joined the show at a time when it was undergoing a cultural transition. Fey was the first female head writer and has written about the formally macho culture of the show – men pissing in jars by their desks, etc, which she put to comic use in 30 Rock. It was tough, she says, walking into a workplace where everyone knew each other: "Kind of like going into someone's living room for a party and they are really comfortable and have their shoes off and are sitting on the couch and I walk in and am a little dressed up and don't know where to stand? They were all very welcoming and nice but I knew I wasn't at that place yet where I could take my shoes off." 38 | 39 | She was excited to be working with the likes of Fey, Amy Poehlerand Rachel Dratch, although Wiig is reluctant to describe the still testosterone-heavy environment at SNL as off-putting. "I mean, I mean, merely by numbers there are more men that work there, but I don't consider it… I don't even think about it. Men work there, women work there, we have a lot of amazing female writers on staff right now… There are more men, but I don't think anyone really…" 40 | 41 | Was she a fan of Fey's before she joined the show? 42 | 43 | "Um. I've watched the show since I was born. 
I mean I definitely admire all the stuff that she's accomplished, especially coming from SNL and being head writer, and then doing 30 Rock and all these movies and her book, I mean it's definitely something where you go, oh, that can happen. Someone can do that. She's done it. She deserves it." 44 | 45 | To date, Bridesmaids has earned in the region of $286m worldwide; it doesn't need the qualifier "best female comedy" since it outgrossed Apatow's entire back catalogue, including Anchorman and The 40-Year-Old Virgin. Still, Wiig does not claim feminist dividends for the film – that it allowed women actors to be as gross on screen as men. She says when she and Mumolo were writing the shitting-in-the-street scene ("Can that be the title of the piece?"), it wasn't with an eye on levelling the playing field, nor was there much discussion of whether the market would tolerate that kind of vulgarity from women. No. "I think when you are doing anything creative and you think, 'What are the critics going to think?' instead of what you want to express, it can get a little muddy, and – I'm talking so seriously about this shitting-in-the-street – but with that in particular we were like, oh, this is a fun way to end the scene, and Annie used to do an impression of someone slowly realising they were shitting their pants, kind of slowly going down on to the ground. She would just do it as a joke, and it would always make me laugh really hard. She took it to a whole new level." 46 | 47 | Apatow had approached Wiig and asked her to write a script for him after they worked together on Knocked Up, in which she played a small pivotal role as Katherine Heigl's bitchy boss. In her five minutes on screen, Wiig managed to communicate brilliantly the gap between what her character was saying and thinking. She and Mumolo first conceived of Bridesmaids not as a wedding movie per se, but as a movie about friendship. "I mean, it's called Bridesmaids, I get that. 
But it's about women who, when they reach that age, whether it's in their 30s or not, thought they were supposed to be somewhere else. That's where we started from. And the fact that Annie had been to seven weddings in two years. And that she had friends who were marrying money and she'd showed up at the country club for the bridal shower with her wing mirrors duct-taped to her car, and at the end of the night had to crawl through her window because the front door would always swell when it was hot out. But if it's your best friend, you don't want to be complaining…" 48 | 49 | On paper at least, it didn't look too promising, with the generic title and the number of lame wedding movies in a seemingly exhausted genre. Apatow's name raised suspicions, too, about the use to which certain characters would be put, especially that of Megan, played by Melissa McCarthy, who looked like the inevitable one-fat-girl-in-the-group and the obvious butt of fat-girl jokes. In fact, McCarthy is the other break-out star of the film, and "the character that didn't care what anybody else thought. It was a lesson my character needed to learn. She doesn't care what anyone thinks, she's in her own world, but is generous and sweet. We wanted to have that opposite look on life, the character who seems at first like there was nothing she could say that would help, but…" 50 | 51 | The writing of the dialogue was relatively easy, says Wiig, compared with figuring out what should happen in each scene, and the film went through countless draft versions, crammed in around other work commitments, so that Mumolo, for example, would fly out to Mexico where Wiig was filming, to work on it for a weekend. In early drafts, the women ended up in Vegas, but that got chucked out when, over the four years of writing, it was used up in other wedding films such as The Hangover. 
52 | 53 | Apart from the fact that it is very funny, Bridesmaids ultimately works because it has a kind of sweet sincerity and the friendship between the two lead characters seems real. It bemuses Wiig that the film has widely been described as "raunchy". It's really not raunchy. "Raunchy means like Porky's," she says and smiles. "Which is my next movie; it's going to be a Porky's prequel." 54 | 55 | After six years in New York, Wiig is finally at home in the city. It was tough in the early days, she says, and when friends came to visit she would burst into tears as they left. ("I was so embarrassed. I thought, oh my God, they're going to go back and say, 'Kristen's not good. She is noooot coping well.'") If accounts are to be believed, she was briefly married to an actor called Hayes Hargrove and currently lives with her partner, a film-maker called Brian Petsos, but she responds to even the mildest question about her domestic life with a frozen smile. She would, of course, rather talk about acting, and her success in her first lead role – "I felt like I had to do a good job or no one would ever invite me to the party again" – has, despite her scrupulous modesty, been rewarded with the kind of films she always hoped she'd walk into. In the Sean Penn film The Comedian, which is still in the early stages of production, Wiig will co-star with Robert De Niro. It will be the real test of whether she is leading lady material, and whether she can carry a film without jokes. "I don't really think about it," says Wiig. "When you're in it, you're in it." 56 | 57 | In the meantime, she has sketches to write and shoot as part of the gruelling schedule of Saturday Night Live. After the interview, she is due in at the office for the weekly writing night, when everyone is required to be in at 4pm and stay until the following morning. Wiig is riding so high at the moment that when, as we leave, I ask her to confirm her age, I'm surprised when she grimaces. 
Yes, she says, she's 38. Why the face? Under her breath, like a dangerous heresy, she says, "I feel like women are asked their age more than men." And she snaps on a smile and leaves the restaurant. --------------------------------------------------------------------------------