├── tests
    ├── .gitignore
    ├── testthat.R
    ├── testthat.R.cran
    └── testthat
    │   ├── test-source-yahooinplay.R
    │   ├── test-source-reutersnews.R
    │   ├── test-source-googlefinance.R
    │   ├── test-source-yahoonews.R
    │   ├── test-source-googlenews.R
    │   ├── test-source-yahoofinance.R
    │   └── test-source-nytimes.R
├── inst
    ├── .gitignore
    └── NEWS.Rd
├── data
    ├── yahoonews.rda
    └── nytimes_appid.rda
├── .gitignore
├── .Rbuildignore
├── man
    ├── nytimes_appid.Rd
    ├── trimWhiteSpaces.Rd
    ├── source.update.Rd
    ├── encloseHTML.Rd
    ├── getEmpty.Rd
    ├── extract.Rd
    ├── yahoonews.Rd
    ├── parse.Rd
    ├── LiberationSource.Rd
    ├── YahooInplaySource.Rd
    ├── removeNonASCII.Rd
    ├── YahooNewsSource.Rd
    ├── GoogleNewsSource.Rd
    ├── feedquery.Rd
    ├── extractHTMLStrip.Rd
    ├── YahooFinanceSource.Rd
    ├── GoogleFinanceSource.Rd
    ├── ReutersNewsSource.Rd
    ├── readWeb.Rd
    ├── corpus.update.Rd
    ├── extractContentDOM.Rd
    ├── WebCorpus.Rd
    ├── WebSource.Rd
    ├── tm.plugin.webmining-package.Rd
    ├── NYTimesSource.Rd
    └── getLinkContent.Rd
├── .travis.yml
├── R
    ├── trimWhiteSpaces.R
    ├── parser.R
    ├── feedquery.R
    ├── transform.R
    ├── tm.plugin.webmining-package.R
    ├── getLinkContent.R
    ├── corpus.R
    ├── extract.R
    ├── reader.R
    └── source.R
├── DESCRIPTION
├── vignettes
    ├── tables
    │   └── sources.tex
    ├── ShortIntro.Rnw
    └── references.bib
├── NAMESPACE
├── README.md
└── Makefile


/tests/.gitignore:
--------------------------------------------------------------------------------
1 | /.DS_Store
2 | 


--------------------------------------------------------------------------------
/inst/.gitignore:
--------------------------------------------------------------------------------
1 | /googleSearch.R
2 | 


--------------------------------------------------------------------------------
/data/yahoonews.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mannau/tm.plugin.webmining/HEAD/data/yahoonews.rda


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(tm)
3 | 
4 | test_check("tm.plugin.webmining")
5 | 


--------------------------------------------------------------------------------
/data/nytimes_appid.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mannau/tm.plugin.webmining/HEAD/data/nytimes_appid.rda


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .settings
2 | .project
3 | .README.md.html
4 | *.tar.gz
5 | *.Rcheck
6 | /release
7 | /.DS_Store
8 | .Rproj.user
9 | *.Rproj


--------------------------------------------------------------------------------
/tests/testthat.R.cran:
--------------------------------------------------------------------------------
1 | ### De-activate tests for CRAN 
2 | #library(testthat)
3 | #library(tm)
4 | #
5 | #test_check("tm.plugin.webmining")
6 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | .settings
 2 | .travis.yml
 3 | .gitignore
 4 | .project
 5 | .README.md.html
 6 | Makefile
 7 | README.md
 8 | release
 9 | .*tar.gz
10 | data/.gitignore
11 | travis-tool.sh
12 | inst/googleSearch.R
13 | tests/testthat.R.cran
14 | tests/testthat.R.temp
15 | ^.*\.Rproj$
16 | ^\.Rproj\.user$
17 | 


--------------------------------------------------------------------------------
/inst/NEWS.Rd:
--------------------------------------------------------------------------------
 1 | \name{NEWS}
 2 | \title{News for Package 'tm.plugin.webmining'}
 3 | \newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}}
 4 | 
 5 | \section{Changes in tm.plugin.webmining version 1.3 (2015-05-07)}{
 6 |   \itemize{
 7 |     \item Fix Issue #6: NYTimesSource
 8 |     \item Various fixes in tests and sources
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/man/nytimes_appid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/tm.plugin.webmining-package.R
 3 | \docType{data}
 4 | \name{nytimes_appid}
 5 | \alias{nytimes_appid}
 6 | \title{AppID for the NYtimes-API.}
 7 | \description{
 8 | USED ONLY FOR PACKAGE TESTING. PLEASE DOWNLOAD YOUR OWN KEY AT \url{http://developer.nytimes.com/}!!!
 9 | }
10 | \author{
11 | Mario Annau
12 | }
13 | \keyword{data}
14 | 
15 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | 
 3 | env:
 4 |   global:
 5 |     - _R_CHECK_CRAN_INCOMING_=FALSE
 6 | 
 7 | # Be strict when checking our package
 8 | warnings_are_errors: true
 9 | cran: http://cran.us.r-project.org
10 | 
11 | # System dependencies for HTTP calling
12 | apt_packages:
13 |   - libcurl4-openssl-dev
14 |   - libxml2-dev
15 |   - r-cran-rjava
16 | r_github_packages:
17 |   - jimhester/covr
18 | after_success:
19 |   - R --slave --vanilla -e 'library(covr); pc <- package_coverage(); pc; codecov(coverage = pc)'
20 |  


--------------------------------------------------------------------------------
/R/trimWhiteSpaces.R:
--------------------------------------------------------------------------------
 1 | #' @title Trim White Spaces from Text Document.
 2 | #' @description Transformation function, actually equal to stripWhiteSpace 
 3 | #' applicable for simple strings using Perl parser
 4 | #' @author Mario Annau
 5 | #' @param txt character
 6 | #' @seealso \code{\link{stripWhitespace}}
 7 | #' @export
 8 | trimWhiteSpaces <-
 9 | function(txt){
10 | 	txt <- sub("\\s+", "", txt, perl = TRUE)
11 | 	txt <- sub("\\s+$", "", txt, perl = TRUE)
12 | 	txt <- gsub("\\s\\s+", " ", txt, perl = TRUE)
13 | 	return(txt)
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/man/trimWhiteSpaces.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/trimWhiteSpaces.R
 3 | \name{trimWhiteSpaces}
 4 | \alias{trimWhiteSpaces}
 5 | \title{Trim White Spaces from Text Document.}
 6 | \usage{
 7 | trimWhiteSpaces(txt)
 8 | }
 9 | \arguments{
10 | \item{txt}{character}
11 | }
12 | \description{
13 | Transformation function, actually equal to stripWhiteSpace
14 | applicable for simple strings using Perl parser
15 | }
16 | \author{
17 | Mario Annau
18 | }
19 | \seealso{
20 | \code{\link{stripWhitespace}}
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/source.update.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{source.update}
 4 | \alias{source.update}
 5 | \alias{source.update.WebHTMLSource}
 6 | \alias{source.update.WebJSONSource}
 7 | \alias{source.update.WebXMLSource}
 8 | \title{Update WebXMLSource/WebHTMLSource/WebJSONSource}
 9 | \usage{
10 | source.update(x)
11 | }
12 | \arguments{
13 | \item{x}{Source object to be updated}
14 | }
15 | \description{
16 | Typically, update is called from \code{link{corpus.update}} and refreshes \code{$Content} in
17 | Source object.
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/man/encloseHTML.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/transform.R
 3 | \name{encloseHTML}
 4 | \alias{encloseHTML}
 5 | \alias{encloseHTML.PlainTextDocument}
 6 | \alias{encloseHTML.character}
 7 | \title{Enclose Text Content in HTML tags}
 8 | \usage{
 9 | encloseHTML(x)
10 | }
11 | \arguments{
12 | \item{x}{object of PlainTextDocument class}
13 | }
14 | \description{
15 | Simple helper function which encloses text content of character
16 | (or \code{\link[tm]{TextDocument}}) in HTML-tags. That way, HTML
17 | content can be easier parsed by \code{\link[XML]{htmlTreeParse}}
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/man/getEmpty.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/corpus.R
 3 | \name{getEmpty}
 4 | \alias{getEmpty}
 5 | \alias{getEmpty.WebCorpus}
 6 | \title{Retrieve Empty Corpus Elements through \code{$postFUN}.}
 7 | \usage{
 8 | getEmpty(x, ...)
 9 | }
10 | \arguments{
11 | \item{x}{object of type \code{\link{WebCorpus}}}
12 | 
13 | \item{...}{additional parameters to PostFUN}
14 | }
15 | \description{
16 | Retrieve content of all empty (textlength equals zero) corpus elements. If
17 | corpus element is empty, \code{$postFUN} is called (specified in \code{\link{meta}})
18 | }
19 | \seealso{
20 | \code{\link{WebCorpus}}
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/man/extract.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/extract.R
 3 | \name{extract}
 4 | \alias{extract}
 5 | \alias{extract.PlainTextDocument}
 6 | \title{Extract main content from \code{TextDocument}s.}
 7 | \usage{
 8 | extract(x, extractor, ...)
 9 | }
10 | \arguments{
11 | \item{x}{PlainTextDocument}
12 | 
13 | \item{extractor}{default extraction function to be used, defaults to \code{\link{extractContentDOM}}}
14 | 
15 | \item{...}{additional parameters to extractor function}
16 | }
17 | \description{
18 | Use implemented extraction functions (through boilerpipeR) to extract main content from
19 | \code{TextDocument}s.
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/man/yahoonews.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/tm.plugin.webmining-package.R
 3 | \docType{data}
 4 | \name{yahoonews}
 5 | \alias{yahoonews}
 6 | \title{WebCorpus retrieved from Yahoo! News for the search term "Microsoft"
 7 | through the YahooNewsSource. Length of retrieved corpus is 20.}
 8 | \description{
 9 | WebCorpus retrieved from Yahoo! News for the search term "Microsoft"
10 | through the YahooNewsSource. Length of retrieved corpus is 20.
11 | }
12 | \examples{
13 | #Data set has been generated as follows:
14 | \dontrun{
15 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
16 | }
17 | }
18 | \author{
19 | Mario Annau
20 | }
21 | \keyword{data}
22 | 
23 | 


--------------------------------------------------------------------------------
/man/parse.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/parser.R
 3 | \name{parse}
 4 | \alias{parse}
 5 | \title{Wrapper/Convenience function to ensure right encoding for different Platforms}
 6 | \usage{
 7 | parse(..., asText = TRUE, type = c("XML", "HTML", "JSON"))
 8 | }
 9 | \arguments{
10 | \item{...}{arguments to be passed to specified parser function}
11 | 
12 | \item{asText}{defines if input should be treated as text/character, default to TRUE}
13 | 
14 | \item{type}{either "XML", "HTML" or "JSON". Defaults to "XML"}
15 | }
16 | \description{
17 | Depending on specified type one of the following parser functions is called:
18 | \describe{
19 | \item{XML}{\code{\link{xmlInternalTreeParse}}}
20 | \item{HTML}{\code{\link{htmlTreeParse}}}
21 | \item{JSON}{\code{\link{fromJSON}}}
22 | }
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/man/LiberationSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{LiberationSource}
 4 | \alias{LiberationSource}
 5 | \alias{readLiberationSource}
 6 | \title{Get news data from french Liberation News Paper (\url{http://rss.liberation.fr/rss}).}
 7 | \usage{
 8 | LiberationSource(query = "latest", ...)
 9 | }
10 | \arguments{
11 | \item{query}{feed to be retrieved, defaults to 'latest'}
12 | 
13 | \item{...}{additional parameters to \code{\link{WebSource}}}
14 | }
15 | \value{
16 | WebXMLSource
17 | }
18 | \description{
19 | Get news data from french Liberation News Paper (\url{http://rss.liberation.fr/rss}).
20 | }
21 | \examples{
22 | \dontrun{
23 | corpus <- WebCorpus(LiberationSource("latest"))
24 | }
25 | }
26 | \author{
27 | Mario Annau
28 | }
29 | \seealso{
30 | \code{\link{WebSource}}
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/man/YahooInplaySource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{YahooInplaySource}
 4 | \alias{YahooInplaySource}
 5 | \alias{readYahooInplay}
 6 | \title{Get News from Yahoo Inplay.}
 7 | \usage{
 8 | YahooInplaySource(...)
 9 | }
10 | \arguments{
11 | \item{...}{additional parameters to \code{\link{WebSource}}}
12 | }
13 | \value{
14 | WebHTMLSource
15 | }
16 | \description{
17 | Yahoo Inplay lists a range of company news provided by Briefing.com. Since Yahoo Inplay
18 | does not provide a structured XML news feed, content is parsed directly from the HTML page.
19 | Therefore, no further Source parameters can be specified. The number of feed items per
20 | request can vary substantially.
21 | }
22 | \examples{
23 | \dontrun{
24 | corpus <- WebCorpus(YahooInplaySource())
25 | }
26 | }
27 | \author{
28 | Mario Annau
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: tm.plugin.webmining
 2 | Version: 1.3.2
 3 | Date: 2015-09-10
 4 | Title: Retrieve Structured, Textual Data from Various Web Sources
 5 | Authors@R: c(person("Mario", "Annau", role = c("aut", "cre"),
 6 |     email = "mario.annau@gmail.com"))
 7 | Depends:
 8 |     R (>= 3.1.0)
 9 | Imports:
10 |     NLP (>= 0.1-2),
11 |     tm (>= 0.6),
12 |     boilerpipeR,
13 |     RCurl,
14 |     XML,
15 |     RJSONIO
16 | Suggests:
17 |     testthat
18 | Description: Facilitate text retrieval from feed
19 |     formats like XML (RSS, ATOM) and JSON. Also direct retrieval from
20 |     HTML is supported. As most (news) feeds only incorporate small
21 |     fractions of the original text tm.plugin.webmining even retrieves
22 |     and extracts the text of the original text source.
23 | License: GPL-3
24 | URL: https://github.com/mannau/tm.plugin.webmining
25 | BugReports: https://github.com/mannau/tm.plugin.webmining/issues
26 | 


--------------------------------------------------------------------------------
/man/removeNonASCII.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/transform.R
 3 | \name{removeNonASCII}
 4 | \alias{removeNonASCII}
 5 | \alias{removeNonASCII.PlainTextDocument}
 6 | \title{Remove non-ASCII characters from Text.}
 7 | \usage{
 8 | removeNonASCII(x, fields = c("Content", "Heading", "Description"),
 9 |   from = "UTF-8", to = "ASCII//TRANSLIT")
10 | }
11 | \arguments{
12 | \item{x}{object of PlainTextDocument class}
13 | 
14 | \item{fields}{specifies fields to be converted, defaults to fields = c("Content", "Heading", "Description")}
15 | 
16 | \item{from}{specifies encoding from which conversion should be done, defaults to "UTF-8"}
17 | 
18 | \item{to}{speciefies target encoding, defaults to "ASCII//TRANSLIT"}
19 | }
20 | \description{
21 | This is a helper function to generate package data
22 | without non-ASCII character and omit the warning at R CMD check.
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/man/YahooNewsSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{YahooNewsSource}
 4 | \alias{YahooNewsSource}
 5 | \alias{readYahooHTML}
 6 | \title{Get news data from Yahoo! News (\url{https://news.search.yahoo.com/search/}).}
 7 | \usage{
 8 | YahooNewsSource(query, params = list(p = query), ...)
 9 | }
10 | \arguments{
11 | \item{query}{words to be searched in Yahoo News, multiple words must be separated by '+'}
12 | 
13 | \item{params,}{additional query parameters, see \url{http://developer.yahoo.com/rss/}}
14 | 
15 | \item{...}{additional parameters to \code{\link{WebSource}}}
16 | }
17 | \value{
18 | WebXMLSource
19 | }
20 | \description{
21 | Currently, only a maximum of 10 items can be retrieved.
22 | }
23 | \examples{
24 | \dontrun{
25 | corpus <- WebCorpus(YahooNewsSource("Microsoft"))
26 | }
27 | }
28 | \author{
29 | Mario Annau
30 | }
31 | \seealso{
32 | \code{\link{WebSource}}
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/vignettes/tables/sources.tex:
--------------------------------------------------------------------------------
 1 | \begin{tabular}{l|r|l|c|l}
 2 | \textbf{Source Name} & \textbf{Items} & \textbf{URL} & \textbf{Auth} &
 3 | \textbf{Format}\\
 4 | \hline    	\class{GoogleBlogSearchSource} & 100 &
 5 | \url{http://www.google.com/blogsearch} & - & RSS\\
 6 | \class{GoogleFinanceSource} & 20 & \url{http://www.google.com/finance} & - &
 7 | RSS\\
 8 | \class{GoogleNewsSource} & 100 & \url{http://news.google.com} & - & RSS\\
 9 | \class{NYTimesSource} & 100 & \url{http://api.nytimes.com} & x & JSON\\
10 | \class{ReutersNewsSource} & 20 & \url{http://www.reuters.com/tools/rss} & - &
11 | ATOM\\
12 | %\class{TwitterSource} & 1500 & \url{http://search.twitter.com/api} & - & ATOM\\
13 | \class{YahooFinanceSource} & 20 & \url{http://finance.yahoo.com} & - & RSS\\
14 | \class{YahooInplaySource} & 100+ &
15 | \url{http://finance.yahoo.com/marketupdate/inplay} & - & HTML\\
16 | \class{YahooNewsSource} & 20 & \url{http://news.search.yahoo.com/rss} & - & RSS
17 | \end{tabular} 


--------------------------------------------------------------------------------
/R/parser.R:
--------------------------------------------------------------------------------
 1 | #' @title Wrapper/Convenience function to ensure right encoding for different Platforms
 2 | #' @description Depending on specified type one of the following parser functions is called:
 3 | #' \describe{
 4 | #' \item{XML}{\code{\link{xmlInternalTreeParse}}}
 5 | #' \item{HTML}{\code{\link{htmlTreeParse}}}
 6 | #' \item{JSON}{\code{\link{fromJSON}}}
 7 | #' }
 8 | #' @param ... arguments to be passed to specified parser function
 9 | #' @param asText defines if input should be treated as text/character, default to TRUE
10 | #' @param type either "XML", "HTML" or "JSON". Defaults to "XML"
11 | #' @export
12 | parse <- function(..., asText = TRUE, type = c("XML", "HTML", "JSON")){
13 | 	parsetype <- match.arg(type)
14 | 	encoding <- switch(.Platform$OS.type,
15 | 						unix = "UTF-8",
16 | 						windows = "latin1")
17 | 	parser <- switch(parsetype,
18 | 						XML = xmlInternalTreeParse,
19 | 						HTML = htmlTreeParse,
20 | 						JSON = fromJSON)
21 | 	parser(..., encoding = encoding, asText = asText)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/GoogleNewsSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{GoogleNewsSource}
 4 | \alias{GoogleNewsSource}
 5 | \title{Get feed data from Google News Search \url{http://news.google.com/}}
 6 | \usage{
 7 | GoogleNewsSource(query, params = list(hl = "en", q = query, ie = "utf-8", num
 8 |   = 30, output = "rss"), ...)
 9 | }
10 | \arguments{
11 | \item{query}{Google News Search query}
12 | 
13 | \item{params,}{additional query parameters}
14 | 
15 | \item{...}{additional parameters to \code{\link{WebSource}}}
16 | }
17 | \value{
18 | WebXMLSource
19 | }
20 | \description{
21 | Google News Search is one of the most popular news aggregators on the web. News
22 | can be retrieved for any customized user query. Up to 30 can be retrieved per
23 | request.
24 | }
25 | \examples{
26 | \dontrun{
27 | corpus <- WebCorpus(GoogleNewsSource("Microsoft"))
28 | }
29 | }
30 | \author{
31 | Mario Annau
32 | }
33 | \seealso{
34 | \code{\link{WebSource}}
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/man/feedquery.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/feedquery.R
 3 | \name{feedquery}
 4 | \alias{feedquery}
 5 | \title{Buildup string for feedquery.}
 6 | \usage{
 7 | feedquery(url, params)
 8 | }
 9 | \arguments{
10 | \item{url}{character specifying feed url}
11 | 
12 | \item{params}{list which contains feed parameters, e.g. list(param1="value1", param2="value2")}
13 | }
14 | \description{
15 | Function has partly been taken from \code{\link[RCurl]{getForm}} function.
16 | Generally, a feed query is a string built up as follows: \cr
17 | \code{<url>?<param1=value1>&<param2=value2>&...&<paramN=valueN>} \cr
18 | By specifying a feed url and parameter--value pairs (as list) we can easily
19 | generate a feed query in R.
20 | }
21 | \examples{
22 | \dontrun{
23 | feedquery(url = "http://dummy.com",
24 | params = list(param1 = "value1", param2 = "value2"))
25 | }
26 | }
27 | \author{
28 | Mario Annau
29 | }
30 | \seealso{
31 | \code{\link{xmlNode}} \code{\link{getForm}}
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/man/extractHTMLStrip.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/extract.R
 3 | \name{extractHTMLStrip}
 4 | \alias{extractHTMLStrip}
 5 | \title{Simply strip HTML Tags from Document}
 6 | \usage{
 7 | extractHTMLStrip(url, asText = TRUE, encoding, ...)
 8 | }
 9 | \arguments{
10 | \item{url}{character, url or filename}
11 | 
12 | \item{asText}{specifies if url parameter is a \code{character}, defaults to TRUE}
13 | 
14 | \item{encoding}{specifies local encoding to be used, depending on platform}
15 | 
16 | \item{...}{Additional parameters for \code{\link{htmlTreeParse}}}
17 | }
18 | \description{
19 | \code{extractHTMLStrip} parses an url, character or filename, reads the DOM
20 | tree, removes all HTML tags in the tree and outputs the source text without
21 | markup.
22 | }
23 | \note{
24 | Input text should be enclosed in <html>'TEXT'</html> tags to ensure correct
25 | DOM parsing (issue especially under .Platform$os.type = 'windows')
26 | }
27 | \author{
28 | Mario Annau
29 | }
30 | \seealso{
31 | \code{\link{xmlNode}}
32 | 
33 | \code{\link{htmlTreeParse}} \code{\link{encloseHTML}}
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/man/YahooFinanceSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{YahooFinanceSource}
 4 | \alias{YahooFinanceSource}
 5 | \alias{readYahoo}
 6 | \title{Get feed data from Yahoo! Finance.}
 7 | \usage{
 8 | YahooFinanceSource(query, params = list(s = query, region = "US", lang =
 9 |   "en-US"), ...)
10 | }
11 | \arguments{
12 | \item{query}{ticker symbols of companies to be searched for, see \url{http://finance.yahoo.com/lookup}.}
13 | 
14 | \item{params,}{additional query parameters, see \url{http://developer.yahoo.com/rss/}}
15 | 
16 | \item{...}{additional parameters to \code{\link{WebSource}}}
17 | }
18 | \value{
19 | WebXMLSource
20 | }
21 | \description{
22 | Yahoo! Finance is a popular site which provides financial news and information. It is a large source
23 | for historical price data as well as financial news. Using the typical Yahoo! Finance ticker
24 | news items can easily be retrieved. However, the maximum number of items is 20.
25 | }
26 | \examples{
27 | \dontrun{
28 | corpus <- WebCorpus(YahooFinanceSource("MSFT"))
29 | }
30 | }
31 | \author{
32 | Mario Annau
33 | }
34 | \seealso{
35 | \code{\link{WebSource}}
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/man/GoogleFinanceSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{GoogleFinanceSource}
 4 | \alias{GoogleFinanceSource}
 5 | \alias{readGoogle}
 6 | \title{Get feed Meta Data from Google Finance.}
 7 | \usage{
 8 | GoogleFinanceSource(query, params = list(hl = "en", q = query, ie = "utf-8",
 9 |   start = 0, num = 20, output = "rss"), ...)
10 | }
11 | \arguments{
12 | \item{query}{ticker symbols of companies to be searched for, see \url{http://www.google.com/finance}.
13 | Please note that Google ticker symbols need to be prefixed with the exchange name, e.g. NASDAQ:MSFT}
14 | 
15 | \item{params}{additional query parameters}
16 | 
17 | \item{...}{additional parameters to \code{\link{WebSource}}}
18 | }
19 | \value{
20 | WebXMLSource
21 | }
22 | \description{
23 | Google Finance provides business and enterprise headlines for many companies. Coverage is
24 | particularly strong for US-Markets. However, only up to 20 feed items can be retrieved.
25 | }
26 | \examples{
27 | \dontrun{
28 | corpus <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
29 | }
30 | }
31 | \author{
32 | Mario Annau
33 | }
34 | \seealso{
35 | \code{\link{WebSource}}
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/man/ReutersNewsSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{ReutersNewsSource}
 4 | \alias{ReutersNewsSource}
 5 | \alias{readReutersNews}
 6 | \title{Get feed data from Reuters News RSS feed channels. Reuters provides numerous feed}
 7 | \usage{
 8 | ReutersNewsSource(query = "businessNews", ...)
 9 | }
10 | \arguments{
11 | \item{query}{Reuters News RSS Feed, see \url{http://www.reuters.com/tools/rss} for a list of all feeds provided. Note that only string after 'http://feeds.reuters.com/reuters/' must be given. Defaults to 'businessNews'.}
12 | 
13 | \item{...}{additional parameters to \code{\link{WebSource}}}
14 | }
15 | \value{
16 | WebXMLSource
17 | }
18 | \description{
19 | channels (\url{http://www.reuters.com/tools/rss}) which can be retrieved through RSS
20 | feeds. Only up to 25 items can be retrieved---therefore an alternative retrieval
21 | through the Google Reader API (\code{link{GoogleReaderSource}}) could be considered.
22 | }
23 | \examples{
24 | \dontrun{
25 | corpus <- WebCorpus(ReutersNewsSource("businessNews"))
26 | }
27 | }
28 | \author{
29 | Mario Annau
30 | }
31 | \seealso{
32 | \code{\link{WebSource}}
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/man/readWeb.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/reader.R
 3 | \name{readWeb}
 4 | \alias{json_content}
 5 | \alias{readWeb}
 6 | \alias{readWebHTML}
 7 | \alias{readWebJSON}
 8 | \alias{readWebXML}
 9 | \title{Read content from WebXMLSource/WebHTMLSource/WebJSONSource.}
10 | \usage{
11 | readWeb(spec, doc, parser, contentparser, freeFUN = NULL)
12 | }
13 | \arguments{
14 | \item{spec}{specification of content reader}
15 | 
16 | \item{doc}{document to be parsed}
17 | 
18 | \item{parser}{parser function to be used}
19 | 
20 | \item{contentparser}{content parser function to be used, see also \code{tm:::xml_content} or \code{json_content}}
21 | 
22 | \item{freeFUN}{function to free memory from parsed object (actually only relevant for XML and HTML trees)}
23 | }
24 | \value{
25 | FunctionGenerator
26 | }
27 | \description{
28 | \code{readWeb} is a FunctionGenerator which specifies content retrieval from a \code{\link{WebSource}}
29 | content elements. Currently, it is defined for XML, HTML and JSON feeds through \code{readWebXML},
30 | \code{readWebHTML} and \code{readWebJSON}. Also content parsers (\code{xml_content}, \code{json_content})
31 | need to be defined.
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/man/corpus.update.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/corpus.R
 3 | \name{corpus.update}
 4 | \alias{corpus.update}
 5 | \alias{corpus.update.WebCorpus}
 6 | \title{Update/Extend \code{\link{WebCorpus}} with new feed items.}
 7 | \usage{
 8 | corpus.update(x, ...)
 9 | }
10 | \arguments{
11 | \item{x}{object of type \code{\link{WebCorpus}}}
12 | 
13 | \item{...}{\describe{
14 | \item{fieldname}{name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID"}
15 | \item{retryempty}{specifies if empty corpus elements should be downloaded again, defaults to TRUE}
16 | \item{...}{additional parameters to \code{\link{Corpus}} function}
17 | }}
18 | }
19 | \description{
20 | The \code{corpus.update} method ensures, that the original
21 | \code{\link{WebCorpus}} feed sources are downloaded and checked against
22 | already included \code{TextDocument}s. Based on the \code{ID} included
23 | in the  \code{TextDocument}'s meta data, only new feed elements are
24 | downloaded and added to the \code{\link{WebCorpus}}.
25 | All relevant information regariding the original source feeds are stored
26 | in the \code{\link{WebCorpus}}' meta data (\code{\link[tm]{meta}}).
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/R/feedquery.R:
--------------------------------------------------------------------------------
 1 | #' @title Buildup string for feedquery. 
 2 | #' @description Function has partly been taken from \code{\link[RCurl]{getForm}} function. 
 3 | #' Generally, a feed query is a string built up as follows: \cr
 4 | #' \code{<url>?<param1=value1>&<param2=value2>&...&<paramN=valueN>} \cr
 5 | #' By specifying a feed url and parameter--value pairs (as list) we can easily
 6 | #' generate a feed query in R.
 7 | #' @author Mario Annau
 8 | #' @param url character specifying feed url
 9 | #' @param params list which contains feed parameters, e.g. list(param1="value1", param2="value2")
10 | #' @seealso \code{\link{xmlNode}} \code{\link{getForm}}
11 | #' @examples
12 | #' \dontrun{
13 | #' feedquery(url = "http://dummy.com", 
14 | #' params = list(param1 = "value1", param2 = "value2"))
15 | #' } 
16 | #' @export 
17 | #' @importFrom RCurl curlEscape
18 | feedquery <-
19 | function(url, params){
20 | 	els <- lapply(names(params), function(n) {		
21 | 		paste(n, curlEscape(params[[n]]), sep = "=")
22 | 	})
23 | 	names(els) <- names(params)
24 | 	
25 | 	feeds <- ""
26 | 	for(i in names(els)){
27 | 		if(feeds[1] == ""){
28 | 			sep = ""
29 | 		}
30 | 		else{
31 | 			sep = "&"
32 | 		}
33 | 		feeds <- paste(feeds, els[[i]], sep = sep)
34 | 	}
35 | 
36 | 	feeds <- paste(url, feeds, sep = "?")
37 | 	return(feeds)
38 | }


--------------------------------------------------------------------------------
/man/extractContentDOM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/extract.R
 3 | \name{extractContentDOM}
 4 | \alias{assignValues}
 5 | \alias{calcDensity}
 6 | \alias{extractContentDOM}
 7 | \alias{getMainText}
 8 | \alias{removeTags}
 9 | \title{Extract Main HTML Content from DOM}
10 | \usage{
11 | extractContentDOM(url, threshold, asText = TRUE, ...)
12 | }
13 | \arguments{
14 | \item{url}{character, url or filename}
15 | 
16 | \item{threshold}{threshold for extraction, defaults to 0.5}
17 | 
18 | \item{asText}{boolean, specifies if url should be interpreted as character}
19 | 
20 | \item{...}{Additional Parameters to \code{\link{htmlTreeParse}}}
21 | }
22 | \description{
23 | Function extracts main HTML Content using its Document Object Model.
24 | Idea comes basically from the fact, that main content of an HTML Document
25 | is in a subnode of the HTML DOM Tree with a high text-to-tag ratio.
26 | Internally, this function also calls
27 | \code{assignValues}, \code{calcDensity}, \code{getMainText}
28 | and \code{removeTags}.
29 | }
30 | \author{
31 | Mario Annau
32 | }
33 | \references{
34 | \url{http://www.elias.cn/En/ExtMainText},
35 | 				\url{http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/}
36 | 				\cite{Gupta et al., DOM-based Content Extraction of HTML Documents},\url{http://www2003.org/cdrom/papers/refereed/p583/p583-gupta.html}
37 | }
38 | \seealso{
39 | \code{\link{xmlNode}}
40 | }
41 | 
42 | 


--------------------------------------------------------------------------------
/man/WebCorpus.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/corpus.R
 3 | \name{WebCorpus}
 4 | \alias{WebCorpus}
 5 | \title{WebCorpus constructor function.}
 6 | \usage{
 7 | WebCorpus(x, readerControl = list(reader = reader(x), language = "en"),
 8 |   postFUN = x$postFUN, retryEmpty = TRUE, ...)
 9 | }
10 | \arguments{
11 | \item{x}{object of type Source, see also \code{\link{Corpus}}}
12 | 
13 | \item{readerControl}{specifies reader to be used for \code{Source}, defaults to
14 | list(reader = x$DefaultReader, language = "en"}
15 | 
16 | \item{postFUN}{function to be applied to WebCorpus after web retrieval has been completed,
17 | defaults to x$PostFUN}
18 | 
19 | \item{retryEmpty}{specifies if retrieval for empty content elements should be repeated,
20 | defaults to TRUE}
21 | 
22 | \item{...}{additional parameters for Corpus function (actually Corpus reader)}
23 | }
24 | \description{
25 | \code{WebCorpus} adds further methods and meta data to \code{\link[tm]{Corpus}} and therefore
26 | constructs a derived class of \code{\link[tm]{Corpus}}. Most importantly, \code{WebCorpus}
27 | calls \code{$PostFUN} on the generated \code{WebCorpus}, which retrieves the main content
28 | for most implemented \code{WebSource}s. Thus it enables an efficient retrieval of new feed items
29 | (\code{\link{corpus.update}}). All additional WebCorpus fields are added to \code{tm$meta}
30 | like \code{$source}, \code{$readerControl} and \code{$postFUN}.
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-yahooinplay.R:
--------------------------------------------------------------------------------
 1 | context("YahooInPlaySource")
 2 | 
 3 | test_that("YahooInPlaySource",{
 4 | 	
 5 | 	minlengthcorp <- 1
 6 | 		
 7 | 	testcorp <- WebCorpus(YahooInplaySource())
 8 | 	lengthcorp <- length(testcorp)
 9 | 	# Check Corpus object
10 | 	expect_that(length(testcorp) >= minlengthcorp, is_true())
11 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
12 | 	
13 | 	# Check Content
14 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
15 | 	contentlength <- sapply(testcorp, function(x) 
16 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
17 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
18 | 	expect_that(contentratio > 0.5, is_true())
19 | 	
20 | 	# Check Meta Data
21 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
22 | 	#FIXME: Date should be fixed
23 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "character")), is_true())
24 | 	
25 | 	heading <- lapply(testcorp, function(x) meta(x, "heading")[1])
26 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
27 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
28 | 	
29 | 	id <- lapply(testcorp, function(x) meta(x, "id")[1])
30 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
31 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
32 | 	
33 | 	testcorp <- testcorp[1:length(minlengthcorp)]
34 | 	# TODO: test should be re-activated again
35 |   #testcorp <- corpus.update(testcorp)
36 | 	#expect_that(length(testcorp) >= lengthcorp, is_true())
37 | 	
38 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
39 | })
40 | 
41 | 


--------------------------------------------------------------------------------
/R/transform.R:
--------------------------------------------------------------------------------
 1 | #' @title Enclose Text Content in HTML tags
 2 | #' @description Simple helper function which encloses text content of character
 3 | #' (or \code{\link[tm]{TextDocument}}) in HTML-tags. That way, HTML
 4 | #' content can be easier parsed by \code{\link[XML]{htmlTreeParse}}
 5 | #' @param x object of PlainTextDocument class
 6 | #' @export
 7 | #' @aliases encloseHTML.PlainTextDocument encloseHTML.character
 8 | encloseHTML <- function(x) UseMethod("encloseHTML", x)
 9 | 
10 | #' @importFrom NLP content<-
11 | #' @noRd
12 | #' @export 
13 | # FIXME: Could be done easier?? 
14 | encloseHTML.PlainTextDocument <- function(x){
15 | 	content(x) <- sprintf("<html>%s</html>", x)
16 | 	x
17 | } 
18 | 
19 | #' @title Remove non-ASCII characters from Text. 
20 | #' @description This is a helper function to generate package data 
21 | #' without non-ASCII character and omit the warning at R CMD check.
22 | #' @param x object of PlainTextDocument class
23 | #' @param fields specifies fields to be converted, defaults to fields = c("Content", "Heading", "Description")
24 | #' @param from specifies encoding from which conversion should be done, defaults to "UTF-8"
25 | #' @param to speciefies target encoding, defaults to "ASCII//TRANSLIT"
26 | #' @export
27 | #' @aliases removeNonASCII.PlainTextDocument
28 | removeNonASCII <- function(x, fields = c("Content", "Heading", "Description"), from = "UTF-8", to = "ASCII//TRANSLIT")
29 | 	UseMethod("removeNonASCII", x)
30 | 
31 | #' @noRd
32 | #' @export
33 | removeNonASCII.PlainTextDocument <- function(x, fields = c("Content", "Heading", "Description"), from = "UTF-8", to = "ASCII//TRANSLIT"){
34 | 	if("Content" %in% fields){
35 | 		content(x) <- iconv(x, from, to)
36 | 	}
37 | 	for(fn in setdiff(fields, "Content")){
38 | 		meta(x, fn) <- iconv(meta(x, fn), from, to)
39 | 	}
40 | 	x
41 | } 


--------------------------------------------------------------------------------
/tests/testthat/test-source-reutersnews.R:
--------------------------------------------------------------------------------
 1 | context("ReutersNewsSource")
 2 | 
 3 | test_that("ReutersNewsSource",{
 4 | 	
 5 | 	lengthcorp <- 20
 6 | 		
 7 | 	testcorp <- WebCorpus(ReutersNewsSource("businessNews"))
 8 | 	# Check Corpus object
 9 | 	expect_that(length(testcorp), equals(lengthcorp))
10 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
11 | 	
12 | 	# Check Content
13 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
14 | 	contentlength <- sapply(testcorp, function(x) 
15 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
16 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
17 | 	expect_that(contentratio > 0.5, is_true())
18 | 	
19 | 	# Check Meta Data
20 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
21 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
22 | 	
23 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
24 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
25 | 	
26 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
27 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
28 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
29 | 	
30 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
31 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
32 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
33 | 	
34 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
35 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
36 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
37 | 	
38 | 	testcorp <- testcorp[1:5]
39 | 	testcorp <- corpus.update(testcorp)
40 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
41 | 	
42 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
43 | })
44 | 
45 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-googlefinance.R:
--------------------------------------------------------------------------------
 1 | context("GoogleFinanceSource")
 2 | 
 3 | test_that("GoogleFinanceSource",{
 4 | 	
 5 | 	lengthcorp <- 20
 6 | 		
 7 | 	testcorp <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
 8 | 	# Check Corpus object
 9 | 	expect_that(length(testcorp), equals(lengthcorp))
10 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
11 | 	
12 | 	# Check Content
13 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
14 | 	contentlength <- sapply(testcorp, function(x) 
15 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
16 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
17 | 	expect_that(contentratio > 0.5, is_true())
18 | 	
19 | 	# Check Meta Data
20 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
21 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
22 | 	
23 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
24 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
25 | 	
26 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
27 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
28 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
29 | 	
30 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
31 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
32 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
33 | 	
34 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
35 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
36 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
37 | 	
38 | 	testcorp <- testcorp[1:10]
39 | 	testcorp <- corpus.update(testcorp)
40 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
41 | 	
42 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
43 | })
44 | 
45 | 


--------------------------------------------------------------------------------
/man/WebSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{WebSource}
 4 | \alias{WebSource}
 5 | \title{Read Web Content and respective Link Content from feedurls.}
 6 | \usage{
 7 | WebSource(feedurls, class = "WebXMLSource", reader, parser,
 8 |   encoding = "UTF-8", curlOpts = curlOptions(followlocation = TRUE,
 9 |   maxconnects = 5, maxredirs = 20, timeout = 30, connecttimeout = 30,
10 |   ssl.verifyhost = FALSE, ssl.verifypeer = FALSE), postFUN = NULL,
11 |   retrieveFeedURL = TRUE, ...)
12 | }
13 | \arguments{
14 | \item{feedurls}{urls from feeds to be retrieved}
15 | 
16 | \item{class}{class label to be assigned to \code{Source} object, defaults to "WebXMLSource"}
17 | 
18 | \item{reader}{function to be used to read content, see also \code{\link{readWeb}}}
19 | 
20 | \item{parser}{function to be used to split feed content into chunks, returns list of content elements}
21 | 
22 | \item{encoding}{specifies default encoding, defaults to 'UTF-8'}
23 | 
24 | \item{curlOpts}{a named list or CURLOptions object identifying the curl options for the handle. Type \code{listCurlOptions()} for all Curl options available.}
25 | 
26 | \item{postFUN}{function saved in WebSource object and called to retrieve full text content from feed urls}
27 | 
28 | \item{retrieveFeedURL}{logical; Specify if feedurls should be downloaded first.}
29 | 
30 | \item{...}{additional parameters passed to \code{WebSource} object/structure}
31 | }
32 | \value{
33 | WebSource
34 | }
35 | \description{
36 | WebSource is derived from \code{\link[tm]{Source}}. In addition to calling the
37 | base \code{\link[tm]{Source}} constructor function it also retrieves the specified
38 | feedurls and pre--parses the content with the parser function.
39 | The fields \code{$Content}, \code{$Feedurls} \code{$Parser} and \code{$CurlOpts} are finally
40 | added to the \code{Source} object.
41 | }
42 | \author{
43 | Mario Annau
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-yahoonews.R:
--------------------------------------------------------------------------------
 1 | context("YahooNewsSource")
 2 | 
 3 | test_that("YahooNewsSource",{
 4 | 	
 5 | 	lengthcorp <- 10
 6 | 		
 7 | 	testcorp <- WebCorpus(YahooNewsSource("Microsoft"))
 8 | 	# Check Corpus object
 9 | 	expect_that(length(testcorp), equals(lengthcorp))
10 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
11 | 	
12 | 	# Check Content
13 | 	#FIXME: No content is retrieved
14 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
15 | 	contentlength <- sapply(testcorp, function(x) 
16 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
17 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
18 | 	expect_that(contentratio > 0.5, is_true())
19 | 	
20 | 	# Check Meta Data
21 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
22 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
23 | 	
24 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
25 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
26 | 	
27 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
28 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
29 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
30 | 	
31 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
32 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
33 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
34 | 	
35 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
36 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
37 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
38 | 	
39 | 	testcorp <- testcorp[1:10]
40 | 	testcorp <- corpus.update(testcorp)
41 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
42 | 	
43 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
44 | })
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-googlenews.R:
--------------------------------------------------------------------------------
 1 | context("GoogleNewsSource")
 2 | 
 3 | test_that("GoogleNewsSource",{
 4 | 	
 5 | 	lengthcorp <- 30
 6 | 	query <- "Microsoft"
 7 | 		
 8 | 	testcorp <- WebCorpus(GoogleNewsSource(query, 
 9 | 					params = list(hl = "en", q = query, ie = "utf-8", 
10 | 							num = lengthcorp, output = "rss")))
11 | 	# Check Corpus object
12 | 	expect_that(length(testcorp), equals(lengthcorp))
13 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
14 | 	
15 | 	# Check Content
16 | 	contentlength <- sapply(testcorp, function(x) 
17 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
18 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
19 | 	expect_that(contentratio > 0.5, is_true())
20 | 	
21 | 	# Check Meta Data
22 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
23 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
24 | 	
25 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
26 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
27 | 	
28 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
29 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
30 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
31 | 	
32 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
33 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
34 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
35 | 	
36 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
37 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
38 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
39 | 	
40 | 	testcorp <- testcorp[1:10]
41 | 	testcorp <- corpus.update(testcorp)
42 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
43 | 	
44 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
45 | })
46 | 
47 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-yahoofinance.R:
--------------------------------------------------------------------------------
 1 | context("YahooFinanceSource")
 2 | 
 3 | test_that("YahooFinanceSource",{
 4 | 	
 5 | 	lengthcorp <- 20
 6 | 		
 7 | 	testcorp <- WebCorpus(YahooFinanceSource("MSFT"))
 8 | 	# Check Corpus object
 9 | 	#FIXME: Content in Yahoo Finance is not retrieved
10 | 	expect_that(length(testcorp), equals(lengthcorp))
11 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
12 | 	
13 | 	
14 | 	
15 | 	# Check Content
16 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
17 | 	contentlength <- sapply(testcorp, function(x) 
18 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
19 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
20 | 	expect_that(contentratio > 0.5, is_true())
21 | 	
22 | 	# Check Meta Data
23 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
24 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
25 | 	
26 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
27 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
28 | 	
29 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
30 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
31 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
32 | 	
33 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
34 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
35 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
36 | 	
37 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
38 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
39 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
40 | 	
41 | 	testcorp <- testcorp[1:10]
42 | 	testcorp <- corpus.update(testcorp)
43 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
44 | 	
45 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
46 | })
47 | 
48 | 


--------------------------------------------------------------------------------
/man/tm.plugin.webmining-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/tm.plugin.webmining-package.R
 3 | \docType{package}
 4 | \name{tm.plugin.webmining-package}
 5 | \alias{tm.plugin.webmining}
 6 | \alias{tm.plugin.webmining-package}
 7 | \alias{webmining}
 8 | \title{Retrieve structured, textual data from various web sources}
 9 | \description{
10 | tm.plugin.webmining facilitates the retrieval of textual data through various
11 | web feed formats like XML and JSON. Also direct retrieval from HTML
12 | is supported. As most (news) feeds only incorporate small fractions
13 | of the original text tm.plugin.webmining goes a step further and even
14 | retrieves and extracts the text of the original text source.
15 | Generally, the retrieval procedure can be described as a two--step process:
16 | \describe{
17 | \item{Meta Retrieval}{In a first step, all relevant meta feeds are retrieved.
18 | From these feeds all relevant meta data items are extracted.
19 | }
20 | \item{Content Retrieval}{In a second step the relevant source content is retrieved.
21 | Using the \code{boilerpipeR} package even the main content of \code{HTML} pages can
22 | be extracted.
23 | }}
24 | }
25 | \examples{
26 | \dontrun{
27 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
28 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft"))
29 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
30 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews"))
31 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT"))
32 | yahooinplay <- WebCorpus(YahooInplaySource())
33 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
34 | liberation <- WebCorpus(LiberationSource("latest"))
35 | }
36 | }
37 | \author{
38 | Mario Annau \email{mario.annau@gmail}
39 | }
40 | \seealso{
41 | \code{\link{WebCorpus}} \code{\link{GoogleFinanceSource}} \code{\link{GoogleNewsSource}} \code{\link{NYTimesSource}} \code{\link{ReutersNewsSource}} \code{\link{YahooFinanceSource}} \code{\link{YahooInplaySource}} \code{\link{YahooNewsSource}}
42 | }
43 | \keyword{package}
44 | 
45 | 


--------------------------------------------------------------------------------
/man/NYTimesSource.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/source.R
 3 | \name{NYTimesSource}
 4 | \alias{NYTimesSource}
 5 | \alias{readNYTimes}
 6 | \title{Get feed data from NYTimes Article Search (\url{http://developer.nytimes.com/docs/read/article_search_api_v2}).}
 7 | \usage{
 8 | NYTimesSource(query, n = 100, appid, sleep = 1, params = list(format =
 9 |   "json", q = query, page = 0:(ceiling(n/10) - 1), `api-key` = appid),
10 |   curlOpts = curlOptions(followlocation = TRUE, maxconnects = 10, maxredirs =
11 |   10, timeout = 30, connecttimeout = 30), ...)
12 | }
13 | \arguments{
14 | \item{query}{character specifying query to be used to search NYTimes articles}
15 | 
16 | \item{n}{number of items, defaults to 100}
17 | 
18 | \item{appid}{Developer App id to be used, obtained from \url{http://developer.nytimes.com/}}
19 | 
20 | \item{sleep}{integer; Seconds to sleep between feed retrieval.}
21 | 
22 | \item{params}{additional query parameters, specified as list, see \url{http://developer.nytimes.com/docs/read/article_search_api}}
23 | 
24 | \item{curlOpts}{CURLOptions; RCurl options used for feed retrieval.}
25 | 
26 | \item{...}{additional parameters to \code{\link{WebSource}}}
27 | }
28 | \description{
29 | Excerpt from the website: "With the NYTimes Article Search API, you can search New York Times articles
30 | from 1981 to today, retrieving headlines, abstracts, lead paragraphs, links to associated multimedia
31 | and other article metadata. Along with standard keyword searching, the API also offers faceted searching.
32 | The available facets include Times-specific fields such as sections, taxonomic classifiers and controlled
33 | vocabulary terms (names of people, organizations and geographic locations)."
34 | Feed retrieval is limited to 1000 items (or 100 pages).
35 | }
36 | \examples{
37 | \dontrun{
38 | #nytimes_appid needs to be specified
39 | corpus <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
40 | }
41 | }
42 | \author{
43 | Mario Annau
44 | }
45 | \seealso{
46 | \code{\link{WebSource}}, \code{\link{readNYTimes}}
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2 (4.1.1): do not edit by hand
 2 | 
 3 | S3method("[",WebCorpus)
 4 | S3method(corpus.update,WebCorpus)
 5 | S3method(encloseHTML,PlainTextDocument)
 6 | S3method(extract,PlainTextDocument)
 7 | S3method(getElem,WebJSONSource)
 8 | S3method(getElem,WebXMLSource)
 9 | S3method(getEmpty,WebCorpus)
10 | S3method(removeNonASCII,PlainTextDocument)
11 | S3method(source.update,WebXMLSource)
12 | export(GoogleFinanceSource)
13 | export(GoogleNewsSource)
14 | export(LiberationSource)
15 | export(NYTimesSource)
16 | export(ReutersNewsSource)
17 | export(WebCorpus)
18 | export(WebSource)
19 | export(YahooFinanceSource)
20 | export(YahooInplaySource)
21 | export(YahooNewsSource)
22 | export(corpus.update)
23 | export(encloseHTML)
24 | export(extract)
25 | export(extractContentDOM)
26 | export(extractHTMLStrip)
27 | export(feedquery)
28 | export(getEmpty)
29 | export(getLinkContent)
30 | export(json_content)
31 | export(parse)
32 | export(readGoogle)
33 | export(readLiberationSource)
34 | export(readNYTimes)
35 | export(readReutersNews)
36 | export(readWeb)
37 | export(readWebHTML)
38 | export(readWebJSON)
39 | export(readWebXML)
40 | export(readYahoo)
41 | export(readYahooHTML)
42 | export(readYahooInplay)
43 | export(removeNonASCII)
44 | export(removeTags)
45 | export(source.update)
46 | export(trimWhiteSpaces)
47 | importFrom(NLP,"content<-")
48 | importFrom(NLP,"meta<-")
49 | importFrom(NLP,content)
50 | importFrom(NLP,meta)
51 | importFrom(RCurl,curlEscape)
52 | importFrom(RCurl,curlOptions)
53 | importFrom(RCurl,getURL)
54 | importFrom(RJSONIO,fromJSON)
55 | importFrom(XML,addAttributes)
56 | importFrom(XML,free)
57 | importFrom(XML,getNodeSet)
58 | importFrom(XML,htmlTreeParse)
59 | importFrom(XML,newXMLNamespace)
60 | importFrom(XML,removeNodes)
61 | importFrom(XML,saveXML)
62 | importFrom(XML,toString.XMLNode)
63 | importFrom(XML,xmlApply)
64 | importFrom(XML,xmlChildren)
65 | importFrom(XML,xmlInternalTreeParse)
66 | importFrom(XML,xmlValue)
67 | importFrom(XML,xpathSApply)
68 | importFrom(boilerpipeR,ArticleExtractor)
69 | importFrom(tm,Corpus)
70 | importFrom(tm,FunctionGenerator)
71 | importFrom(tm,PlainTextDocument)
72 | importFrom(tm,SimpleSource)
73 | importFrom(tm,eoi)
74 | importFrom(tm,getElem)
75 | importFrom(tm,reader)
76 | importFrom(tm,stepNext)
77 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # tm.plugin.webmining
 2 | [![Build Status](https://travis-ci.org/mannau/tm.plugin.webmining.svg?branch=master)](https://travis-ci.org/mannau/tm.plugin.webmining) [![codecov.io](http://codecov.io/github/mannau/tm.plugin.webmining/coverage.svg?branch=master)](http://codecov.io/github/mannau/tm.plugin.webmining?branch=master) [![License](http://img.shields.io/badge/license-GPL%20%28%3E=%203%29-blue.svg?style=flat)](http://www.gnu.org/licenses/gpl-3.0.html)
 3 | 
 4 | **tm.plugin.webmining** is an R-package which facilitates text retrieval from feed formats like XML (RSS, ATOM) and JSON. Also direct retrieval from HTML is supported. As most (news) feeds only incorporate small fractions of the original text **tm.plugin.webmining** even extracts the text from the original text source.
 5 | 
 6 | ## Install
 7 | To install the [latest version from CRAN](http://cran.r-project.org/web/packages/tm.plugin.webmining/index.html) simply 
 8 | ```python
 9 | install.packages("tm.plugin.webmining")
10 | ```
11 | 
12 | Using the **devtools** package you can easily install the latest development version of **tm.plugin.webmining** from github with
13 | 
14 | ```python
15 | library(devtools)
16 | install_github("mannau/tm.plugin.webmining")
17 | ```
18 | 
19 | Windows users need to use the following command to install from github:
20 | 
21 | ```python
22 | library(devtools)
23 | install_github("mannau/boilerpipeR", args = "--no-multiarch")
24 | ```
25 | 
26 | ## Usage
27 | The next snippet shows how to download and extract the main text from all supported sources as WebCorpus objects including a rich set of metadata like *Author*, *DateTimeStamp* or *Source*:
28 | 
29 | ```python
30 | library(tm.plugin.webmining)
31 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
32 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft"))
33 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = "<nytimes_appid>"))
34 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews"))
35 | #twitter <- WebCorpus(TwitterSource("Microsoft")) -> not supported yet
36 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT"))
37 | yahooinplay <- WebCorpus(YahooInplaySource())
38 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
39 | liberation <- WebCorpus(LiberationSource("latest"))
40 | ```
41 | 
42 | ## License
43 | **tm.plugin.webmining** is released under the [GNU General Public License Version 3](http://www.gnu.org/copyleft/gpl.html)
44 | 


--------------------------------------------------------------------------------
/tests/testthat/test-source-nytimes.R:
--------------------------------------------------------------------------------
 1 | context("NYTimesSource")
 2 | 
 3 | data(nytimes_appid)
 4 | 
 5 | test_that("NYTimesSource",{
 6 | 	
 7 | 	lengthcorp <- 200
 8 | 	
 9 | 	if(!exists(as.character(substitute(nytimes_appid)))){
10 | 		cat("No Variable nytimes_appid provided. Skipping Test...\n")
11 | 		return()
12 | 	}
13 | 	
14 | 	testcorp <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid, n = lengthcorp))
15 | 	# Check Corpus object
16 | 	expect_that(length(testcorp), equals(lengthcorp))
17 | 	expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus")))
18 | 	
19 | 	# Check Content
20 | 	#expect_that(all(sapply(testcorp, nchar) > 0), is_true())
21 | 	contentlength <- sapply(testcorp, function(x) 
22 | 				if( length(content(x)) < 1) 0 else nchar(content(x)))	
23 | 	contentratio <- length(which(contentlength > 0)) / length(testcorp)
24 | 	expect_that(contentratio > 0.5, is_true())
25 | 	
26 | 	# Check Meta Data
27 | 	datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp"))
28 | 	expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true())
29 | 	
30 | 	description <- lapply(testcorp, function(x) meta(x, "description"))
31 | 	expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true())
32 | 	expect_that(all(sapply(description, nchar) > 0), is_true())
33 | 	
34 | 	heading <- lapply(testcorp, function(x) meta(x, "heading"))
35 | 	expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true())
36 | 	expect_that(all(sapply(heading, nchar) > 0), is_true())
37 | 	
38 | 	id <- lapply(testcorp, function(x) meta(x, "id"))
39 | 	expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true())
40 | 	expect_that(all(sapply(id, nchar) > 0), is_true())
41 | 	
42 | 	language <- lapply(testcorp, function(x) meta(x, "language"))
43 | 	expect_that(all(sapply(language, function(x) class(x)[1] == "character")), is_true())
44 | 	expect_that(all(sapply(language, nchar) > 0), is_true())
45 | 	
46 | 	origin <- lapply(testcorp, function(x) meta(x, "origin"))
47 | 	expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true())
48 | 	expect_that(all(sapply(origin, nchar) > 0), is_true())
49 | 	
50 | 	testcorp <- testcorp[1:10]
51 | 	testcorp <- corpus.update(testcorp)
52 | 	expect_that(length(testcorp) >= lengthcorp, is_true())
53 | 	
54 | 	cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100))
55 | })
56 | 
57 | 


--------------------------------------------------------------------------------
/man/getLinkContent.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.1.1): do not edit by hand
 2 | % Please edit documentation in R/getLinkContent.R
 3 | \name{getLinkContent}
 4 | \alias{getLinkContent}
 5 | \title{Get main content for corpus items, specified by links.}
 6 | \usage{
 7 | getLinkContent(corpus, links = sapply(corpus, meta, "origin"),
 8 |   timeout.request = 30, chunksize = 20, verbose = getOption("verbose"),
 9 |   curlOpts = curlOptions(verbose = FALSE, followlocation = TRUE, maxconnects =
10 |   5, maxredirs = 20, timeout = timeout.request, connecttimeout =
11 |   timeout.request, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE, useragent =
12 |   "R", cookiejar = tempfile()), retry.empty = 3, sleep.time = 3,
13 |   extractor = ArticleExtractor, .encoding = integer(), ...)
14 | }
15 | \arguments{
16 | \item{corpus}{object of class \code{\link[tm]{Corpus}} for which link content should be downloaded}
17 | 
18 | \item{links}{character vector specifyinig links to be used for download, defaults to
19 | sapply(corpus, meta, "Origin")}
20 | 
21 | \item{timeout.request}{timeout (in seconds) to be used for connections/requests, defaults to 30}
22 | 
23 | \item{chunksize}{Size of download chunks to be used for parallel retrieval, defaults to 20}
24 | 
25 | \item{verbose}{Specifies if retrieval info should be printed, defaults to getOption("verbose")}
26 | 
27 | \item{curlOpts}{curl options to be passed to \code{\link{getURL}}}
28 | 
29 | \item{retry.empty}{Specifies number of times empty content sites should be retried, defaults to 3}
30 | 
31 | \item{sleep.time}{Sleep time to be used between chunked download, defaults to 3 (seconds)}
32 | 
33 | \item{extractor}{Extractor to be used for content extraction, defaults to extractContentDOM}
34 | 
35 | \item{.encoding}{encoding to be used for \code{\link{getURL}}, defaults to integer() (=autodetect)}
36 | 
37 | \item{...}{additional parameters to \code{\link{getURL}}}
38 | }
39 | \value{
40 | corpus including downloaded link content
41 | }
42 | \description{
43 | \code{getLinkContent} downloads and extracts content from weblinks for \code{\link[tm]{Corpus}} objects.
44 | Typically it is integrated and called as a post-processing function (field:\code{$postFUN}) for most \code{\link{WebSource}}
45 | objects. \code{getLinkContent} implements content download in chunks which has been proven to be a stabler approach for
46 | large content requests.
47 | }
48 | \seealso{
49 | \code{\link{WebSource}} \code{\link[RCurl]{getURL}} \code{\link[boilerpipeR]{Extractor}}
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | Rscript := $(shell whereis Rscript) --vanilla -e
 2 | R := $(shell whereis R)
 3 | 
 4 | PKG_VERSION := $(shell grep -i ^version DESCRIPTION | cut -d : -d \  -f 2)
 5 | PKG_NAME := $(shell grep -i ^package DESCRIPTION | cut -d : -d \  -f 2)
 6 | 
 7 | #DATA_FILES := $(wildcard data/*.rda)
 8 | R_FILES := $(wildcard R/*.R)
 9 | TEST_FILES := $(wildcard tests/*.R) $(wildcard tests/testthat/*.R)
10 | #ALL_SRC_FILES := $(wildcard src/*.cpp) $(wildcard src/*.h) src/Makevars
11 | #SRC_FILES := $(filter-out src/RcppExports.cpp, $(ALL_SRC_FILES))
12 | #HEADER_FILES := $(wildcard src/*.h)
13 | #RCPPEXPORTS := src/RcppExports.cpp R/RcppExports.R
14 | ROXYGENFILES := $(wildcard man/*.Rd) NAMESPACE 
15 | PKG_FILES := DESCRIPTION $(ROXYGENFILES) $(R_FILES) $(TEST_FILES)
16 | #OBJECTS := $(wildcard src/*.o) $(wildcard src/*.o-*) $(wildcard src/*.dll) $(wildcard src/*.so) $(wildcard src/*.rds)
17 | CHECKPATH := $(PKG_NAME).Rcheck
18 | CHECKLOG := `cat $(CHECKPATH)/00check.log`
19 | 
20 | .PHONY: all build build-cran check check-cran manual install clean compileAttributes 
21 | 
22 | all: 
23 | 	install
24 | 	
25 | build: $(PKG_NAME)_$(PKG_VERSION).tar.gz
26 | 
27 | build-cran:
28 | 	@make clean
29 | 	@make roxygen
30 | 	@cp tests/testthat.R tests/testthat.R.temp
31 | 	@cp tests/testthat.R.cran tests/testthat.R
32 | 	$(R) CMD build --resave-data .
33 | 	@cp tests/testthat.R.temp tests/testthat.R
34 | 	@rm tests/testthat.R.temp
35 | 	
36 | $(PKG_NAME)_$(PKG_VERSION).tar.gz: $(PKG_FILES)
37 | 	@make roxygen
38 | 	$(R) CMD build --resave-data --no-build-vignettes .
39 | 
40 | roxygen: $(R_FILES)
41 | 	$(Rscript) 'library(roxygen2); roxygenize()'
42 | 	
43 | check: $(PKG_NAME)_$(PKG_VERSION).tar.gz 
44 | 	@rm -rf $(CHECKPATH)
45 | 	$(R) CMD check --no-multiarch --no-manual --no-clean $(PKG_NAME)_$(PKG_VERSION).tar.gz
46 | 
47 | check-cran: 
48 | 	@make build-cran
49 | 	@rm -rf $(CHECKPATH)
50 | 	$(R) CMD check --as-cran --no-clean $(PKG_NAME)_$(PKG_VERSION).tar.gz
51 | 
52 | 00check.log: check
53 | 	@mv $(CHECKPATH)\\00check.log .
54 | 	@rm -rf $(CHECKPATH)
55 | 
56 | manual: $(PKG_NAME)-manual.pdf
57 | 
58 | $(PKG_NAME)-manual.pdf: $(ROXYGENFILES)
59 | 	$(R) CMD Rd2pdf --no-preview -o $(PKG_NAME)-manual.pdf .
60 | 	
61 | install: $(PKG_NAME)_$(PKG_VERSION).tar.gz
62 | 	$(R) CMD INSTALL --no-multiarch --byte-compile $(PKG_NAME)_$(PKG_VERSION).tar.gz
63 | 
64 | clean:
65 | 	@rm -rf $(wildcard *.Rcheck)
66 | 	@rm -f $(wildcard *.tar.gz)
67 | 	@echo '*** PACKAGE CLEANUP COMPLETE ***'
68 | 


--------------------------------------------------------------------------------
/R/tm.plugin.webmining-package.R:
--------------------------------------------------------------------------------
 1 | #' tm.plugin.webmining facilitates the retrieval of textual data through various 
 2 | #' web feed formats like XML and JSON. Also direct retrieval from HTML 
 3 | #' is supported. As most (news) feeds only incorporate small fractions
 4 | #' of the original text tm.plugin.webmining goes a step further and even
 5 | #' retrieves and extracts the text of the original text source.
 6 | #' Generally, the retrieval procedure can be described as a two--step process:
 7 | #' \describe{
 8 | #' \item{Meta Retrieval}{In a first step, all relevant meta feeds are retrieved.
 9 | #' From these feeds all relevant meta data items are extracted.
10 | #' }
11 | #' \item{Content Retrieval}{In a second step the relevant source content is retrieved.
12 | #' Using the \code{boilerpipeR} package even the main content of \code{HTML} pages can
13 | #' be extracted.
14 | #' }}
15 | #' 
16 | #' @name tm.plugin.webmining-package
17 | #' @aliases tm.plugin.webmining webmining
18 | #' @docType package
19 | #' @title Retrieve structured, textual data from various web sources
20 | #' @author Mario Annau \email{mario.annau@@gmail}
21 | #' @keywords package
22 | #' @seealso \code{\link{WebCorpus}} \code{\link{GoogleFinanceSource}} \code{\link{GoogleNewsSource}} \code{\link{NYTimesSource}} \code{\link{ReutersNewsSource}} \code{\link{YahooFinanceSource}} \code{\link{YahooInplaySource}} \code{\link{YahooNewsSource}} 
23 | #' @examples
24 | #' \dontrun{
25 | #' googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
26 | #' googlenews <- WebCorpus(GoogleNewsSource("Microsoft"))
27 | #' nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
28 | #' reutersnews <- WebCorpus(ReutersNewsSource("businessNews"))
29 | #' yahoofinance <- WebCorpus(YahooFinanceSource("MSFT"))
30 | #' yahooinplay <- WebCorpus(YahooInplaySource())
31 | #' yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
32 | #' liberation <- WebCorpus(LiberationSource("latest"))
33 | #' }
34 | NULL
35 | 
36 | #' WebCorpus retrieved from Yahoo! News for the search term "Microsoft"
37 | #' through the YahooNewsSource. Length of retrieved corpus is 20.
38 | #' @name yahoonews
39 | #' @docType data
40 | #' @author Mario Annau
41 | #' @keywords data
42 | #' @examples
43 | #' #Data set has been generated as follows:
44 | #' \dontrun{
45 | #' yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
46 | #' }
47 | NULL
48 | 
49 | #' AppID for the NYtimes-API.
50 | #' 
51 | #' USED ONLY FOR PACKAGE TESTING. PLEASE DOWNLOAD YOUR OWN KEY AT \url{http://developer.nytimes.com/}!!!
52 | #' @name nytimes_appid
53 | #' @docType data
54 | #' @author Mario Annau
55 | #' @keywords data
56 | NULL
57 | 


--------------------------------------------------------------------------------
/R/getLinkContent.R:
--------------------------------------------------------------------------------
  1 | #' @title Get main content for corpus items, specified by links. 
  2 | #' @description \code{getLinkContent} downloads and extracts content from weblinks for \code{\link[tm]{Corpus}} objects.
  3 | #' Typically it is integrated and called as a post-processing function (field:\code{$postFUN}) for most \code{\link{WebSource}}
  4 | #' objects. \code{getLinkContent} implements content download in chunks which has been proven to be a stabler approach for
  5 | #' large content requests. 
  6 | #' @param corpus object of class \code{\link[tm]{Corpus}} for which link content should be downloaded
  7 | #' @param links character vector specifyinig links to be used for download, defaults to 
  8 | #' sapply(corpus, meta, "Origin")
  9 | #' @param timeout.request timeout (in seconds) to be used for connections/requests, defaults to 30
 10 | #' @param curlOpts curl options to be passed to \code{\link{getURL}}
 11 | #' @param chunksize Size of download chunks to be used for parallel retrieval, defaults to 20
 12 | #' @param verbose Specifies if retrieval info should be printed, defaults to getOption("verbose")
 13 | #' @param retry.empty Specifies number of times empty content sites should be retried, defaults to 3
 14 | #' @param sleep.time Sleep time to be used between chunked download, defaults to 3 (seconds)
 15 | #' @param extractor Extractor to be used for content extraction, defaults to extractContentDOM
 16 | #' @param ... additional parameters to \code{\link{getURL}}
 17 | #' @param .encoding encoding to be used for \code{\link{getURL}}, defaults to integer() (=autodetect)
 18 | #' @return corpus including downloaded link content
 19 | #' @seealso \code{\link{WebSource}} \code{\link[RCurl]{getURL}} \code{\link[boilerpipeR]{Extractor}} 
 20 | #' @importFrom NLP content
 21 | #' @importFrom RCurl getURL
 22 | #' @export
 23 | getLinkContent <- function(corpus, links = sapply(corpus, meta, "origin"),
 24 | 		timeout.request = 30, chunksize = 20, verbose = getOption("verbose"),
 25 | 		curlOpts = curlOptions(verbose = FALSE,
 26 | 				followlocation = TRUE, 
 27 | 				maxconnects = 5,
 28 | 				maxredirs = 20,
 29 | 				timeout = timeout.request,
 30 | 				connecttimeout = timeout.request,
 31 | 				ssl.verifyhost=FALSE,
 32 | 				ssl.verifypeer = FALSE,
 33 | 				useragent = "R", 
 34 | 				cookiejar = tempfile()),  
 35 | 		retry.empty = 3, 
 36 | 		sleep.time = 3, 
 37 | 		extractor = ArticleExtractor, 
 38 | 		.encoding = integer(),
 39 | 		...){
 40 | 	
 41 | 	if(length(corpus) != length(links))
 42 | 		stop("Corpus length not equal to links length\n")
 43 | 	
 44 | 	#content_urls <- unlist(sapply(content_parsed, linkreader))
 45 | 	if(verbose){
 46 | 		cat("Starting URL Download ...\n")
 47 | 	}
 48 | 	retries <- 0
 49 | 	while(any(empty <- sapply(corpus, function(x) identical(content(x), character(0)))) & (retries <= retry.empty)){
 50 | 			retries <- retries + 1
 51 | 			emptycontent.ids <- which(empty)
 52 | 			
 53 | 			if(verbose){
 54 | 				cat("Run ", retries, ", retrieving ", length(emptycontent.ids), " content items\n")
 55 | 			}
 56 | 			
 57 | 			#for(cstart in seq(from = 1, to =  length(links), by = chunksize)){
 58 | 			for(cstart in seq(from = 1, to =  length(emptycontent.ids), by = chunksize)){
 59 | 				if(sleep.time > 0){
 60 | 					if(verbose){
 61 | 						cat("Sleeping ", sleep.time, " seconds...\n")
 62 | 					}
 63 | 					Sys.sleep(sleep.time)
 64 | 				}
 65 | 				
 66 | 				cend <- min(cstart[1] + chunksize-1, length(emptycontent.ids))
 67 | 				chunk.ids <- emptycontent.ids[cstart:cend]
 68 | 				chunk <- links[chunk.ids]
 69 | 				
 70 | 				# TODO Enable chunk download
 71 | 				content <- tryCatch({
 72 | 							getURL(chunk, .opts = curlOpts, .encoding = .encoding, ...)
 73 | 						},
 74 | 						error=function(e){
 75 | 							print(e)
 76 | 							# TODO: Check if single retrieval part is really necessary
 77 | 							cat("\nError on retrieval, single retrieval fallback... \n")
 78 | 							content <- list()
 79 | 							for(i in 1:length(chunk)){
 80 | 								content[[i]] <- tryCatch({
 81 | 											getURL(chunk[i], .opts = curlOpts, .encoding = .encoding, ...)
 82 | 										},error = function(f) {
 83 | 											print(f)
 84 | 											""})
 85 | 							}
 86 | 							#cat("Done\n")
 87 | 							do.call(c, content)})
 88 | 				
 89 | 				
 90 | 				# Extract Content
 91 | 				extract <- sapply(content, extractor)
 92 | 
 93 | 				# Put Content Into Corpus
 94 | 				for(i in 1:length(chunk.ids)){
 95 | 					cid <- chunk.ids[i]
 96 | 					content(corpus[[cid]]) <- extract[i]
 97 | 					
 98 | 				}
 99 | 				if(verbose){
100 | 					progress <- floor(cend/length(links)*100)
101 | 					cat(paste(progress, "% (",cend,"/",length(emptycontent.ids), ") ", Sys.time(), "\n",sep = ""))
102 | 				}
103 | 			}
104 | 	}
105 | 	corpus
106 | }


--------------------------------------------------------------------------------
/R/corpus.R:
--------------------------------------------------------------------------------
  1 | #' @title WebCorpus constructor function.
  2 | #' @description \code{WebCorpus} adds further methods and meta data to \code{\link[tm]{Corpus}} and therefore
  3 | #' constructs a derived class of \code{\link[tm]{Corpus}}. Most importantly, \code{WebCorpus}
  4 | #' calls \code{$PostFUN} on the generated \code{WebCorpus}, which retrieves the main content
  5 | #' for most implemented \code{WebSource}s. Thus it enables an efficient retrieval of new feed items
  6 | #' (\code{\link{corpus.update}}). All additional WebCorpus fields are added to \code{tm$meta}
  7 | #' like \code{$source}, \code{$readerControl} and \code{$postFUN}.
  8 | #' @param x object of type Source, see also \code{\link{Corpus}}
  9 | #' @param readerControl specifies reader to be used for \code{Source}, defaults to
 10 | #' list(reader = x$DefaultReader, language = "en"
 11 | #' @param postFUN function to be applied to WebCorpus after web retrieval has been completed,
 12 | #' defaults to x$PostFUN
 13 | #' @param retryEmpty specifies if retrieval for empty content elements should be repeated, 
 14 | #' defaults to TRUE
 15 | #' @param ... additional parameters for Corpus function (actually Corpus reader)
 16 | #' @importFrom tm Corpus reader getElem stepNext eoi SimpleSource
 17 | #' @export
 18 | WebCorpus <- function(x, readerControl = list(reader = reader(x), language = "en"),
 19 |     postFUN = x$postFUN, retryEmpty = TRUE, ...)
 20 | {
 21 |   stopifnot(inherits(x, "WebSource"))
 22 |   
 23 |   readerControl <- prepareReader(readerControl, reader(x))
 24 |   
 25 |   if (is.function(readerControl$init))
 26 |     readerControl$init()
 27 |   
 28 |   if (is.function(readerControl$exit))
 29 |     on.exit(readerControl$exit())
 30 |   
 31 |   tdl <- vector("list", length(x))
 32 |   counter <- 1
 33 |   while (!eoi(x)) {
 34 |     x <- stepNext(x)
 35 |     elem <- getElem(x)
 36 |     doc <- readerControl$reader(elem,
 37 |         readerControl$language,
 38 |         as.character(counter))
 39 |     tdl[[counter]] <- doc
 40 |     counter <- counter + 1
 41 |   }
 42 | 
 43 |   corpus <- structure(list(content = tdl,
 44 |           meta = CorpusMeta(source = x, readerControl = readerControl, postFUN = postFUN),
 45 |           dmeta = data.frame(row.names = seq_along(tdl))),
 46 |       class = c("WebCorpus", "VCorpus", "Corpus"))
 47 |   if(retryEmpty){
 48 |     corpus <- getEmpty(corpus)
 49 |   }
 50 |   corpus
 51 | }
 52 | 
 53 | # TODO: Tell Ingo to export CorpusMeta
 54 | CorpusMeta <-
 55 |     function(..., meta = NULL)
 56 | {
 57 |   if (is.null(meta))
 58 |     meta <- list(...)
 59 |   
 60 |   stopifnot(is.list(meta))
 61 |   
 62 |   structure(meta, class = "CorpusMeta")
 63 | }
 64 | 
 65 | # TODO: Tell Ingo to export prepareReader
 66 | prepareReader <- 
 67 | function(readerControl, reader = NULL, ...)
 68 | {
 69 |   if (is.null(readerControl$reader))
 70 |     readerControl$reader <- reader
 71 |   if (inherits(readerControl$reader, "FunctionGenerator"))
 72 |     readerControl$reader <- readerControl$reader(...)
 73 |   if (is.null(readerControl$language))
 74 |     readerControl$language <- "en"
 75 |   readerControl
 76 | }
 77 | 
 78 | 
 79 | #' @noRd
 80 | #' @export
 81 | `[.WebCorpus` <- function(x, i) {
 82 | 	if (missing(i)) return(x)
 83 | 	corpus <- NextMethod("[")
 84 | 	class(corpus) <- c("WebCorpus", class(corpus))
 85 | 	corpus
 86 | }
 87 | 
 88 | #' @title Update/Extend \code{\link{WebCorpus}} with new feed items.
 89 | #' @description The \code{corpus.update} method ensures, that the original 
 90 | #' \code{\link{WebCorpus}} feed sources are downloaded and checked against
 91 | #' already included \code{TextDocument}s. Based on the \code{ID} included
 92 | #' in the  \code{TextDocument}'s meta data, only new feed elements are
 93 | #' downloaded and added to the \code{\link{WebCorpus}}.
 94 | #' All relevant information regariding the original source feeds are stored
 95 | #' in the \code{\link{WebCorpus}}' meta data (\code{\link[tm]{meta}}).
 96 | #' @param x object of type \code{\link{WebCorpus}}
 97 | #' @param ... 
 98 | #' \describe{
 99 | #' \item{fieldname}{name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID"}
100 | #' \item{retryempty}{specifies if empty corpus elements should be downloaded again, defaults to TRUE}
101 | #' \item{...}{additional parameters to \code{\link{Corpus}} function}
102 | #' }
103 | #' @export corpus.update
104 | #' @aliases corpus.update.WebCorpus
105 | corpus.update <- function(x, ...){
106 | 	UseMethod("corpus.update", x)	
107 | }
108 | 
109 | #' Update/Extend \code{\link{WebCorpus}} with new feed items.
110 | #' @param x \code{\link{WebCorpus}}
111 | #' @param fieldname name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID"
112 | #' @param retryempty specifies if empty corpus elements should be downloaded again, defaults to TRUE
113 | #' @param ... additional parameters to \code{\link{Corpus}} function
114 | #' @importFrom tm Corpus
115 | #' @importFrom NLP meta
116 | #' @noRd
117 | #' @export
118 | corpus.update.WebCorpus <- 
119 | function(x, fieldname = "id", retryempty = TRUE, verbose = FALSE, ...) {
120 | 	cm <- x$meta
121 | 	
122 | 	newsource <- source.update(cm$source)
123 | 	
124 |   #WebCorpus
125 | 	newcorpus <- WebCorpus(newsource, readerControl = cm$MetaData$ReaderControl, 
126 |       retryEmpty = FALSE, ...)
127 | 	#intersect on ID
128 | 	id_old <- sapply(x, meta, fieldname)
129 | 	if(any(sapply(id_old, length) == 0))
130 | 		stop(paste("Not all elements in corpus to update have field '", fieldname, "' defined", sep = ""))
131 | 
132 | 	id_new <- sapply(newcorpus, meta, fieldname)
133 | 	if(any(sapply(id_new, length) == 0))
134 | 		stop(paste("Not all elements in corpus to update have field '", fieldname, "' defined", sep = ""))
135 | 	
136 | 	newcorpus <- newcorpus[!id_new %in% id_old]
137 | 	
138 | 	if(length(newcorpus) > 0){
139 | 		if(!is.null(cm$postFUN)){
140 | 			newcorpus <- cm$postFUN(newcorpus)
141 | 		}
142 | 		corpus <- c(x, newcorpus)
143 | 		#attr(corpus, "CMetaData") <- CMetaData(x)
144 | 		class(corpus) <- c("WebCorpus", class(corpus))
145 | 	}else{
146 | 		corpus <- x
147 | 	}
148 | 	
149 | 	if(retryempty){
150 | 		corpus <- getEmpty(corpus)
151 | 	}
152 | 	
153 | 	if(verbose){
154 | 		cat(length(newcorpus), " corpus items added.\n")
155 | 	}
156 | 		
157 | 	corpus
158 | }
159 | 
160 | 
161 | #' @title Retrieve Empty Corpus Elements through \code{$postFUN}. 
162 | #' @description Retrieve content of all empty (textlength equals zero) corpus elements. If 
163 | #' corpus element is empty, \code{$postFUN} is called (specified in \code{\link{meta}})
164 | #' @param x object of type \code{\link{WebCorpus}}
165 | #' @param ... additional parameters to PostFUN
166 | #' @seealso \code{\link{WebCorpus}}
167 | #' @export getEmpty
168 | #' @aliases getEmpty.WebCorpus
169 | getEmpty <- function(x, ...){
170 | 	UseMethod("getEmpty", x)	
171 | }
172 | 	
173 | 
174 | 
175 | #' @importFrom NLP content
176 | #' @noRd
177 | #' @export
178 | getEmpty.WebCorpus <- 
179 | function(x, nChar = 0, ...){
180 | 	cm <- x$meta
181 | 	noContent <- which(sapply(x, function(y){
182 |             cy <- content(y)
183 |             if(length(cy) == 0L) 0
184 |             else nchar(content(y)) 
185 |           }) <= nChar)
186 | 	if(length(noContent) > 0){
187 | 		corp_nocontent <- x[noContent]
188 | 		if(!is.null(cm$postFUN)){
189 | 			corp_nocontent <- cm$postFUN(corp_nocontent, ...)
190 | 		}
191 |     # TODO: stupid construct because direct assignment of corpus does not work
192 |     for(i in 1:length(noContent)){
193 |       x[[noContent[i]]] <- corp_nocontent[[i]]
194 |     }
195 | 	}
196 | 	x
197 | }
198 | 
199 | 
200 | 


--------------------------------------------------------------------------------
/vignettes/ShortIntro.Rnw:
--------------------------------------------------------------------------------
  1 | \documentclass[a4paper]{article}
  2 | \usepackage{Sweave}
  3 | \usepackage[margin=2cm]{geometry}
  4 | \usepackage[round]{natbib}
  5 | \usepackage{url}
  6 | \usepackage{hyperref}
  7 | \usepackage{listings}
  8 | 
  9 | \let\code=\texttt
 10 | \newcommand{\acronym}[1]{\textsc{#1}}
 11 | \newcommand{\class}[1]{\mbox{\textsf{#1}}}
 12 | \newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
 13 | \newcommand{\proglang}[1]{\textsf{#1}}
 14 | \newcommand{\fkt}[1]{\code{#1()}}
 15 | \newcommand{\todo}[1]{\begin{center}\code{<TODO: #1>}\end{center}}    
 16 | \newcommand{\field}[1]{\code{\$#1}} 
 17 |  
 18 | \sloppy
 19 | %% \VignetteIndexEntry{Introduction to the tm.plugin.webmining Package}
 20 | \SweaveOpts{prefix.string=webmining} 
 21 | \SweaveOpts{include=FALSE}
 22 | 
 23 | 
 24 | \begin{document}
 25 | 
 26 | <<Init_hidden,echo=FALSE,eval=T, results=hide>>=
 27 | library(tm)
 28 | library(tm.plugin.webmining)
 29 | data(yahoonews)
 30 | options(width = 60)
 31 | @
 32 |  
 33 | \title{Short Introduction to \pkg{tm.plugin.webmining}}
 34 | \author{Mario Annau\\
 35 | 		\texttt{mario.annau@gmail.com}}
 36 | 
 37 | \maketitle
 38 |    
 39 | \abstract{
 40 | This vignette gives a short introduction to \pkg{tm.plugin.webmining} which
 41 | facilitates the retrieval of textual data from the web. The main focus of
 42 | \pkg{tm.plugin.webmining} is the retrieval of web content from structured news
 43 | feeds in the \proglang{XML} (\proglang{RSS}, \proglang{ATOM}) and
 44 | \proglang{JSON} format. Additionally, retrieval and extraction of
 45 | \proglang{HTML} documents is implemented. Numerous data sources are currently
 46 | supported through public feeds/APIs, including Google-- and Yahoo! News,
 47 | Reuters and the New York Times.
 48 | }
 49 | 
 50 |   
 51 | \section{Getting Started}
 52 | After package installation we make the functionality of
 53 | \pkg{tm.plugin.webmining} available through
 54 | 
 55 | <<echo=T, eval=F>>=
 56 | library(tm)
 57 | library(tm.plugin.webmining)
 58 | @
 59 | 
 60 | \pkg{tm.plugin.webmining} depends on numerous packages, most
 61 | importantly \pkg{tm} by \cite{hornik:Feinerer+Hornik+Meyer:2008} for text
 62 | mining capabilities and data structures.
 63 | \pkg{RCurl} functions are used for web data retrieval and \pkg{XML} for the 
 64 | extraction of \proglang{XML}/\proglang{HTML} based feeds.
 65 | As a first experiment, we can retrieve a \class{(Web-)Corpus} using data from
 66 | Yahoo! News and the search query \code{"Microsoft"}:
 67 | 
 68 | <<echo=T, eval=F>>=
 69 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
 70 | @ 
 71 | 
 72 | Users already familiar with \pkg{tm}
 73 | will notice the different function call \fkt{WebCorpus} for corpus construction. Like
 74 | \pkg{tm}'s \fkt{Corpus} constructor it takes a \class{(Web-)Source} object as
 75 | input and constructs a \class{(Web-)Corpus} object.
 76 | A Review of the object's \fkt{class}
 77 | 
 78 | <<echo=T, eval=T>>=
 79 | class(yahoonews)
 80 | @ 
 81 | 
 82 | reveals, that \class{WebCorpus} is directly derived from \class{Corpus} and adds
 83 | further functionality to it. It can therefore be used like a "normal"
 84 | \class{Corpus} using \pkg{tm}'s text mining capabilities.
 85 | 
 86 | <<echo=T, eval=T>>=
 87 | yahoonews
 88 | @ 
 89 | 
 90 | Under the hood, a call of \fkt{YahooNewsSource} retrieves a data feed from
 91 | Yahoo! News and pre--parses its contents.
 92 | Subsequently, \fkt{WebCorpus} extracts (meta--)data from the \class{WebSource}
 93 | object and also downloads and extracts the actual main content
 94 | of the news item (most commonly an \proglang{HTML}--Webpage).
 95 | In effect, it implements a two--step procedure to
 96 | 
 97 | \begin{enumerate}
 98 | \item Download meta data from the feed (through \class{WebSource})
 99 | \item Download and extract main content for the feed item (through
100 | \class{WebCorpus})
101 | \end{enumerate}
102 | 
103 | These procedures ensure that the resulting \class{WebCorpus} not only includes
104 | a rich set of meta data but also the full main text content for text mining
105 | purposes. An examination of the meta data for the first element in the corpus
106 | is shown below.
107 | 
108 | <<echo=F, eval=T>>=
109 | # Little hack to restrict output width
110 | meta(yahoonews[[1]], "description") <- 
111 | 		paste(substring(meta(yahoonews[[1]], "description"), 1, 70), "...", sep = "")
112 | meta(yahoonews[[1]], "id") <- 
113 | 		paste(substring(meta(yahoonews[[1]], "id"), 1, 70), "...", sep = "")
114 | meta(yahoonews[[1]], "origin") <- 
115 | 		paste(substring(meta(yahoonews[[1]], "origin"), 1, 70), "...", sep = "")
116 | @
117 | <<echo=T, eval=T>>=
118 | meta(yahoonews[[1]])
119 | @
120 | 
121 | For a Yahoo! News \class{TextDocument} we get useful meta--data like
122 | \code{DateTimeStamp}, \code{Description}, \code{Heading}, \code{ID} and
123 | \code{Origin}. The main content, as specified in the \code{Origin} of a
124 | \class{TextDocument} can be examined as follows (shortened for output):
125 | 
126 | <<echo=F, eval=T>>=
127 | # Little hack to restrict output length
128 | content(yahoonews[[1]]) <- 
129 | 		paste(substring(yahoonews[[1]], 1, 100), "...", sep = "")
130 | @
131 | <<echo=T, eval=T>>=
132 | yahoonews[[1]]
133 | @
134 | 
135 | It has been extracted from an unstructured \proglang{HTML} page and freed from
136 | ads and sidebar content by \pkg{boilerpipeR}'s \fkt{DefaultExtractor}. To view the
137 | entire corpus main content also consider \fkt{inspect} (output omitted):
138 | 
139 | <<echo=T, eval=F>>=
140 | inspect(yahoonews)
141 | @
142 | 
143 | \section{Implemented Sources}
144 | \begin{table}[t]
145 |   \begin{center}
146 |     \input{tables/sources}
147 |   \end{center}
148 |   \caption{Overview of implemented \class{WebSources} listing the maximum number
149 |   of items per feed, a descriptive URL, if authentification is necessary (x
150 |   for yes) and the feed format.}
151 |   \label{tab:sources}
152 | \end{table}
153 | 
154 | All currently implemented (web--)sources are listed on Table~\ref{tab:sources}.
155 | The following commands show, how to use the implemented Sources. If available,
156 | the search query/stock ticker \code{Microsoft} has been used. Since Reuters News
157 | only offers a predefined number of channels we selected \code{businessNews}. 
158 | 
159 | <<echo=T, eval=F>>=
160 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
161 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft"))
162 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
163 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews"))
164 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT"))
165 | yahooinplay <- WebCorpus(YahooInplaySource())
166 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft"))
167 | @
168 | 
169 | \section{Extending/Updating Corpora}
170 | Most data feeds only contain 20--100 feed items. A text corpus of such a small
171 | size may not be sufficient for text mining purposes. For that reason,
172 | the \fkt{corpus.update} method has been implemented. In a nutshell, it first
173 | downloads a feed's meta data, checks which items are new (as determined by the meta--data
174 | ID field) and finally downloads the main content of new web documents. Since
175 | most time of \class{WebCorpus} construction is spend downloading the main content of
176 | corpus items, this procedures ensures a more efficient and faster
177 | \class{WebCorpus}--update. \\
178 | The Yahoo! News corpus can now simply be updated:
179 | 
180 | <<echo=T, eval=F>>=
181 | yahoonews <- corpus.update(yahoonews)
182 | @
183 | 
184 | To continously update a \class{WebCorpus} a scheduled task/cron job could be set
185 | up which runs \fkt{corpus.update} in a script.
186 | \newpage
187 | 
188 | \section{Conclusion}
189 | This vignette has given a short introduction to \pkg{tm.plugin.webmining}, a
190 | package to retrieve textual data from the web. Although
191 | \pkg{tm.plugin.webmining} has been tested for the retrieval of 10000+ items per
192 | feed it is generally not recommended to start massive feed downloads due to
193 | memory-- and \pkg{RCurl} restrictions. For this purpose, web scraping 
194 | frameworks like Scrapy (\url{scrapy.org}), Heritrix (\url{crawler.archive.org})
195 | or Nutch (\url{nutch.apache.org}) are much better suited.
196 | \\
197 | Keeping these issues in mind, \pkg{tm.plugin.webmining} is well suited for the
198 | retrieval and processing of small to medium sized text corpora. By using the
199 | full meta data and textual contents, quite interesting text mining experiments
200 | can be done using the full capabilities of the \pkg{tm} package. 
201 | 
202 | 
203 | \bibliographystyle{plainnat}
204 | \bibliography{references}
205 | 
206 | 
207 | 
208 | \end{document}
209 | 


--------------------------------------------------------------------------------
/R/extract.R:
--------------------------------------------------------------------------------
  1 | #' @title Extract main content from \code{TextDocument}s.
  2 | #' @description Use implemented extraction functions (through boilerpipeR) to extract main content from
  3 | #' \code{TextDocument}s.
  4 | #' @param x PlainTextDocument
  5 | #' @param extractor default extraction function to be used, defaults to \code{\link{extractContentDOM}}
  6 | #' @param ... additional parameters to extractor function
  7 | #' @export
  8 | #' @aliases extract.PlainTextDocument
  9 | extract <- function(x, extractor, ...) UseMethod("extract", x)
 10 | 
 11 | 
 12 | #' Extract Main Content from Text Documents
 13 | #' Use implemented extraction functions (through boilerpipeR) to extract main content from
 14 | #' \code{TextDocument}s.
 15 | #' @param x PlainTextDocument
 16 | #' @param extractor default extraction function to be used, defaults to \code{\link{extractContentDOM}}
 17 | #' @param ... additional parameters to extractor function
 18 | #' @importFrom NLP content
 19 | #' @noRd
 20 | #' @export
 21 | extract.PlainTextDocument <- function(x, extractor = extractContentDOM, ...){
 22 | 	content(x) <- tryCatch(extractor(x, ...),
 23 | 			error = function(e){
 24 | 				warning(e)
 25 | 				content(x)
 26 | 			})
 27 | 	x
 28 | } 
 29 | 
 30 | #' @title Simply strip HTML Tags from Document
 31 | #' @description \code{extractHTMLStrip} parses an url, character or filename, reads the DOM
 32 | #' tree, removes all HTML tags in the tree and outputs the source text without
 33 | #' markup.
 34 | #' @author Mario Annau
 35 | #' @param url character, url or filename
 36 | #' @param asText specifies if url parameter is a \code{character}, defaults to TRUE
 37 | #' @param encoding specifies local encoding to be used, depending on platform
 38 | #' @param ... Additional parameters for \code{\link{htmlTreeParse}} 
 39 | #' @seealso \code{\link{xmlNode}}
 40 | #' @importFrom XML htmlTreeParse toString.XMLNode xmlChildren xmlValue free
 41 | #' @seealso \code{\link{htmlTreeParse}} \code{\link{encloseHTML}}
 42 | #' @note Input text should be enclosed in <html>'TEXT'</html> tags to ensure correct
 43 | #' DOM parsing (issue especially under .Platform$os.type = 'windows')
 44 | #' @export
 45 | extractHTMLStrip <-
 46 | function(url, asText = TRUE, encoding, ...){
 47 | 	if(missing(encoding)){
 48 | 		encoding <- switch(.Platform$OS.type,
 49 | 				unix = "UTF-8",
 50 | 				windows = "latin1")
 51 | 	}	
 52 | 
 53 | 	if(url == ""){
 54 | 		return("")
 55 | 	}
 56 | 
 57 | 	parseerror <- capture.output(tree <- htmlTreeParse(url, asText = asText, 
 58 |                   useInternalNodes = TRUE, encoding = encoding, ...))
 59 | 	
 60 | 	children <- xmlChildren(tree)
 61 |   children <- children[!sapply(children, function(x) 
 62 |                       grepl("<!DOCTYPE", toString.XMLNode(x)))]
 63 | 	childlen <- sapply(children, function(x) nchar(toString.XMLNode(x)))
 64 | 	childidx <- max(which(childlen == max(childlen)))
 65 | 	#childidx <- min(childidx, length(children))
 66 | 	html <- children[[childidx]]
 67 | 	val <- xmlValue(html)
 68 | 	XML::free(tree)
 69 | 	return(val)
 70 | }
 71 | 
 72 | 
 73 | #' @title Extract Main HTML Content from DOM
 74 | #' @description Function extracts main HTML Content using its Document Object Model.
 75 | #' Idea comes basically from the fact, that main content of an HTML Document
 76 | #' is in a subnode of the HTML DOM Tree with a high text-to-tag ratio.
 77 | #' Internally, this function also calls 
 78 | #' \code{assignValues}, \code{calcDensity}, \code{getMainText} 
 79 | #' and \code{removeTags}.
 80 | #' @author Mario Annau
 81 | #' @param url character, url or filename
 82 | #' @param threshold threshold for extraction, defaults to 0.5
 83 | #' @param asText boolean, specifies if url should be interpreted as character
 84 | #' @param ... Additional Parameters to \code{\link{htmlTreeParse}}
 85 | #' @seealso \code{\link{xmlNode}}
 86 | #' @references 	\url{http://www.elias.cn/En/ExtMainText}, 
 87 | #' 				\url{http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/}
 88 | #' 				\cite{Gupta et al., DOM-based Content Extraction of HTML Documents},\url{http://www2003.org/cdrom/papers/refereed/p583/p583-gupta.html}
 89 | #' @importFrom XML xmlChildren
 90 | #' @importFrom XML toString.XMLNode
 91 | #' @importFrom XML htmlTreeParse
 92 | #' @aliases assignValues calcDensity removeTags getMainText
 93 | #' @export
 94 | extractContentDOM <-
 95 | function(url, threshold, asText = TRUE, ...){
 96 | 		
 97 | 		# FIXME: Hack because of roxygen2 bug (dot replaced by comma):
 98 | 		if(missing(threshold)){
 99 | 			threshold <- 0.5
100 | 		}
101 | 
102 | 		if(url == ""){
103 | 			return("")
104 | 		}
105 | 		
106 | 		parseerror <- capture.output(tree <- htmlTreeParse(url, asText = asText, useInternalNodes = TRUE, ...))
107 | 		childlen <- sapply(xmlChildren(tree), function(x) nchar(toString.XMLNode(x)))
108 | 		childidx <- which(childlen == max(childlen))
109 | 		html <- xmlChildren(tree)[[childidx]]
110 | 		tags <- c("script" , "noscript", "style")
111 | 		htmlclean <- removeTags(html, tags)
112 | 		
113 | 		htmlannotated <- assignValues(htmlclean, FUN = calcDensity, threshold)
114 | 		content <- getMainText(htmlannotated, threshold)
115 | 		return(content)
116 | }
117 | 
118 | #' Calculate density of html text to overall length of html tree text
119 | #' @author Mario Annau
120 | #' @param xn object of class xmlNode
121 | #' @param annotate Specifies if \code{xn} should be annotated, defaults to TRUE
122 | #' @seealso \code{\link{extractContentDOM}}, \code{\link{xmlNode}}
123 | #' @importFrom XML toString.XMLNode
124 | #' @importFrom XML xmlValue
125 | #' @importFrom XML addAttributes
126 | #' @noRd 
127 | calcDensity <-
128 | function(xn, annotate = TRUE){
129 | 	textlen <- nchar( xmlValue(xn))
130 | 	treelen <- nchar(toString.XMLNode(xn))
131 | 	dens <- textlen / treelen
132 | 	if(annotate & inherits(xn, "XMLInternalElementNode")){
133 | 		addAttributes(xn, "dens" = dens, "textlen" = textlen, "treelen" = treelen)
134 | 	}
135 | 	return(c(dens, textlen, treelen))
136 | }
137 | 
138 | #' Assign Values as Attributes to xmlNode
139 | #' @author Mario Annau
140 | #' @param t object of class xmlNode
141 | #' @param FUN Function to be executed
142 | #' @param threshold maximum threshold needed to step down the tree, defaults to 0.5
143 | #' @param attribname Name of used attribute, defaults to "attrib"
144 | #' @param recursive should tree be recursively annotated?, defaults to TRUE
145 | #' @param mintextlen minimum textlength needed to step down the tree
146 | #' @param ... additional arguments for FUN
147 | #' @seealso \code{\link{extractContentDOM}}, \code{\link{xmlNode}}
148 | #' @importFrom XML xmlApply
149 | #' @noRd 
150 | assignValues <-
151 | function(t, FUN, threshold, attribname = "attrib", recursive = TRUE, mintextlen = 10, ...){
152 | 	
153 | 	# FIXME: Hack because of roxygen2 bug (dot replaced by comma):
154 | 	if(missing(threshold)){
155 | 		threshold <- 0.5
156 | 	}
157 | 
158 | 	dens <- xmlApply(t, FUN)
159 | 	dens <- do.call("rbind", dens)
160 | 	#dens <- as.data.frame(dens)
161 | 	
162 | 	
163 | 	if(!recursive){
164 | 		return(t)
165 | 	}
166 | 	lapply(t[(dens[,2] > mintextlen) & (dens[,1] < threshold)], assignValues, FUN, ...)
167 | 	return(t)
168 | 	
169 | }
170 | #' Get Main Text from Annotated HTML Tree
171 | #' Main Text is obtained from Tree -Subnode where threshold > threshold and 
172 | #' textlength is at maximum
173 | #' @author Mario Annau
174 | #' @param xml object of class xmlNode
175 | #' @param threshold minimum threshold needed to be considered
176 | #' @seealso \code{\link{extractContentDOM}}, \code{\link{xmlNode}}
177 | #' @importFrom XML xpathSApply
178 | #' @importFrom XML xmlValue
179 | #' @noRd 
180 | getMainText <-
181 | function(xml, threshold){
182 | 	# FIXME: Hack because of roxygen2 bug (dot replaced by comma):
183 | 	if(missing(threshold)){
184 | 		threshold <- 0.5
185 | 	}
186 | 
187 | 	textlen <- as.numeric( xpathSApply(xml, path = "//attribute::textlen"))
188 | 	dens <- as.numeric( xpathSApply(xml, path = "//attribute::dens"))
189 | 	
190 | 	textlen[dens < threshold] <- 0
191 | 	idxmaintext <- which(textlen == max(textlen))
192 | 	if(max(textlen) == 0){
193 | 		return("")
194 | 	}
195 | 	
196 | 	content <-  xpathSApply(xml, path = paste("//*[@textlen][@dens]",sep = ""))[[idxmaintext]]
197 | 	
198 | 	cleancontent <-  xmlValue(content)
199 | 	cleancontent <- trimWhiteSpaces(cleancontent)
200 | 	
201 | 	return(cleancontent)
202 | }
203 | 
204 | #' Remove specified tags from (XML) Document Tree.
205 | #' Tags and all of its inner content will be removed.
206 | #' @author Mario Annau
207 | #' @param xmldoc xmlDoc object of class xmlDoc 
208 | #' @param tags character vector which specifies tags to remove
209 | #' @seealso \code{\link{extractContentDOM}}
210 | #' @export
211 | #' @importFrom XML getNodeSet
212 | #' @importFrom XML removeNodes
213 | #' @noRd 
214 | removeTags <-
215 | function(xmldoc, tags){
216 | 	#remove scripts tags
217 | 	xquery <- paste("//", tags, sep = "", collapse = " | ")
218 | 	scripts <-  getNodeSet(xmldoc, path = xquery)
219 | 	ret <- removeNodes(scripts , free = rep(FALSE, length(scripts)))
220 | 	removeTags <- xmldoc
221 | }
222 | 
223 | 
224 | 


--------------------------------------------------------------------------------
/R/reader.R:
--------------------------------------------------------------------------------
  1 | #' @title Read content from WebXMLSource/WebHTMLSource/WebJSONSource. 
  2 | #' @description \code{readWeb} is a FunctionGenerator which specifies content retrieval from a \code{\link{WebSource}} 
  3 | #' content elements. Currently, it is defined for XML, HTML and JSON feeds through \code{readWebXML},
  4 | #' \code{readWebHTML} and \code{readWebJSON}. Also content parsers (\code{xml_content}, \code{json_content})
  5 | #' need to be defined.
  6 | #' @param spec specification of content reader
  7 | #' @param doc document to be parsed
  8 | #' @param parser parser function to be used
  9 | #' @param contentparser content parser function to be used, see also \code{tm:::xml_content} or \code{json_content}
 10 | #' @param freeFUN function to free memory from parsed object (actually only relevant for XML and HTML trees)
 11 | #' @return FunctionGenerator
 12 | #' @importFrom tm FunctionGenerator PlainTextDocument
 13 | #' @aliases readWebXML readWebHTML readWebJSON json_content 
 14 | #' @export
 15 | readWeb <- FunctionGenerator(function(spec, doc, parser, contentparser, freeFUN = NULL) {
 16 | 			
 17 | 	parser <- parser
 18 | 	contentparser <- contentparser
 19 | 	freeFUN <- freeFUN
 20 | 	spec <- spec
 21 | 	doc <- doc
 22 | 
 23 | 	function(elem, language, id) {
 24 | 		tree <- parser(elem$content)
 25 | 	
 26 | 		###Set Content
 27 | 		content(doc) <- if ("content" %in% names(spec)){
 28 | 							content <- contentparser(tree, spec[["content"]])
 29 | 						}
 30 | 						else{
 31 | 							character(0)
 32 | 						}		
 33 | 
 34 | 		for (n in setdiff(names(spec), "content")){
 35 | 				meta(doc, n) <- contentparser(tree, spec[[n]])
 36 | 			}
 37 | 			
 38 | 			if(!is.null(freeFUN)){
 39 | 				freeFUN(tree)
 40 | 			}
 41 | 			doc
 42 | 		}
 43 | })
 44 | 
 45 | #' Read content from WebXMLSource
 46 | #' @param ... additional parameters to \code{\link{readWeb}}
 47 | #' @export
 48 | #' @importFrom XML xmlInternalTreeParse free
 49 | #' @noRd 
 50 | readWebXML <- function(...){
 51 | 	parser <- function(x){
 52 | 		#XML::xmlInternalTreeParse(x, asText = TRUE)
 53 | 		parse(x, type = "XML")
 54 | 	} 
 55 | 	contentparser <- xml_content
 56 | 	freeFUN <- free
 57 | 	readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...)
 58 | }
 59 | 
 60 | #' Read content from WebHTMLSource
 61 | #' @param ... additional parameters to \code{\link{readWeb}}
 62 | #' @export
 63 | #' @importFrom XML htmlTreeParse free
 64 | #' @noRd 
 65 | readWebHTML <- function(...){
 66 | 	#parser <- function(x) XML::htmlTreeParse(x, asText = TRUE, useInternalNodes = TRUE)
 67 | 	parser <- function(x) parse(x, type = "HTML", useInternalNodes = TRUE)
 68 | 	contentparser <- function(x, cspec) xml_content(x, cspec)
 69 | 	freeFUN <- free
 70 | 	readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...)
 71 | }
 72 | 
 73 | #' Read content from WebJSONSource
 74 | #' @param ... additional parameters to \code{\link{readWeb}}
 75 | #' @export
 76 | #' @noRd 
 77 | readWebJSON <- function(...){
 78 | 	parser <- function(x) identity(x)
 79 | 	contentparser <- function(x, cspec) json_content(x, cspec)
 80 | 	freeFUN <- rm
 81 | 	readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...)
 82 | }
 83 | 
 84 | #' Read content from XMLSource
 85 | #' @param doc list object from which content should be retrieved
 86 | #' @param spec list field name as character
 87 | #' @noRd
 88 | #' @importFrom XML xmlValue
 89 | xml_content <- function(doc, spec) {
 90 | 	type <- spec[[1]]
 91 | 	fun <- switch(type,
 92 | 			node = XML::xmlValue,
 93 | 			attribute = identity)
 94 | 	
 95 | 	if (identical(type, "unevaluated"))
 96 | 		spec[[2]]
 97 | 	else if (identical(type, "function") && is.function(spec[[2]]))
 98 | 		spec[[2]](doc)
 99 | 	else
100 | 		as.character(sapply(XML::getNodeSet(doc, spec[[2]]), fun))
101 | }
102 | 
103 | #' Read content from JSONSource
104 | #' @param doc list object from which content should be retrieved
105 | #' @param spec list field name as character
106 | #' @export
107 | #' @noRd 
108 | json_content <- 
109 | function (doc, spec) 
110 | {
111 | 	type <- spec[[1]]
112 | 	fun <- switch(type, field = identity, node = identity)
113 | 	if (identical(type, "unevaluated")) 
114 | 		spec[[2]]
115 | 	else if (identical(type, "function") && is.function(spec[[2]])) 
116 | 		spec[[2]](doc)
117 | 	else{
118 | 		as.character(sapply(doc[[spec[[2]]]], 
119 | 						fun))
120 | 	} 
121 | }
122 | 
123 | #' Read content from NYTimesSource
124 | #' @noRd
125 | #' @export
126 | readNYTimes <- readWebJSON(spec = list(
127 | 		author = list("field", c("byline", "original")),
128 | 		description = list("field", "snippet"),
129 | 		datetimestamp = list("function", function(node)
130 | 					strptime(node[["pub_date"]],
131 | 							format = "%Y-%m-%dT%H:%M:%SZ",
132 | 							tz = "EST")),
133 | 		heading = list("field", c("headline", "main")),
134 | 		origin = list("field", "web_url"),
135 | 		language = list("unevaluated", "en"),
136 | 		id = list("field", "_id")),
137 | 	doc = PlainTextDocument())
138 | 
139 | #' Read content from Google...Source
140 | #' @importFrom XML getNodeSet xmlValue
141 | #' @importFrom NLP meta<-
142 | #' @noRd
143 | #' @export
144 | readGoogle <- readWebXML(spec = list(
145 | 		heading = list("node", "//title"),
146 | 		datetimestamp = list("function", function(node){
147 | 					loc <- Sys.getlocale("LC_TIME")
148 | 					Sys.setlocale("LC_TIME", "C")
149 | 					val <- sapply(getNodeSet(node, "//pubDate"), xmlValue)
150 | 					time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT")
151 | 					Sys.setlocale("LC_TIME", loc)
152 | 					time
153 | 				}),
154 | 		origin = list("node", "//link"),
155 | 		description = list("function", function(node){
156 | 					val <- sapply(getNodeSet(node, "//item/description"), xmlValue)
157 | 					extractHTMLStrip(sprintf("<html>%s</html>", val), asText = T)
158 | 				}),
159 | 		id = list("node",  "//guid")),
160 | 	doc = PlainTextDocument())
161 | 
162 | #' Read content from Yahoo RSS Source
163 | #' @importFrom XML getNodeSet xmlValue
164 | #' @seealso \code{\link{YahooFinanceSource}}
165 | #' @noRd
166 | #' @export
167 | readYahoo <- readWebXML(spec = list(
168 | 		heading = list("node", "//title"),
169 | 		datetimestamp = list("function", function(node){
170 | 					loc <- Sys.getlocale("LC_TIME")
171 | 					Sys.setlocale("LC_TIME", "C")
172 | 					val <- sapply(getNodeSet(node, "//pubDate"), xmlValue)
173 | 					time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT")
174 | 					Sys.setlocale("LC_TIME", loc)
175 | 					time
176 | 				}),
177 | 		origin = list("node", "//link"),
178 | 		description = list("node", "//item/description"),
179 | 		id = list("node",  "//guid")),
180 | 	doc = PlainTextDocument())
181 | 
182 | #' Read content from Yahoo HTML Source
183 | #' @importFrom XML getNodeSet xmlValue
184 | #' @seealso \code{\link{YahooNewsSource}}
185 | #' @noRd
186 | #' @export
187 | readYahooHTML <- readWebHTML(spec = list(
188 |     heading = list("node", "//div[@class='compTitle']/h3[@class='title']/a"),
189 |     datetimestamp = list("function", function(node){
190 |                 loc <- Sys.getlocale("LC_TIME")
191 |                 Sys.setlocale("LC_TIME", "C")
192 |                 val <- sapply(getNodeSet(node, "//span[@class='tri fc-2nd ml-10']"), xmlValue)
193 |                 time <- strptime(val, format = "%b %d %H:%M %p",tz = "GMT")
194 |                 Sys.setlocale("LC_TIME", loc)
195 |                 time
196 |             }),
197 |     origin = list("attribute", "//div[@class='compTitle']/h3[@class='title']/a/@href"),
198 |     author = list("node", "//span[@class='cite']"),
199 |     description = list("node", "//div[@class='compText']/p"),
200 |     id = list("attribute", "//div[@class='compTitle']/h3[@class='title']/a/@href")),
201 | doc = PlainTextDocument())
202 | 
203 | #' Read content from YahooInplaySource
204 | #' @importFrom XML getNodeSet xmlValue
205 | #' @noRd
206 | #' @export
207 | readYahooInplay <- readWebHTML(spec = list(
208 | 		heading = list("node", "//b[1]"),
209 | 		id = list("node", "//b[1]"),
210 | 		content = list("node", "//p"),
211 | 		datetimestamp = list("function", function(node){
212 | 					val <- unlist(getNodeSet(node, "//b[1]", fun = xmlValue))
213 | 					substr(val, 1, regexpr("\\s", val)-1)
214 | 				}),
215 | 		ticker  = list("node", "//p/b/a")),
216 | 	doc = PlainTextDocument())
217 | 
218 | 
219 | 
220 | 
221 | #' Read content from ReutersNewsSource
222 | #' @importFrom XML getNodeSet xmlValue
223 | #' @noRd
224 | #' @export
225 | readReutersNews <- readWebXML(spec = list(
226 | 				heading = list("node", "//title"),
227 | 				datetimestamp = list("function", function(node){
228 | 							loc <- Sys.getlocale("LC_TIME")
229 | 							Sys.setlocale("LC_TIME", "C")
230 | 							val <- sapply(getNodeSet(node, "//pubDate"), xmlValue)
231 | 							time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT")
232 | 							Sys.setlocale("LC_TIME", loc)
233 | 							time
234 | 						}),
235 | 				origin = list("node", "//link"),
236 | 				description = list("function", function(node){
237 | 							val <- sapply(getNodeSet(node, "//item/description"), xmlValue)
238 | 							extractHTMLStrip(sprintf("<html>%s</html>", val), asText = T)
239 | 						}),
240 | 				id = list("node",  "//guid"),
241 | 				category = list("node", "//category")),
242 | 		doc = PlainTextDocument())
243 | 
244 | #' Read content from LiberationSource
245 | #' @importFrom XML getNodeSet xmlValue
246 | #' @importFrom NLP meta<-
247 | #' @noRd
248 | #' @export
249 | readLiberationSource <- readWebXML(spec = list(
250 |         heading = list("node", "//title"),
251 |         datetimestamp = list("function", function(node){
252 |                     loc <- Sys.getlocale("LC_TIME")
253 |                     Sys.setlocale("LC_TIME", "C")
254 |                     val <- sapply(getNodeSet(node, "//updated"), xmlValue)
255 |                     time <- strptime(val, format = "%Y-%m-%dT%H:%M:%S",tz = "GMT")
256 |                     Sys.setlocale("LC_TIME", loc)
257 |                     time
258 |                 }),
259 |         origin = list("attribute", "//link[1]/@href"),
260 |         author = list("node", "//author/name"),
261 |         description = list("function", function(node){
262 |                     val <- sapply(getNodeSet(node, "//summary"), xmlValue)
263 |                     extractHTMLStrip(sprintf("<html>%s</html>", val), asText = T)
264 |                 }),
265 |         id = list("node",  "//id"),
266 |         language = list("unevaluated", "fr")),
267 |     doc = PlainTextDocument())
268 | 


--------------------------------------------------------------------------------
/R/source.R:
--------------------------------------------------------------------------------
  1 | #' @title Read Web Content and respective Link Content from feedurls.
  2 | #' @description WebSource is derived from \code{\link[tm]{Source}}. In addition to calling the
  3 | #' base \code{\link[tm]{Source}} constructor function it also retrieves the specified
  4 | #' feedurls and pre--parses the content with the parser function.
  5 | #' The fields \code{$Content}, \code{$Feedurls} \code{$Parser} and \code{$CurlOpts} are finally
  6 | #' added to the \code{Source} object.
  7 | #' @author Mario Annau
  8 | #' @param feedurls urls from feeds to be retrieved
  9 | #' @param class class label to be assigned to \code{Source} object, defaults to "WebXMLSource"
 10 | #' @param reader function to be used to read content, see also \code{\link{readWeb}}
 11 | #' @param parser function to be used to split feed content into chunks, returns list of content elements
 12 | #' @param encoding specifies default encoding, defaults to 'UTF-8'
 13 | #' @param curlOpts a named list or CURLOptions object identifying the curl options for the handle. Type \code{listCurlOptions()} for all Curl options available.
 14 | #' @param postFUN function saved in WebSource object and called to retrieve full text content from feed urls 
 15 | #' @param retrieveFeedURL logical; Specify if feedurls should be downloaded first.
 16 | #' @param ... additional parameters passed to \code{WebSource} object/structure
 17 | #' @return WebSource
 18 | #' @export
 19 | #' @importFrom XML getNodeSet xmlValue
 20 | #' @importFrom RCurl curlOptions
 21 | WebSource <- function(feedurls, class = "WebXMLSource", reader, parser, encoding = "UTF-8",
 22 |         curlOpts = curlOptions(
 23 |                 followlocation = TRUE, 
 24 |                 maxconnects = 5,
 25 |                 maxredirs = 20,
 26 |                 timeout = 30,
 27 |                 connecttimeout = 30,
 28 |                 ssl.verifyhost = FALSE,
 29 |                 ssl.verifypeer = FALSE), 
 30 |         postFUN = NULL, retrieveFeedURL = TRUE, ...){
 31 | 
 32 | 	content_raw <- NULL
 33 | 	if(retrieveFeedURL) {
 34 | 		content_raw <- getURL(feedurls, .opts = curlOpts)
 35 | 	} else {
 36 | 		content_raw <- feedurls
 37 | 	}
 38 |   # Filter empty content
 39 |   content_raw <- content_raw[sapply(content_raw, nchar) > 0]
 40 |   content_parsed <- unlist(lapply(content_raw, parser), recursive = FALSE)
 41 |   structure(list(encoding = encoding, length = length(content_parsed), names = NA_character_,
 42 |               position = 0, reader = reader, content = content_parsed, feedurls = feedurls,
 43 |               parser = parser, curlOpts = curlOpts, postFUN = postFUN, retrieveFeedURL = retrieveFeedURL, ...), 
 44 |             class = unique(c(class, "WebSource", "SimpleSource")))
 45 | }
 46 | 
 47 | 
 48 | #' @title Update WebXMLSource/WebHTMLSource/WebJSONSource
 49 | #' @description Typically, update is called from \code{link{corpus.update}} and refreshes \code{$Content} in 
 50 | #' Source object.
 51 | #' @param x Source object to be updated
 52 | #' @export source.update
 53 | #' @aliases source.update.WebXMLSource source.update.WebHTMLSource source.update.WebJSONSource
 54 | source.update <- function(x){
 55 | 	UseMethod("source.update", x)	
 56 | }
 57 | 
 58 | #'update WebSource
 59 | #' @noRd
 60 | #' @export
 61 | source.update.WebXMLSource <- 
 62 | source.update.WebHTMLSource <- 
 63 | source.update.WebJSONSource <- 
 64 | function(x) {
 65 |   content_raw <- NULL
 66 | 	if(x$retrieveFeedURL) {
 67 |     content_raw <- getURL(x$feedurls, .opts = x$curlOpts)
 68 | 	} else {
 69 |     content_raw <- x$feedurls
 70 |   }
 71 |   # Filter empty content
 72 |   content_raw <- content_raw[sapply(content_raw, nchar) > 0]
 73 |   
 74 | 	content_parsed <- unlist(lapply(content_raw, x$parser), recursive = FALSE)
 75 | 	x$content <- content_parsed
 76 | 	x$position <- 0
 77 | 	x
 78 | }
 79 | 
 80 | #' @title Get feed Meta Data from Google Finance. 
 81 | #' @description Google Finance provides business and enterprise headlines for many companies. Coverage is 
 82 | #' particularly strong for US-Markets. However, only up to 20 feed items can be retrieved.
 83 | #' @author Mario Annau
 84 | #' @param query ticker symbols of companies to be searched for, see \url{http://www.google.com/finance}.
 85 | #' Please note that Google ticker symbols need to be prefixed with the exchange name, e.g. NASDAQ:MSFT
 86 | #' @param params additional query parameters
 87 | #' @param ... additional parameters to \code{\link{WebSource}}
 88 | #' @return WebXMLSource
 89 | #' @seealso \code{\link{WebSource}}
 90 | #' @export
 91 | #' @examples
 92 | #' \dontrun{
 93 | #' corpus <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT"))
 94 | #' }
 95 | #' @importFrom XML xmlInternalTreeParse
 96 | #' @importFrom XML xpathSApply
 97 | #' @importFrom XML getNodeSet
 98 | #' @importFrom XML xmlValue
 99 | #' @aliases readGoogle
100 | GoogleFinanceSource <- function(query, params = 
101 | 				list( 	hl= 'en', 
102 | 						q=query, 
103 | 						ie='utf-8', 
104 | 						start = 0, 
105 | 						num = 20, 
106 | 						output='rss'),...){
107 | 	feed <- "http://www.google.com/finance/company_news"
108 | 	parser <- function(cr){
109 | 		tree <- parse(cr, type = "XML", asText = FALSE)
110 | 		xpathSApply(tree, path = "//item")
111 | 	}
112 | 	fq <- feedquery(feed, params)
113 |   ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readGoogle, 
114 |       postFUN = getLinkContent, retrieveFeedURL = FALSE,...)
115 | 	ws
116 | }
117 | 
118 | #' @title Get feed data from Yahoo! Finance.
119 | #' @description Yahoo! Finance is a popular site which provides financial news and information. It is a large source
120 | #' for historical price data as well as financial news. Using the typical Yahoo! Finance ticker 
121 | #' news items can easily be retrieved. However, the maximum number of items is 20. 
122 | #' @author Mario Annau
123 | #' @param query ticker symbols of companies to be searched for, see \url{http://finance.yahoo.com/lookup}.
124 | #' @param params, additional query parameters, see \url{http://developer.yahoo.com/rss/}
125 | #' @param ... additional parameters to \code{\link{WebSource}}
126 | #' @return WebXMLSource
127 | #' @export
128 | #' @examples
129 | #' \dontrun{
130 | #' corpus <- WebCorpus(YahooFinanceSource("MSFT"))
131 | #' }
132 | #' @seealso \code{\link{WebSource}}
133 | #' @importFrom XML xmlInternalTreeParse
134 | #' @importFrom XML xpathSApply
135 | #' @importFrom XML getNodeSet
136 | #' @importFrom XML xmlValue
137 | #' @aliases readYahoo
138 | YahooFinanceSource <- function(query, params = 
139 | 				list(	s= query, 
140 | 						region = "US",
141 | 						lang = "en-US"), ...){
142 | 	feed <- "https://feeds.finance.yahoo.com/rss/2.0/headline"
143 | 	
144 | 	fq <- feedquery(feed, params)
145 | 	parser <- function(cr){
146 | 		tree <- parse(cr, type = "XML", asText = TRUE)
147 | 		xpathSApply(tree, path = "//item")
148 | 	}
149 | 	ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readYahoo, 
150 |       postFUN = getLinkContent, retrieveFeedURL = TRUE,  ...)
151 | 	ws
152 | }
153 | 
154 | #' @title Get feed data from Google News Search \url{http://news.google.com/}
155 | #' @description Google News Search is one of the most popular news aggregators on the web. News
156 | #' can be retrieved for any customized user query. Up to 30 can be retrieved per 
157 | #' request.
158 | #' @author Mario Annau
159 | #' @param query Google News Search query
160 | #' @param params, additional query parameters
161 | #' @param ... additional parameters to \code{\link{WebSource}}
162 | #' @return WebXMLSource
163 | #' @seealso \code{\link{WebSource}}
164 | #' @export
165 | #' @examples
166 | #' \dontrun{
167 | #' corpus <- WebCorpus(GoogleNewsSource("Microsoft"))
168 | #' }
169 | #' @importFrom XML xmlInternalTreeParse xpathSApply getNodeSet xmlValue newXMLNamespace
170 | GoogleNewsSource <- function(query, params = 
171 | 				list(	hl= 'en', 
172 | 						q = query, 
173 | 						ie='utf-8', 
174 | 						num = 30, 
175 | 						output='rss'), ...){
176 | 	feed <- "http://news.google.com/news"
177 | 	fq <- feedquery(feed, params)
178 | 	parser <- function(cr){
179 | 		tree <- parse(cr, type = "XML", asText = TRUE)
180 | 		nodes <- xpathSApply(tree, path = "//item")
181 | 		xmlns1 <- lapply(nodes, newXMLNamespace, "http://purl.org/dc/elements/1.1/", "dc")
182 | 		nodes
183 | 	}
184 | 	ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readGoogle,
185 |       postFUN = getLinkContent, retrieveFeedURL = TRUE, ...)
186 | 	ws
187 | }
188 | 
189 | #' @title Get feed data from Reuters News RSS feed channels. Reuters provides numerous feed 
190 | #' @description channels (\url{http://www.reuters.com/tools/rss}) which can be retrieved through RSS 
191 | #' feeds. Only up to 25 items can be retrieved---therefore an alternative retrieval
192 | #' through the Google Reader API (\code{link{GoogleReaderSource}}) could be considered.
193 | #' @author Mario Annau
194 | #' @param query Reuters News RSS Feed, see \url{http://www.reuters.com/tools/rss} for a list of all feeds provided. Note that only string after 'http://feeds.reuters.com/reuters/' must be given. Defaults to 'businessNews'.
195 | #' @param ... additional parameters to \code{\link{WebSource}}
196 | #' @return WebXMLSource
197 | #' @seealso \code{\link{WebSource}}
198 | #' @export
199 | #' @examples
200 | #' \dontrun{
201 | #' corpus <- WebCorpus(ReutersNewsSource("businessNews"))
202 | #' }
203 | #' @importFrom XML xmlInternalTreeParse xpathSApply getNodeSet xmlValue newXMLNamespace
204 | #' @aliases readReutersNews
205 | ReutersNewsSource <- function(query = 'businessNews', ...){
206 | 	feed <- "http://feeds.reuters.com/reuters"
207 | 	
208 | 	fq <- paste(feed, query, sep = "/")
209 | 	parser <- function(cr){
210 | 		tree <- parse(cr, type = "XML")
211 | 		nodes <- xpathSApply(tree, path = "//item")
212 | 		xmlns1 <- lapply(nodes, newXMLNamespace, "http://rssnamespace.org/feedburner/ext/1.0", "feedburner")
213 | 		nodes
214 | 	}
215 | 
216 | 	ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readReutersNews, 
217 |       postFUN = getLinkContent, ...)
218 | 	ws
219 | }
220 | 
221 | #' @title Get news data from Yahoo! News (\url{https://news.search.yahoo.com/search/}).
222 | #' @description Currently, only a maximum of 10 items can be retrieved.
223 | #' @author Mario Annau
224 | #' @param query words to be searched in Yahoo News, multiple words must be separated by '+'
225 | #' @param params, additional query parameters, see \url{http://developer.yahoo.com/rss/}
226 | #' @param ... additional parameters to \code{\link{WebSource}}
227 | #' @return WebXMLSource
228 | #' @export
229 | #' @examples
230 | #' \dontrun{
231 | #' corpus <- WebCorpus(YahooNewsSource("Microsoft"))
232 | #' }
233 | #' @seealso \code{\link{WebSource}}
234 | #' @importFrom XML xmlInternalTreeParse
235 | #' @importFrom XML xpathSApply
236 | #' @importFrom XML getNodeSet
237 | #' @importFrom XML xmlValue
238 | #' @aliases readYahooHTML
239 | YahooNewsSource <- function(query, params = 
240 | 				list(	p= query), ...){
241 | 	feed <- "https://news.search.yahoo.com/search"
242 | 	fq <- feedquery(feed, params)
243 | 	parser <- function(cr){
244 | 		tree <- parse(cr, type = "HTML", useInternalNodes = TRUE)
245 | 		xpathSApply(tree, path = "//div[contains(@class, 'NewsArticle')]")
246 | 	}
247 | 	ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readYahooHTML, 
248 |       postFUN = getLinkContent, ...)
249 | 	ws
250 | }
251 | 
252 | 
253 | #' @title Get feed data from NYTimes Article Search (\url{http://developer.nytimes.com/docs/read/article_search_api_v2}). 
254 | #' @description Excerpt from the website: "With the NYTimes Article Search API, you can search New York Times articles 
255 | #' from 1981 to today, retrieving headlines, abstracts, lead paragraphs, links to associated multimedia 
256 | #' and other article metadata. Along with standard keyword searching, the API also offers faceted searching. 
257 | #' The available facets include Times-specific fields such as sections, taxonomic classifiers and controlled 
258 | #' vocabulary terms (names of people, organizations and geographic locations)."
259 | #' Feed retrieval is limited to 1000 items (or 100 pages).
260 | #' @author Mario Annau
261 | #' @param query character specifying query to be used to search NYTimes articles
262 | #' @param n number of items, defaults to 100
263 | #' @param sleep integer; Seconds to sleep between feed retrieval.
264 | #' @param curlOpts CURLOptions; RCurl options used for feed retrieval.
265 | #' @param appid Developer App id to be used, obtained from \url{http://developer.nytimes.com/}
266 | #' @param params additional query parameters, specified as list, see \url{http://developer.nytimes.com/docs/read/article_search_api}
267 | #' @param ... additional parameters to \code{\link{WebSource}}
268 | #' @seealso \code{\link{WebSource}}, \code{\link{readNYTimes}} 
269 | #' @export
270 | #' @examples
271 | #' \dontrun{
272 | #' #nytimes_appid needs to be specified
273 | #' corpus <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
274 | #' }
275 | #' @export
276 | #' @importFrom RJSONIO fromJSON
277 | #' @importFrom boilerpipeR ArticleExtractor
278 | #' @aliases readNYTimes
279 | NYTimesSource <- function(query, n = 100, appid, 
280 |         sleep = 1, params = 
281 | 		list(	format="json",
282 | 				q = query,
283 | 				page = 0:(ceiling(n/10)-1),
284 | 				"api-key" = appid), 
285 |     curlOpts = curlOptions(	followlocation = TRUE, 
286 |         maxconnects = 10,
287 |         maxredirs = 10,
288 |         timeout = 30,
289 |         connecttimeout = 30), ...){
290 | 	feed <- "http://api.nytimes.com/svc/search/v2/articlesearch.json"
291 | 	fq <- feedquery(feed, params)
292 | 	
293 | 	parser <- function(cr){
294 | 		json <- parse(cr, type = "JSON")
295 | 		json$response$docs
296 | 	}
297 |   
298 |   count <- 10
299 |   start <- seq(1, length(fq), by = count)
300 |   end <- if(n < count) length(fq) else seq(count, length(fq), length.out = length(start))
301 | 
302 |   feedcontent <- sapply(1:length(start), function(i) {
303 |               fcontent <- getURL(fq[start[i]:end[i]], .opts = curlOpts)
304 |               Sys.sleep(sleep)
305 |               fcontent
306 |           })
307 |   
308 | 	ws <- WebSource(feedurls = feedcontent, class = "WebJSONSource", parser = parser, reader = readNYTimes, 
309 |       postFUN = getLinkContent, retrieveFeedURL = FALSE, ...)
310 | 	
311 | 	ws
312 | }
313 | 
314 | #' @title Get News from Yahoo Inplay.
315 | #' @description Yahoo Inplay lists a range of company news provided by Briefing.com. Since Yahoo Inplay
316 | #' does not provide a structured XML news feed, content is parsed directly from the HTML page.
317 | #' Therefore, no further Source parameters can be specified. The number of feed items per 
318 | #' request can vary substantially.  
319 | #' @author Mario Annau
320 | #' @param ... additional parameters to \code{\link{WebSource}}
321 | #' @return WebHTMLSource
322 | #' @export
323 | #' @examples
324 | #' \dontrun{
325 | #' corpus <- WebCorpus(YahooInplaySource())
326 | #' }
327 | #' @importFrom XML htmlTreeParse
328 | #' @importFrom XML xpathSApply
329 | #' @aliases readYahooInplay
330 | YahooInplaySource <- function(...){
331 | 	url <- "http://finance.yahoo.com/marketupdate/inplay"
332 | 	parser <- function(cr){
333 | 		tree <- parse(cr, useInternalNodes = T, type = "HTML")
334 | 		xp_expr = "//div[@class= 'body yom-art-content clearfix']/p"
335 | 		paragraphs = xpathSApply(tree, xp_expr)
336 | 	}
337 | 	
338 | 	ws <- WebSource(feedurls = url, class = "WebHTMLSource", parser = parser, reader = readYahooInplay, ...)
339 | 	ws
340 | }
341 | 
342 | #' @title Get news data from french Liberation News Paper (\url{http://rss.liberation.fr/rss}).
343 | #' @author Mario Annau
344 | #' @param query feed to be retrieved, defaults to 'latest'
345 | #' @param ... additional parameters to \code{\link{WebSource}}
346 | #' @return WebXMLSource
347 | #' @export
348 | #' @examples
349 | #' \dontrun{
350 | #' corpus <- WebCorpus(LiberationSource("latest"))
351 | #' }
352 | #' @seealso \code{\link{WebSource}}
353 | #' @importFrom XML xmlInternalTreeParse
354 | #' @importFrom XML xpathSApply
355 | #' @importFrom XML getNodeSet
356 | #' @importFrom XML xmlValue
357 | #' @aliases readLiberationSource
358 | LiberationSource <- function(query = "latest", ...){
359 |     fq <- paste("http://rss.liberation.fr/rss", query, sep = "/")
360 |     parser <- function(cr){
361 |         tree <- parse(cr, type = "XML", useInternalNodes = TRUE)
362 |         namespaces <- c(ns = "http://www.w3.org/2005/Atom")
363 |         xpathSApply(tree, "//ns:entry", namespaces = namespaces)
364 |     }
365 |     ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readLiberationSource, 
366 |             postFUN = getLinkContent, retrieveFeedURL = TRUE, ...)
367 |     ws
368 | }
369 | 
370 | #' @importFrom XML saveXML
371 | #' @noRd
372 | #' @export
373 | getElem.WebXMLSource <- 
374 | getElem.WebHTMLSource <- function(x) {
375 | 	list(content = saveXML(x$content[[x$position]]), linkcontent = NULL, uri = NULL)
376 | }
377 | 
378 | #' @noRd
379 | #' @export
380 | getElem.WebJSONSource <- function(x) {
381 | 	list(content = x$content[[x$position]], linkcontent = NULL, uri = NULL)
382 | }
383 | 


--------------------------------------------------------------------------------
/vignettes/references.bib:
--------------------------------------------------------------------------------
  1 | @inproceedings{kohlschuetter:webextract,
  2 |     abstract = {{In addition to the actual content Web pages consist of navigational elements, templates, and advertisements. This boilerplate text typically is not related to the main content, may deteriorate search precision and thus needs to be detected properly. In this paper, we analyze a small set of shallow text features for classifying the individual text elements in a Web page. We compare the approach to complex, state-of-the-art techniques and show that competitive accuracy can be achieved, at almost no cost. Moreover, we derive a simple and plausible stochastic model for describing the boilerplate creation process. With the help of our model, we also quantify the impact of boilerplate removal to retrieval performance and show significant improvements over the baseline. Finally, we extend the principled approach by straight-forward heuristics, achieving a remarkable detection accuracy.}},
  3 |     address = {New York, NY, USA},
  4 |     author = {Kohlsch\"{u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
  5 |     booktitle = {Proceedings of the third ACM international conference on Web search and data mining},
  6 |     citeulike-article-id = {8241255},
  7 |     citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1718542},
  8 |     citeulike-linkout-1 = {http://dx.doi.org/10.1145/1718487.1718542},
  9 |     doi = {10.1145/1718487.1718542},
 10 |     isbn = {978-1-60558-889-6},
 11 |     location = {New York, New York, USA},
 12 |     pages = {441--450},
 13 |     posted-at = {2010-11-23 07:02:43},
 14 |     priority = {2},
 15 |     publisher = {ACM},
 16 |     series = {WSDM '10},
 17 |     title = {{Boilerplate detection using shallow text features}},
 18 |     url = {http://code.google.com/p/boilerpipe/},
 19 |     year = {2010}
 20 | }
 21 | 
 22 | 
 23 | 
 24 | 
 25 | @inproceedings{Goog:MapReduce,
 26 |     abstract = {MapReduce is a programming model and an associated implementation for processing and generating large data sets.  Users specify a \_map\_ function that processes a key/value pair to generate a set of intermediate key/value pairs, and a \_reduce\_ function that merges all intermediate values associated with the same intermediate key.  Many real world tasks are expressible in this model, as shown in the paper. <P> Programs written in this functional style are automatically parallelized and executed on a large cluster of commodity machines. The run-time system takes care of the details of partitioning the input data, scheduling the program's execution across a set of machines, handling machine failures, and managing the required inter- machine communication.  This allows programmers without any experience with parallel and distributed systems to easily utilize the resources of a large distributed system.  <P> Our implementation of MapReduce runs on a large cluster of commodity machines and is highly scalable: a typical MapReduce computation processes many terabytes of data on thousands of machines. Programmers find the system easy to use: hundreds of MapReduce programs have been implemented and upwards of one thousand MapReduce jobs are executed on Google's clusters every day.     <P>},
 27 |     author = {Dean, Jeffrey and Ghemawat, Sanjay},
 28 |     citeulike-article-id = {430834},
 29 |     citeulike-linkout-0 = {http://www.usenix.org/events/osdi04/tech/dean.html},
 30 |     journal = {OSDI '04},
 31 |     keywords = {cluster, google, parallel},
 32 |     pages = {137--150},
 33 |     posted-at = {2008-03-27 02:27:59},
 34 |     priority = {3},
 35 |     title = {MapReduce: Simplified Data Processing on Large Clusters},
 36 |     url = {http://www.usenix.org/events/osdi04/tech/dean.html},
 37 |     year = {2008},
 38 |     booktitle = {MapReduce: Simplified Data Processing on Large Clusters}
 39 | }
 40 | 
 41 | @ARTICLE{Pang+Lee:08b,
 42 |   author = {Bo Pang and Lillian Lee},
 43 |   title = {Opinion mining and sentiment analysis},
 44 |   journal = {Foundations and Trends in Information Retrieval},
 45 |   year = {2008},
 46 |   volume = {2},
 47 |   pages = {1--135},
 48 |   number = {1-2}
 49 | }
 50 | 
 51 | 
 52 | @article{Msft:MapReduce,
 53 |     abstract = {Google's MapReduce programming model serves for processing large data sets in a massively parallel manner. We deliver the first rigorous description of the model including its advancement as Google's domain-specific language Sawzall. To this end, we reverse-engineer the seminal papers on MapReduce and Sawzall, and we capture our findings as an executable specification. We also identify and resolve some obscurities in the informal presentation given in the seminal papers. We use typed functional programming (specifically Haskell) as a tool for design recovery and executable specification. Our development comprises three components: (i) the basic program skeleton that underlies MapReduce computations; (ii) the opportunities for parallelism in executing MapReduce computations; (iii) the fundamental characteristics of Sawzall's aggregators as an advancement of the MapReduce approach. Our development does not formalize the more implementational aspects of an actual, distributed execution of MapReduce computations.},
 54 |     author = {Lammel, Ralf},
 55 |     citeulike-article-id = {2152671},
 56 |     citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1290549.1290812},
 57 |     citeulike-linkout-1 = {http://dx.doi.org/10.1016/j.scico.2007.07.001},
 58 |     citeulike-linkout-2 = {http://linkinghub.elsevier.com/retrieve/pii/S0167642307001281},
 59 |     citeulike-linkout-3 = {http://www.sciencedirect.com/science/article/B6V17-4P718HK-1/2/77f5109e6e40c6c24df92250b314c2f1},
 60 |     doi = {10.1016/j.scico.2007.07.001},
 61 |     journal = {Science of Computer Programming},
 62 |     month = {January},
 63 |     number = {1},
 64 |     pages = {1--30},
 65 |     posted-at = {2009-09-08 04:26:54},
 66 |     priority = {2},
 67 |     title = {Google's MapReduce programming model -- Revisited},
 68 |     url = {http://dx.doi.org/10.1016/j.scico.2007.07.001},
 69 |     volume = {70},
 70 |     year = {2008}
 71 | }
 72 | 
 73 | @inproceedings{Theu:RHadoop,
 74 |     author = {Theussl, Stefan},
 75 |     booktitle = {Computational Finance and Financial Engineering, Second R/Rmetrics User and Developer Workshop},
 76 |     year = {2009},
 77 |     month = {June},
 78 |     address = {Meielisalp, Lake Thune, Switzerland},
 79 |     keywords = {cluster, google, parallel},
 80 |     title = {Simple Parallel Computing in R Using Hadoop},
 81 |     url = {http://www.rmetrics.org/Meielisalp2009/Presentations/Theussl1.pdf}
 82 | }
 83 | 
 84 | 
 85 | @inproceedings{Theu:RParallel,
 86 |     author = {Theussl, Stefan},
 87 |     booktitle = {Computational Finance and Financial Engineering, Second R/Rmetrics User and Developer Workshop},
 88 |     year = {2008},
 89 |     month = {June},
 90 |     address = {Meielisalp, Lake Thune, Switzerland},
 91 |     keywords = {cluster, parallel, r},
 92 |     title = {Getting the most out of your CPUs: Parallel computing strategies in R},
 93 |     url = {http://www.rmetrics.org/Meielisalp2008/Presentations/Theussl1.pdf},
 94 |   lastchecked = {\today}
 95 | }
 96 | 
 97 | @webpage{Feinerer:TM,
 98 |   author = {Feinerer,Ingo},
 99 |   title = "tm: Text Mining Package",
100 |   url = "http://cran.r-project.org/web/packages/tm/index.html",
101 |   lastchecked = {\today}
102 | }
103 | 
104 | @article{Bharat:Rank,
105 |  author = {Bharat, Krishna and Mihaila, George A.},
106 |  title = {When experts agree: using non-affiliated experts to rank popular topics},
107 |  journal = {ACM Trans. Inf. Syst.},
108 |  volume = {20},
109 |  issue = {1},
110 |  month = {January},
111 |  year = {2002},
112 |  issn = {1046-8188},
113 |  pages = {47--58},
114 |  numpages = {12},
115 |  url = {http://doi.acm.org/10.1145/503104.503107},
116 |  doi = {http://doi.acm.org/10.1145/503104.503107},
117 |  acmid = {503107},
118 |  publisher = {ACM},
119 |  address = {New York, NY, USA},
120 |  keywords = {WWW search, authorities, connectivity, host affiliation, link analysis, ranking, topic experts},
121 | } 
122 | 
123 | 
124 | @webpage{Apache:Hadoop,
125 |   author = {Apache, Software Foundation},
126 |   title = "Hadoop",
127 |   url = "http://hadoop.apache.org/",
128 |   year = 2011,
129 |   lastchecked = {\today}
130 | }
131 | 
132 | @webpage{Spotlight,
133 |   author = {Reuters Labs},
134 |   title = "Reuters Spotlight",
135 |   url = "http://spotlight.reuters.com",
136 |   year = 2011,
137 |   lastchecked = {\today}
138 | }
139 | 
140 | @webpage{SK:RGrowth,
141 |   author = {Reader SK, Revolution Analytics},
142 |   title = "R's exponential package growth, ctd.",
143 |   url = "http://blog.revolutionanalytics.com/2010/01/rs-exponential-package-growth-ctd.html",
144 |   year = 2010,
145 |   month = 1,
146 |   day = 7,
147 |   lastchecked = {\today}
148 | }
149 | 
150 | @article{fama:EMH,
151 |     author = {Fama, Eugene F.},
152 |     citeulike-article-id = {1571390},
153 |     citeulike-linkout-0 = {http://dx.doi.org/10.2307/2350752},
154 |     citeulike-linkout-1 = {http://www.jstor.org/stable/2350752},
155 |     doi = {10.2307/2350752},
156 |     issn = {00219398},
157 |     journal = {The Journal of Business},
158 |     keywords = {behavior, stock-market},
159 |     number = {1},
160 |     pages = {34--105},
161 |     posted-at = {2008-09-23 23:37:46},
162 |     priority = {2},
163 |     publisher = {The University of Chicago Press},
164 |     title = {{The Behavior of Stock-Market Prices}},
165 |     url = {http://dx.doi.org/10.2307/2350752},
166 |     volume = {38},
167 |     year = {1965}
168 | }
169 | 
170 | @article{fama:EMH2,
171 |     author = {Fama, Eugene F.},
172 |     citeulike-article-id = {1485929},
173 |     citeulike-linkout-0 = {http://dx.doi.org/10.2307/2325486},
174 |     citeulike-linkout-1 = {http://www.jstor.org/stable/2325486},
175 |     doi = {10.2307/2325486},
176 |     issn = {00221082},
177 |     journal = {The Journal of Finance},
178 |     keywords = {depaper, efficient, hypothesis, market},
179 |     number = {2},
180 |     pages = {383--417},
181 |     posted-at = {2008-05-25 20:16:01},
182 |     priority = {2},
183 |     publisher = {Blackwell Publishing for the American Finance Association},
184 |     title = {{Efficient Capital Markets: A Review of Theory and Empirical Work}},
185 |     url = {http://dx.doi.org/10.2307/2325486},
186 |     volume = {25},
187 |     year = {1970}
188 | }
189 | 
190 | @book{PangLee:Opinion,
191 |     abstract = {{An important part of our information-gathering behavior has always been to find out what other people think. With the growing availability and popularity of opinion-rich resources such as online review sites and personal blogs, new opportunities and challenges arise as people can, and do, actively use information technologies to seek out and understand the opinions of others. The sudden eruption of activity in the area of opinion mining and sentiment analysis, which deals with the computational treatment of opinion, sentiment, and subjectivity in text, has thus occurred at least in part as a direct response to the surge of interest in new systems that deal directly with opinions as a first-class object. Opinion Mining and Sentiment Analysis covers techniques and approaches that promise to directly enable opinion-oriented information-seeking systems. The focus is on methods that seek to address the new challenges raised by sentiment-aware applications, as compared to those that are already present in more traditional fact-based analysis. The survey includes an enumeration of the various applications, a look at general challenges and discusses categorization, extraction and summarization. Finally, it moves beyond just the technical issues, devoting significant attention to the broader implications that the development of opinion-oriented information-access services have: questions of privacy, vulnerability to manipulation, and whether or not reviews can have measurable economic impact. To facilitate future work, a discussion of available resources, benchmark datasets, and evaluation campaigns is also provided. Opinion Mining and Sentiment Analysis is the first such comprehensive survey of this vibrant and important research area and will be of interest to anyone with an interest in opinion-oriented information-seeking systems.}},
192 |     author = {Pang, Bo and Lee, Lillian},
193 |     citeulike-article-id = {3481153},
194 |     day = {08},
195 |     howpublished = {Paperback},
196 |     isbn = {1601981503},
197 |     keywords = {information-retrieval, review, sentiment-analysis},
198 |     month = jul,
199 |     posted-at = {2009-09-20 21:20:23},
200 |     priority = {4},
201 |     publisher = {Now Publishers Inc},
202 |     title = {{Opinion Mining and Sentiment Analysis}},
203 |     url = {http://www.cs.cornell.edu/home/llee/opinion-mining-sentiment-analysis-survey.html},
204 |     year = {2008}
205 | }
206 | 
207 |  
208 | 
209 | @article{hornik:Feinerer+Hornik+Meyer:2008,
210 |   author = {Ingo Feinerer and Kurt Hornik and David Meyer},
211 |   title = {Text Mining Infrastructure in {R}},
212 |   journal = {Journal of Statistical Software},
213 |   volume = 25,
214 |   number = 5,
215 |   pages = {1--54},
216 |   day = 10,
217 |   month = 2,
218 |   year = 2008,
219 |   coden = {JSSOBK},
220 |   issn = {1548-7660},
221 |   url = {http://www.jstatsoft.org/v25/i05},
222 |   accepted = {2008-02-10},
223 |   submitted = {2007-09-05},
224 |   file = {Feinerer+Hornik+Meyer_j=JSS_y=2008.pdf}
225 |   
226 | }
227 |  
228 | @MISC{AlpertHajaj:GoogleBigWeb,
229 | author = {Jesse Alpert and Nissan Hajaj},
230 | title = {We knew the web was big... - The Official Google Blog.},
231 | year = {2008},
232 | month = {7}, 
233 | day = {25},
234 | url = { http://googleblog.blogspot.com/2008/07/we-knew-web-was-big.html}
235 | }
236 | 
237 | @MISC{Elias:ExtMainText,
238 | author = {Jinliang Song},
239 | title = {ExtMainText - Extract main text from html document},
240 | year = {2010},
241 | url = { http://www.elias.cn/En/ExtMainText}
242 | }
243 | 
244 | @MISC{AIDepot:ExtractHTMLEasy,
245 | author = {alexjc},
246 | title = {The Easy Way to Extract Useful Text from Arbitrary HTML},
247 | year = {2007},
248 | url = { http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/}
249 | }
250 | 
251 | @MISC{Snowball:Snowball,
252 | 	author = "Martin Porter",
253 | 	title = "Snowball:Snowball", 
254 | 	url = "http://snowball.tartarus.org",
255 | 	year = {2010},
256 | 	lastchecked = {\today}
257 | }
258 | 
259 | 
260 | @MISC{YahooFinance:RSSIndesx,
261 | 	author = "Yahoo!, Finance",
262 | 	title = "RSS Feeds", 
263 | 	url = "http://finance.yahoo.com/rssindex",
264 | 	year = 2011,
265 | 	lastchecked = {\today}
266 | }
267 | 
268 | @MISC{YahooFinance:RSSAPI,
269 | 	author = "Yahoo!, Finance",
270 | 	title = "Company News RSS Feed", 
271 | 	url = "http://developer.yahoo.com/finance/company.html",
272 | 	year = 2011,
273 | 	lastchecked = {\today}
274 | }
275 | 
276 | 
277 | @MISC{Gupta03dom-basedcontent,
278 |     author = {Suhit Gupta and Gail Kaiser and David Neistadt and Peter Grimm},
279 |     title = {DOM-based Content Extraction of HTML Documents},
280 |     year = {2003}
281 | }
282 | 
283 | @PhdThesis{Gott:ContentExtraction,
284 |   author = 	 {Thomas Gottron},
285 |   title = 	 {Content Extraction: Identifying the Main content in HTML Documents},
286 |   school = 	 {Johannes Gutenberg-Universität},
287 |   year = 	 {2008},
288 |   OPTaddress = 	 {Mainz, Germany},
289 | }
290 | 
291 | @TechReport{zhang:FinAnalysisUsingNewsPaperSurvey,
292 |   author = 	 {Wenbin Zhang and Steven Skiena},
293 |   title = 	 {Financial Analysis Using News Data},
294 |   institution =  {Department of Computer Science, Stony Brook University},
295 |   year = 	 {2008},
296 |   address = 	 {Stony Brook, NY 11794-4400 USA}
297 | }
298 | 
299 | @book{chambers:GuidetoS,
300 |     abstract = {{Here is a thorough and authoritative guide to the latest version of the S language and to its programming environment, the premier software platform for computing with data. Programming with Data describes a new and greatly extended version of S, and is written by the chief designer of the language. The book is a guide to the complete programming process, starting from simple, interactive use and continuing through ambitious software projects.S is designed for computing with data - for any project in which organizing, visualizing, summarizing, or modeling data is a central concern. Its focus is on the needs of the programmer/user, and its goal is "to turn ideas into software, quickly and faithfully." S is a functional, object-based language with a huge library of functions for all aspects of computing with data. Its long and enthusiastic use in statistics and applied fields has also led to many valuable libraries of user-written functions.The new version of S provides a powerful class/method structure, new techniques to deal with large objects, extended interfaces to other languages and files, object-based documentation compatible with HTML, and powerful new interactive programming techniques. This version of S underlies the S-Plus system, versions 5.0 and higher.John Chambers has been a member of the technical staff in research at Bell Laboratories since 1966. In 1977, he became the first statistician to be named a Bell Labs Fellow, cited for "pioneering contributions to the field of statistical computing." His research has touched on nearly all aspects of computing with data, but he is best known for the design of the S language. He is the author or co-author of seven books on S, on computational methods, and on graphical methods; and he is a Fellow of the American Statistical Association and the American Association for the Advancement of Science.}},
301 |     author = {Chambers, John M.},
302 |     citeulike-article-id = {699469},
303 |     citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&amp;path=ASIN/0387985034},
304 |     citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&amp;path=ASIN/0387985034},
305 |     citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&amp;path=ASIN/0387985034},
306 |     citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/0387985034},
307 |     citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/0387985034/citeulike00-21},
308 |     citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0387985034},
309 |     citeulike-linkout-6 = {http://www.worldcat.org/isbn/0387985034},
310 |     citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN0387985034},
311 |     citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=0387985034\&index=books\&linkCode=qs},
312 |     citeulike-linkout-9 = {http://www.librarything.com/isbn/0387985034},
313 |     day = {19},
314 |     edition = {Corrected},
315 |     howpublished = {Paperback},
316 |     isbn = {0387985034},
317 |     month = jun,
318 |     posted-at = {2006-06-17 23:54:54},
319 |     priority = {2},
320 |     publisher = {Springer},
321 |     title = {{Programming with Data: A Guide to the S Language}},
322 |     url = {http://www.worldcat.org/isbn/0387985034},
323 |     year = {1998}
324 | }
325 | 
326 | @article{R:Ihaka+Gentleman:1996,
327 |   author = {Ross Ihaka and Robert Gentleman},
328 |   title = {R: A Language for Data Analysis and Graphics},
329 |   journal = {Journal of Computational and Graphical Statistics},
330 |   year = 1996,
331 |   volume = 5,
332 |   number = 3,
333 |   pages = {299--314},
334 |   url = {http://www.amstat.org/publications/jcgs/}
335 | }
336 | 
337 | 
338 | 
339 | @article{ tetlock:MediaStockMarket,
340 |   type={Accepted Paper Series},
341 |   title={{Giving Content to Investor Sentiment: The Role of Media in the Stock Market}},
342 |   author={Tetlock, Paul C. },
343 |   journal={Journal of Finance},
344 |   publisher={SSRN},
345 |   year = {2007},
346 |   doi={10.2139/ssrn.685145},
347 |   keywords={Investor sentiment, financial news media, content analysis, efficient markets},
348 |   location={http://ssrn.com/paper=685145},
349 |   language={English}
350 | }
351 | 
352 | @inproceedings{Godbole+Srinivasaiah+Skiena:07a,
353 |   author = {Namrata Godbole and Manjunath Srinivasaiah and Steven Skiena},
354 |   booktitle = {Proceedings of the International Conference on Weblogs and Social
355 | 	Media (ICWSM)},
356 |   interhash = {db9c97e105d4387821aa7b404cbeb04a},
357 |   intrahash = {b67e0f2a90a04960e14ea8453134ecb5},
358 |   title = {Large-Scale Sentiment Analysis for News and Blogs},
359 |   year = 2007,
360 |   keywords = {analysis mining opinion sentiment},
361 |   added-at = {2009-03-18T13:40:43.000+0100},
362 |   biburl = {http://www.bibsonomy.org/bibtex/2b67e0f2a90a04960e14ea8453134ecb5/om}
363 | }
364 | 
365 | @PHDTHESIS{Gottron:2008e,
366 |   author = {Thomas Gottron},
367 |   title = {Content Extraction: Identifying the Main Content in HTML Documents},
368 |   school = {Johannes Gutenberg-University, Mainz},
369 |   year = {2008},
370 |   owner = {gotti},
371 |   timestamp = {2009.04.24}
372 | }
373 | 
374 | 
375 | @inproceedings{DBLP:conf/icwsm/ZhangS10,
376 |   author    = {Wenbin Zhang and
377 |                Steven Skiena},
378 |   title     = {Trading Strategies to Exploit Blog and News Sentiment},
379 |   booktitle = {ICWSM},
380 |   year      = {2010},
381 |   ee        = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM10/paper/view/1529},
382 |   crossref  = {DBLP:conf/icwsm/2010},
383 |   bibsource = {DBLP, http://dblp.uni-trier.de}
384 | }
385 | 
386 | @proceedings{DBLP:conf/icwsm/2010,
387 |   editor    = {William W. Cohen and
388 |                Samuel Gosling},
389 |   title     = {Proceedings of the Fourth International Conference on Weblogs
390 |                and Social Media, ICWSM 2010, Washington, DC, USA, May 23-26,
391 |                2010},
392 |   booktitle = {ICWSM},
393 |   publisher = {The AAAI Press},
394 |   year      = {2010},
395 |   bibsource = {DBLP, http://dblp.uni-trier.de}
396 | }
397 | 
398 | @Book{NLTK,
399 |   author =	 {Steven Bird and Ewan Klein and Edward Loper},
400 |   title = 	 {{How people learn: Brain, mind, experience, and school}},
401 |   publisher = 	 {O'Reilly Media},
402 |   year = 	 2009,
403 |   address =	 {1005 Gravenstein Highwsay North, Sebastopol, CA 95472},
404 |   edition =	 {1},
405 |   url =		 {http://www.nltk.org/book}
406 | }
407 | 
408 | @MISC{Fielding96t.berners-lee,
409 |     author = {Tim Berners-Lee and R. Fielding and J. Gettys Dec and J. C. Mogul},
410 |     title = {T. Berners-Lee, MIT/LCS},
411 |     year = {1996}
412 | }
413 | 
414 | 
415 | @MISC{GeneralInquirer,
416 |     author = {Philip Stone},
417 |     title = {The General Inquirer Home Page},
418 |     year = {2006}
419 | }
420 | 
421 | 
422 | 
423 | @webpage{Reuters:Newsscope,
424 |   author = "Thomson Reuters",
425 |   title = "Newsscope",
426 |   url = "http://thomsonreuters.com/products_services/financial/financial_products/event_driven_trading/newsscope_archive",
427 |   lastchecked = {\today}
428 | }
429 | 
430 | @webpage{RPack:RCurl,
431 |   author = "Duncan Temple Lang",
432 |   title = "The RCurl Package",
433 |   url = "http://www.omegahat.org/RCurl/",
434 |   lastchecked = {\today}
435 | }
436 | 
437 | @webpage{RPack:PerformanceAnalytics,
438 |   author = "Peter Carl and Brian G. Peterson",
439 |   title = "PerformanceAnalytics: Econometric tools for performance and risk analysis",
440 |   url = "http://cran.r-project.org/web/packages/PerformanceAnalytics/",
441 |   year = 2010,
442 |   month = 9,
443 |   day = 15,
444 |   lastchecked = {\today}
445 | }
446 | 
447 |   @Book{fPortfolio,
448 |     title = {Portfolio Optimization with R/Rmetrics},
449 |     author = {Diethelm Wuertz and Yohan Chalabi and William Chen and
450 |       Andrew Ellis},
451 |     year = {2010},
452 |     month = {April},
453 |     editor = {{Wuertz} and {Diethelm} and {Hanf} and {Martin}},
454 |     publisher = {Rmetrics Association & Finance Online,
455 |       www.rmetrics.org},
456 |     note = {R package version 2130.80},
457 |   }
458 | 
459 |   @Book{Achelis:TechAnal,
460 |     title = {Technical Analysis from A to Z},
461 |     author = {Steven Achelis},
462 |     year = {2000},
463 |     month = {October},
464 |     publisher = {McGraw-Hill; 2 edition},
465 |     isbn = {0071363483}
466 |   }
467 |   
468 | @webpage{hedgefundtwitter,
469 |   author = "Jack Jordan",
470 |   title = "Hedge Fund Will Track Twitter to Predict Stock Moves",
471 |   url = "http://www.bloomberg.com/news/2010-12-22/hedge-fund-will-track-twitter-to-predict-stockmarket-movements.html",
472 |   year = 2010,
473 |   month = 12,
474 |   day = 22,
475 |   lastchecked = {\today}
476 | }
477 | 
478 | @webpage{universalfeedparser,
479 |   author = "Mark Pilgrim",
480 |   title = "Universal Feed Parser",
481 |   url = "http://feedparser.org/docs/",
482 |   year = 2006,
483 |   month = 01,
484 |   day = 10,
485 |   lastchecked = {\today}
486 | }
487 | 
488 | @webpage{simplejson,
489 |   author = "Bob Ippolito",
490 |   title = "simplejson 2.1.5",
491 |   url = "http://pypi.python.org/pypi/simplejson/",
492 |   year = 2011,
493 |   month = 04,
494 |   day = 17,
495 |   lastchecked = {\today}
496 | }
497 | 
498 | @book{oliphant06guide,
499 |     author = {Oliphant, T. E.},
500 |     booktitle = {Guide to NumPy},
501 |     citeulike-article-id = {2515650},
502 |     posted-at = {2008-03-11 16:41:13},
503 |     priority = {2},
504 |     publisher = {Trelgol Publishing},
505 |     title = {{Guide to NumPy}},
506 |     year = {2006}
507 | }
508 | 
509 | @article{matplotlib,
510 |     abstract = {{Matplotlib is a 2D graphics package for Python for application development, interactive scripting, and publication-quality image generation across user interfaces and operating systems.}},
511 |     address = {Los Alamitos, CA, USA},
512 |     author = {Hunter, John D.},
513 |     booktitle = {Computing in Science \& Engineering},
514 |     citeulike-article-id = {2878517},
515 |     citeulike-linkout-0 = {http://doi.ieeecomputersociety.org/10.1109/MCSE.2007.55},
516 |     citeulike-linkout-1 = {http://dx.doi.org/10.1109/MCSE.2007.55},
517 |     citeulike-linkout-2 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4160265},
518 |     doi = {10.1109/MCSE.2007.55},
519 |     issn = {1521-9615},
520 |     journal = {Computing in Science and Engineering},
521 |     keywords = {evaluation, python},
522 |     number = {3},
523 |     pages = {90--95},
524 |     posted-at = {2009-03-26 14:48:42},
525 |     priority = {2},
526 |     publisher = {IEEE Computer Society},
527 |     title = {{Matplotlib: A 2D Graphics Environment}},
528 |     url = {http://dx.doi.org/10.1109/MCSE.2007.55},
529 |     volume = {9},
530 |     year = {2007}
531 | }
532 | 
533 | @inproceedings{boilerpipe,
534 |     abstract = {{In addition to the actual content Web pages consist of navigational elements, templates, and advertisements. This boilerplate text typically is not related to the main content, may deteriorate search precision and thus needs to be detected properly. In this paper, we analyze a small set of shallow text features for classifying the individual text elements in a Web page. We compare the approach to complex, state-of-the-art techniques and show that competitive accuracy can be achieved, at almost no cost. Moreover, we derive a simple and plausible stochastic model for describing the boilerplate creation process. With the help of our model, we also quantify the impact of boilerplate removal to retrieval performance and show significant improvements over the baseline. Finally, we extend the principled approach by straight-forward heuristics, achieving a remarkable detection accuracy.}},
535 |     address = {New York, NY, USA},
536 |     author = {Kohlsch\"{u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
537 |     booktitle = {Proceedings of the third ACM international conference on Web search and data mining},
538 |     citeulike-article-id = {8241255},
539 |     citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1718542},
540 |     citeulike-linkout-1 = {http://dx.doi.org/10.1145/1718487.1718542},
541 |     doi = {10.1145/1718487.1718542},
542 |     isbn = {978-1-60558-889-6},
543 |     location = {New York, New York, USA},
544 |     pages = {441--450},
545 |     posted-at = {2010-11-23 07:02:43},
546 |     priority = {2},
547 |     publisher = {ACM},
548 |     series = {WSDM '10},
549 |     title = {{Boilerplate detection using shallow text features}},
550 |     url = {http://dx.doi.org/10.1145/1718487.1718542},
551 |     year = {2010}
552 | }
553 | 
554 | 
555 | 
556 | 
557 | @misc{scipy,
558 |     author = {Jones, Eric and Oliphant, Travis and Peterson, Pearu and Others},
559 |     citeulike-article-id = {3398487},
560 |     citeulike-linkout-0 = {http://www.scipy.org/},
561 |     keywords = {python, scipy},
562 |     posted-at = {2009-07-23 14:10:37},
563 |     priority = {2},
564 |     title = {{SciPy: Open source scientific tools for Python}},
565 |     url = {http://www.scipy.org/},
566 |     year = {2001}
567 | }
568 | 
569 | @misc{mlpy,
570 |     author = {Davide Albanese and Giuseppe Jurman and Roberto Visintainer},
571 |     title = {{mlpy Documentation}},
572 |     url = {https://mlpy.fbk.eu/data/mlpy.pdf},
573 |     year = {2010}
574 | }
575 | 
576 | 
577 | 
578 | @misc{BehaveIntro,
579 |   author = "Martin Swell",
580 |   title = "Introduction to Behavioural Finance",
581 |   url = "http://www.behaviouralfinance.net/behavioural-finance.pdf",
582 |   year = 2010,
583 |   month = 4,
584 |   day = 14,
585 |   lastchecked = {\today}
586 | }
587 | 
588 | @book{Pareto:Homo,
589 |   address = {Padova},
590 |   author = {Vilfredo Pareto},
591 |   booktitle = {Manuale Di Economia Politica. Con Una Introduzione Alla Scienza Sociale},
592 |   interhash = {a91d7162f83db6f54698ade6de04f0bb},
593 |   intrahash = {a4fda646bd4bf1e8d58442b790126bb1},
594 |   pages = {404 p.},
595 |   publisher = {CEDAM},
596 |   title = {Manuale di economia politica. Con una introduzione alla scienza sociale (1974)},
597 |   year = {1906},
598 |   date-modified = {2010-02-28 21:15:22 -0500},
599 |   keywords = {economic economy political},
600 |   added-at = {2010-03-02T17:25:53.000+0100},
601 |   biburl = {http://www.bibsonomy.org/bibtex/2a4fda646bd4bf1e8d58442b790126bb1/jrennstich},
602 |   language = {Italian}
603 | }
604 | 
605 | @InProceedings{Pang+Lee+Vaithyanathan:02a,
606 |   author =       {Bo Pang and Lillian Lee and Shivakumar Vaithyanathan},
607 |   title =        {Thumbs up?  {Sentiment} Classification using Machine Learning Techniques},
608 |   booktitle =    "Proceedings of the 2002 Conference on Empirical Methods in Natural
609 | Language Processing (EMNLP)",
610 |   pages = {79--86},
611 |   year =         2002
612 | }
613 | 
614 | @InProceedings{Cun02b,
615 |   author =   {H. Cunningham and D. Maynard and K. Bontcheva and V. Tablan},
616 |   title =    {{GATE: A framework and graphical development environment for robust NLP tools and applications}},
617 |   booktitle =    {Proceedings of the 40th Anniversary Meeting of the
618 |                   Association for Computational Linguistics},
619 |   year =     2002
620 | }
621 | 
622 | @InProceedings{Pang+Lee:04a,
623 |   author =       {Bo Pang and Lillian Lee},
624 |   title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
625 |   booktitle =    "Proceedings of the ACL",
626 |   year =         2004
627 | }
628 | 
629 | 
630 | 
631 | @inproceedings{DBLP:conf/tools/Rossum97,
632 |   author    = {{Guido van Rossum}},
633 |   title     = {A Tour of the Python Language},
634 |   booktitle = {TOOLS (23)},
635 |   year      = {1997},
636 |   pages     = {370},
637 |   ee        = {http://doi.ieeecomputersociety.org/10.1109/TOOLS.1997.10001},
638 |   crossref  = {DBLP:conf/tools/23-1997},
639 |   bibsource = {DBLP, http://dblp.uni-trier.de}
640 | }
641 | 
642 | 
643 | 
644 | 
645 | 
646 | @article{penntreebank,
647 |     abstract = {{this paper, we review our experience with constructing one such large annotated
648 | corpus--the Penn Treebank, a corpus consisting of over 4.5 million words of American
649 | English. During the first three-year phase of the Penn Treebank Project (1989-1992), this
650 | corpus has been annotated for part-of-speech (POS) information. In addition, over half
651 | 
652 | of it has been annotated for skeletal syntactic structure. These materials are available
653 | to members of the Linguistic Data Consortium; for details, see...}},
654 |     author = {Marcus, Mitchell P. and Santorini, Beatrice and Marcinkiewicz, Mary A.},
655 |     citeulike-article-id = {1205174},
656 |     citeulike-linkout-0 = {http://acl.ldc.upenn.edu/J/J93/J93-2004.pdf},
657 |     citeulike-linkout-1 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.9706},
658 |     journal = {Computational Linguistics},
659 |     keywords = {annotation, corpora, english, nlp, penn-treebank},
660 |     number = {2},
661 |     pages = {313--330},
662 |     posted-at = {2009-05-18 10:02:58},
663 |     priority = {2},
664 |     title = {{Building a Large Annotated Corpus of English: The Penn Treebank}},
665 |     url = {http://acl.ldc.upenn.edu/J/J93/J93-2004.pdf},
666 |     volume = {19},
667 |     year = {1994}
668 | }
669 | 
670 | 
671 | 
672 | @misc{miningpeanut,
673 |     abstract = {{The web contains a wealth of product reviews, but sifting through
674 | them is a daunting task. Ideally, an opinion mining tool would process
675 | a set of search results for a given item, generating a list of
676 | product attributes (quality, features, etc.) and aggregating opinions
677 | about each of them (poor, mixed, good). We begin by identifying
678 | the unique properties of this problem and develop a method
679 | for automatically distinguishing between positive and negative reviews.
680 | Our classifier draws on...}},
681 |     author = {Dave, D. and Lawrence, S.},
682 |     citeulike-article-id = {899598},
683 |     citeulike-linkout-0 = {http://citeseer.ist.psu.edu/dave03mining.html},
684 |     citeulike-linkout-1 = {http://citeseer.lcs.mit.edu/dave03mining.html},
685 |     citeulike-linkout-2 = {http://citeseer.ifi.unizh.ch/dave03mining.html},
686 |     citeulike-linkout-3 = {http://citeseer.comp.nus.edu.sg/dave03mining.html},
687 |     keywords = {blogs, lecture-8, social, web, web\_20},
688 |     posted-at = {2008-02-25 21:56:38},
689 |     priority = {2},
690 |     year = 2003,
691 |     title = {{Mining the peanut gallery: opinion extraction and semantic classification of product reviews}},
692 |     url = {http://citeseer.ist.psu.edu/dave03mining.html}
693 | }
694 | 
695 | 
696 | 
697 | @book{webdatamining,
698 |     abstract = {{<P>Web mining aims to discover useful information and knowledge from the Web hyperlink structure, page contents, and usage data. Although Web mining uses many conventional data mining techniques, it is not purely an application of traditional data mining due to the semistructured and unstructured nature of the Web data and its heterogeneity. It has also developed many of its own algorithms and techniques.</P> <P>Liu has written a comprehensive text on Web data mining. Key topics of structure mining, content mining, and usage mining are covered both in breadth and in depth. His book brings together all the essential concepts and algorithms from related areas such as data mining, machine learning, and text processing to form an authoritative and coherent text. </P> <P>The book offers a rich blend of theory and practice, addressing seminal research ideas, as well as examining the technology from a practical point of view. It is suitable for students, researchers and practitioners interested in Web mining both as a learning text and a reference book. Lecturers can readily use it for classes on data mining, Web mining, and Web search. Additional teaching materials such as lecture slides, datasets, and implemented algorithms are available online.</P>}},
699 |     author = {Liu, Bing},
700 |     citeulike-article-id = {975464},
701 |     citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&amp;path=ASIN/3540378812},
702 |     citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&amp;path=ASIN/3540378812},
703 |     citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&amp;path=ASIN/3540378812},
704 |     citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/3540378812},
705 |     citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/3540378812/citeulike00-21},
706 |     citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/3540378812},
707 |     citeulike-linkout-6 = {http://www.worldcat.org/isbn/3540378812},
708 |     citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN3540378812},
709 |     citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=3540378812\&index=books\&linkCode=qs},
710 |     citeulike-linkout-9 = {http://www.librarything.com/isbn/3540378812},
711 |     day = {21},
712 |     edition = {1st ed. 2007. Corr. 2nd printing},
713 |     howpublished = {Hardcover},
714 |     isbn = {3540378812},
715 |     keywords = {data-mining, machine-learning},
716 |     month = jan,
717 |     posted-at = {2007-10-02 19:55:48},
718 |     priority = {2},
719 |     publisher = {Springer},
720 |     title = {{Web Data Mining: Exploring Hyperlinks, Contents, and Usage Data (Data-Centric Systems and Applications)}},
721 |     url = {http://www.worldcat.org/isbn/3540378812},
722 |     year = {2009}
723 | }
724 | 
725 | 
726 | 
727 | 
728 | @webpage{opennlp,
729 |   author = "Apache, Incubator",
730 |   title = "openNLP",
731 |   url = "http://incubator.apache.org/opennlp/",
732 |   year = 2011,
733 |   month = 01,
734 |   day = 29,
735 |   lastchecked = {\today}
736 | }
737 | 
738 | @webpage{stanfordpos,
739 |   author = "Stanford NLP, (The Stanford Natural Language Processing Group)",
740 |   title = "Stanford Log-linear Part-Of-Speech Tagger",
741 |   url = "http://nlp.stanford.edu/software/tagger.shtml",
742 |   year = 2010,
743 |   month = 05,
744 |   day = 21,
745 |   lastchecked = {\today}
746 | }
747 | 
748 | @inproceedings{sentimentanalysissvm,
749 |     address = {Barcelona, Spain},
750 |     author = {Mullen, Tony and Collier, Nigel},
751 |     booktitle = {Proceedings of EMNLP 2004},
752 |     citeulike-article-id = {4742195},
753 |     citeulike-linkout-0 = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.bib},
754 |     citeulike-linkout-1 = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.pdf},
755 |     editor = {Lin, Dekang and Wu, Dekai},
756 |     keywords = {detection, different, learning, machine, pmi, sentiment, sources, svm},
757 |     month = jul,
758 |     pages = {412--418},
759 |     posted-at = {2009-06-04 09:21:18},
760 |     priority = {0},
761 |     publisher = {Association for Computational Linguistics},
762 |     title = {{Sentiment Analysis using Support Vector Machines with Diverse Information Sources}},
763 |     url = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.bib},
764 |     year = {2004}
765 | }
766 | 
767 | 
768 | @webpage{moviereviews,
769 |   author = "Pang, Bo and Lee, Lillian",
770 |   title = "Movie Review Data",
771 |   url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/",
772 |   year = 2009,
773 |   month = 10,
774 |   day = 1,
775 |   lastchecked = {\today}
776 | }
777 | 
778 | @webpage{quantly:lingfranc,
779 |   author = "Quantivity",
780 |   title = "Algorithmic Lingua Franca",
781 |   url = "http://quantivity.wordpress.com/2010/01/02/algorithmic-lingua-franca/",
782 |   year = 2010,
783 |   month = 1,
784 |   day = 2,
785 |   lastchecked = {\today}
786 | }
787 | 
788 | @webpage{RPack:snippets,
789 |   author = "Simon Urbanek",
790 |   title = "Code snippets, mostly visualization-related",
791 |   url = "http://www.rforge.net/snippets/",
792 |   year = 2011,
793 |   month = 2,
794 |   day = 15,
795 |   lastchecked = {\today}
796 | }
797 | 
798 | 
799 | 
800 | 
801 | @webpage{RMetrics,
802 |   author = "Rmetrics, Association",
803 |   title = "Rmetrics The premier open source software solution for teaching and training quantitative finance",
804 |   url = "https://www.rmetrics.org/",
805 |   year = 2011,
806 |   month = 4,
807 |   day = 6,
808 |   lastchecked = {\today}
809 | }
810 | 
811 | @MISC{RPack:XML,
812 |   author = "Duncan Temple Lang",
813 |   title = "XML: Tools for parsing and generating XML within R and S-Plus",
814 |   url = "http://www.omegahat.org/RSXML",
815 |   lastchecked = {\today}
816 | }
817 | 
818 | 
819 | @MISC{RPack:tm,
820 |   author = "Ingo Feinerer",
821 |   title = "tm: Text Mining Package",
822 |   url = "http://tm.r-forge.r-project.org/",
823 |   lastchecked = {\today}
824 | }
825 | 
826 | @MISC{RPack:xts,
827 |   author = "Jeffrey A. Ryan and Josh M. Ulrich",
828 |   title = "xts: Extensible Time Series",
829 |   url = "http://r-forge.r-project.org/projects/xts/",
830 |   year = 2011,
831 |   lastchecked = {\today}
832 | }
833 | 
834 | @MISC{RPack:TTR,
835 |   author = "Joshua Ulrich",
836 |   title = "TTR: Technical Trading Rules",
837 |   url = "http://cran.at.r-project.org/web/packages/TTR/TTR.pdf",
838 |   year = 2010,
839 |   lastchecked = {\today}
840 | }
841 | 
842 | @MISC{RPack:quantmod,
843 |   author = "Jeffrey A. Ryan",
844 |   title = "quantmod: Quantitative Financial Modelling Framework",
845 |   year = {2009},
846 |   url = "http://www.quantmod.com/",
847 |   lastchecked = {\today}
848 | }
849 | 
850 | @MISC{RPack:slam,
851 |   author = "Kurt Hornik and David Meyer and Christian Buchta",
852 |   title = "slam: Sparse Lightweight Arrays and Matrices",
853 |   url = "http://cran.at.r-project.org/web/packages/slam/slam.pdf",
854 |   year = 2011,
855 |   lastchecked = {\today}
856 | }
857 | 
858 | @MISC{RPack:zoo,
859 |   author = "Achim Zeileis and Gabor Grothendieck and Felix Andrews",
860 |   title = "zoo: Z's ordered observations",
861 |   url = "http://r-forge.r-project.org/projects/zoo/",
862 |   year = 2011,
863 |   lastchecked = {\today}
864 | }
865 | 
866 | @MISC{GoogleNewsArchive,
867 |   author = "Google",
868 |   title = "Google News Archive Search",
869 |   url = "http://news.google.com/archivesearch",
870 |   lastchecked = {\today}
871 | }
872 | 
873 | @MISC{XML,
874 |   author = "W3C",
875 |   title = "Extensible Markup Language (XML) 1.0 (Fifth Edition)",
876 |   url = "http://www.w3.org/TR/REC-xml/",
877 |   year = 2008,
878 |   month = 11,
879 |   day = 26,
880 |   lastchecked = {\today}
881 | }
882 | 
883 | @MISC{JavaScript,
884 |   author = "Mozilla",
885 |   title = "JavaScript",
886 |   url = "https://developer.mozilla.org/en/JavaScript#Documentation",
887 |   year = 2011,
888 |   lastchecked = {\today}
889 | }
890 | 
891 | @MISC{RSS,
892 |   author = "RSS Advisory Board",
893 |   title = "RSS 2.0 Specification",
894 |   url = "http://www.rssboard.org/rss-specification",
895 |   year = 2002,
896 |   lastchecked = {\today}
897 | }
898 | 
899 | @MISC{ATOM,
900 |   author = "IETF",
901 |   title = "The Atom Syndication Format",
902 |   url = "http://tools.ietf.org/html/rfc4287",
903 |   year = 2005,
904 |   lastchecked = {\today}
905 | }
906 | 
907 | @MISC{JSON,
908 |   author = "Douglas Crockford",
909 |   title = "Introducing JSON",
910 |   url = "http://www.json.org",
911 |   year = 2002,
912 |   lastchecked = {\today}
913 | }
914 | 
915 | 
916 | @MISC{GoogleFinance,
917 |   author = "Google",
918 |   title = "Google Finance",
919 |   url = "http://www.google.com/finance",
920 |   lastchecked = {\today}
921 | }
922 | 
923 | @MISC{YahooFinance,
924 |   author = "Yahoo!",
925 |   title = "Yahoo! Finance",
926 |   url = "http://finance.yahoo.com/",
927 |   lastchecked = {\today}
928 | }
929 | 
930 | @mastersthesis{Hariharan04NewsMining,
931 |     author = "Gurushyam Hariharan",
932 |     title = "News Mining Agent for Automated Stock Trading",
933 |     school = "University of Texas, Austin",
934 |     year = "2004"}
935 | 
936 | @book{hadoop,
937 |     abstract = {{Hadoop: The Definitive Guide helps you harness the power of your data. Ideal
938 | for processing large datasets, the Apache Hadoop framework is an open source
939 | implementation of the MapReduce algorithm on which Google built its empire.
940 | This comprehensive resource demonstrates how to use Hadoop to build reliable,
941 | scalable, distributed systems: programmers will find details for analyzing
942 | large datasets, and administrators will learn how to set up and run Hadoop
943 | clusters. Complete with case studies that illustrate how Hadoop solves
944 | specific problems, this book helps you:
945 | 
946 | Use the Hadoop Distributed File System (HDFS) for storing large datasets, and
947 | run distributed computations over those datasets using MapReduce Become
948 | familiar with Hadoop's data and I/O building blocks for compression, data
949 | integrity, serialization, and persistence Discover common pitfalls and
950 | advanced features for writing real-world MapReduce programs Design, build, and
951 | administer a dedicated Hadoop cluster, or run Hadoop in the cloud Use Pig, a
952 | high-level query language for large-scale data processing Take advantage of
953 | HBase, Hadoop's database for structured and semi-structured data Learn
954 | ZooKeeper, a toolkit of coordination primitives for building distributed
955 | systems
956 | 
957 | If you have lots of data -- whether it's gigabytes or petabytes -- Hadoop is
958 | the perfect solution. Hadoop: The Definitive Guide is the most thorough book
959 | available on the subject. "Now you have the opportunity to learn about Hadoop
960 | from a master-not only of the technology, but also of common sense and plain
961 | talk." -- Doug Cutting, Hadoop Founder, Yahoo!}},
962 |     author = {White, Tom},
963 |     citeulike-article-id = {4882841},
964 |     citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&amp;path=ASIN/0596521979},
965 |     citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&amp;path=ASIN/0596521979},
966 |     citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&amp;path=ASIN/0596521979},
967 |     citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/0596521979},
968 |     citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/0596521979/citeulike00-21},
969 |     citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0596521979},
970 |     citeulike-linkout-6 = {http://www.worldcat.org/isbn/0596521979},
971 |     citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN0596521979},
972 |     citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=0596521979\&index=books\&linkCode=qs},
973 |     citeulike-linkout-9 = {http://www.librarything.com/isbn/0596521979},
974 |     day = {05},
975 |     edition = {1},
976 |     howpublished = {Paperback},
977 |     isbn = {0596521979},
978 |     month = jun,
979 |     posted-at = {2009-06-20 17:40:53},
980 |     priority = {2},
981 |     publisher = {O'Reilly Media},
982 |     title = {{Hadoop: The Definitive Guide}},
983 |     url = {http://www.worldcat.org/isbn/0596521979},
984 |     year = {2009}
985 | }
986 | 
987 |     


--------------------------------------------------------------------------------