├── tests ├── .gitignore ├── testthat.R ├── testthat.R.cran └── testthat │ ├── test-source-yahooinplay.R │ ├── test-source-reutersnews.R │ ├── test-source-googlefinance.R │ ├── test-source-yahoonews.R │ ├── test-source-googlenews.R │ ├── test-source-yahoofinance.R │ └── test-source-nytimes.R ├── inst ├── .gitignore └── NEWS.Rd ├── data ├── yahoonews.rda └── nytimes_appid.rda ├── .gitignore ├── .Rbuildignore ├── man ├── nytimes_appid.Rd ├── trimWhiteSpaces.Rd ├── source.update.Rd ├── encloseHTML.Rd ├── getEmpty.Rd ├── extract.Rd ├── yahoonews.Rd ├── parse.Rd ├── LiberationSource.Rd ├── YahooInplaySource.Rd ├── removeNonASCII.Rd ├── YahooNewsSource.Rd ├── GoogleNewsSource.Rd ├── feedquery.Rd ├── extractHTMLStrip.Rd ├── YahooFinanceSource.Rd ├── GoogleFinanceSource.Rd ├── ReutersNewsSource.Rd ├── readWeb.Rd ├── corpus.update.Rd ├── extractContentDOM.Rd ├── WebCorpus.Rd ├── WebSource.Rd ├── tm.plugin.webmining-package.Rd ├── NYTimesSource.Rd └── getLinkContent.Rd ├── .travis.yml ├── R ├── trimWhiteSpaces.R ├── parser.R ├── feedquery.R ├── transform.R ├── tm.plugin.webmining-package.R ├── getLinkContent.R ├── corpus.R ├── extract.R ├── reader.R └── source.R ├── DESCRIPTION ├── vignettes ├── tables │ └── sources.tex ├── ShortIntro.Rnw └── references.bib ├── NAMESPACE ├── README.md └── Makefile /tests/.gitignore: -------------------------------------------------------------------------------- 1 | /.DS_Store 2 | -------------------------------------------------------------------------------- /inst/.gitignore: -------------------------------------------------------------------------------- 1 | /googleSearch.R 2 | -------------------------------------------------------------------------------- /data/yahoonews.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mannau/tm.plugin.webmining/HEAD/data/yahoonews.rda -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tm) 3 | 4 | test_check("tm.plugin.webmining") 5 | -------------------------------------------------------------------------------- /data/nytimes_appid.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mannau/tm.plugin.webmining/HEAD/data/nytimes_appid.rda -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .settings 2 | .project 3 | .README.md.html 4 | *.tar.gz 5 | *.Rcheck 6 | /release 7 | /.DS_Store 8 | .Rproj.user 9 | *.Rproj -------------------------------------------------------------------------------- /tests/testthat.R.cran: -------------------------------------------------------------------------------- 1 | ### De-activate tests for CRAN 2 | #library(testthat) 3 | #library(tm) 4 | # 5 | #test_check("tm.plugin.webmining") 6 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .settings 2 | .travis.yml 3 | .gitignore 4 | .project 5 | .README.md.html 6 | Makefile 7 | README.md 8 | release 9 | .*tar.gz 10 | data/.gitignore 11 | travis-tool.sh 12 | inst/googleSearch.R 13 | tests/testthat.R.cran 14 | tests/testthat.R.temp 15 | ^.*\.Rproj$ 16 | ^\.Rproj\.user$ 17 | 
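The test driver `tests/testthat.R` runs the network-dependent test suite, while `tests/testthat.R.cran` keeps a deactivated copy for CRAN submissions and is excluded from the build via `.Rbuildignore`. A minimal sketch of running the live suite locally, assuming the package and its test dependencies are installed (`devtools` is an extra assumption, not part of this repository):

```r
# tests/testthat.R calls test_check() during R CMD check:
library(testthat)
library(tm)
test_check("tm.plugin.webmining")

# From a source checkout the same suite can be run interactively with
# devtools (assumed to be installed), without rebuilding the package:
# devtools::test()
```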
-------------------------------------------------------------------------------- /inst/NEWS.Rd: -------------------------------------------------------------------------------- 1 | \name{NEWS} 2 | \title{News for Package 'tm.plugin.webmining'} 3 | \newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}} 4 | 5 | \section{Changes in tm.plugin.webmining version 1.3 (2015-05-07)}{ 6 | \itemize{ 7 | \item Fix Issue #6: NYTimesSource 8 | \item Various fixes in tests and sources 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /man/nytimes_appid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/tm.plugin.webmining-package.R 3 | \docType{data} 4 | \name{nytimes_appid} 5 | \alias{nytimes_appid} 6 | \title{AppID for the NYtimes-API.} 7 | \description{ 8 | USED ONLY FOR PACKAGE TESTING. PLEASE DOWNLOAD YOUR OWN KEY AT \url{http://developer.nytimes.com/}!!! 9 | } 10 | \author{ 11 | Mario Annau 12 | } 13 | \keyword{data} 14 | 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | 3 | env: 4 | global: 5 | - _R_CHECK_CRAN_INCOMING_=FALSE 6 | 7 | # Be strict when checking our package 8 | warnings_are_errors: true 9 | cran: http://cran.us.r-project.org 10 | 11 | # System dependencies for HTTP calling 12 | apt_packages: 13 | - libcurl4-openssl-dev 14 | - libxml2-dev 15 | - r-cran-rjava 16 | r_github_packages: 17 | - jimhester/covr 18 | after_success: 19 | - R --slave --vanilla -e 'library(covr); pc <- package_coverage(); pc; codecov(coverage = pc)' 20 | -------------------------------------------------------------------------------- /R/trimWhiteSpaces.R: -------------------------------------------------------------------------------- 1 | #' @title Trim White Spaces from Text Document. 
2 | #' @description Transformation function, essentially equivalent to \code{stripWhitespace}, but 3 | #' applicable to plain character strings (using Perl regular expressions) 4 | #' @author Mario Annau 5 | #' @param txt character 6 | #' @seealso \code{\link{stripWhitespace}} 7 | #' @export 8 | trimWhiteSpaces <- 9 | function(txt){ 10 | txt <- sub("^\\s+", "", txt, perl = TRUE) 11 | txt <- sub("\\s+$", "", txt, perl = TRUE) 12 | txt <- gsub("\\s\\s+", " ", txt, perl = TRUE) 13 | return(txt) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/trimWhiteSpaces.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/trimWhiteSpaces.R 3 | \name{trimWhiteSpaces} 4 | \alias{trimWhiteSpaces} 5 | \title{Trim White Spaces from Text Document.} 6 | \usage{ 7 | trimWhiteSpaces(txt) 8 | } 9 | \arguments{ 10 | \item{txt}{character} 11 | } 12 | \description{ 13 | Transformation function, essentially equivalent to \code{stripWhitespace}, but 14 | applicable to plain character strings (using Perl regular expressions) 15 | } 16 | \author{ 17 | Mario Annau 18 | } 19 | \seealso{ 20 | \code{\link{stripWhitespace}} 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/source.update.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{source.update} 4 | \alias{source.update} 5 | \alias{source.update.WebHTMLSource} 6 | \alias{source.update.WebJSONSource} 7 | \alias{source.update.WebXMLSource} 8 | \title{Update WebXMLSource/WebHTMLSource/WebJSONSource} 9 | \usage{ 10 | source.update(x) 11 | } 12 | \arguments{ 13 | \item{x}{Source object to be updated} 14 | } 15 | \description{ 16 | Typically, update is called from \code{\link{corpus.update}} and refreshes \code{$Content} in 17 | the Source object. 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/encloseHTML.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/transform.R 3 | \name{encloseHTML} 4 | \alias{encloseHTML} 5 | \alias{encloseHTML.PlainTextDocument} 6 | \alias{encloseHTML.character} 7 | \title{Enclose Text Content in HTML tags} 8 | \usage{ 9 | encloseHTML(x) 10 | } 11 | \arguments{ 12 | \item{x}{object of PlainTextDocument class} 13 | } 14 | \description{ 15 | Simple helper function which encloses text content of character 16 | (or \code{\link[tm]{TextDocument}}) in HTML-tags. That way, HTML 17 | content can be parsed more easily by \code{\link[XML]{htmlTreeParse}} 18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/getEmpty.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/corpus.R 3 | \name{getEmpty} 4 | \alias{getEmpty} 5 | \alias{getEmpty.WebCorpus} 6 | \title{Retrieve Empty Corpus Elements through \code{$postFUN}.} 7 | \usage{ 8 | getEmpty(x, ...) 9 | } 10 | \arguments{ 11 | \item{x}{object of type \code{\link{WebCorpus}}} 12 | 13 | \item{...}{additional parameters to PostFUN} 14 | } 15 | \description{ 16 | Retrieve content of all empty (textlength equals zero) corpus elements.
If 17 | corpus element is empty, \code{$postFUN} is called (specified in \code{\link{meta}}) 18 | } 19 | \seealso{ 20 | \code{\link{WebCorpus}} 21 | } 22 | 23 | -------------------------------------------------------------------------------- /man/extract.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/extract.R 3 | \name{extract} 4 | \alias{extract} 5 | \alias{extract.PlainTextDocument} 6 | \title{Extract main content from \code{TextDocument}s.} 7 | \usage{ 8 | extract(x, extractor, ...) 9 | } 10 | \arguments{ 11 | \item{x}{PlainTextDocument} 12 | 13 | \item{extractor}{default extraction function to be used, defaults to \code{\link{extractContentDOM}}} 14 | 15 | \item{...}{additional parameters to extractor function} 16 | } 17 | \description{ 18 | Use implemented extraction functions (through boilerpipeR) to extract main content from 19 | \code{TextDocument}s. 20 | } 21 | 22 | -------------------------------------------------------------------------------- /man/yahoonews.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/tm.plugin.webmining-package.R 3 | \docType{data} 4 | \name{yahoonews} 5 | \alias{yahoonews} 6 | \title{WebCorpus retrieved from Yahoo! News for the search term "Microsoft" 7 | through the YahooNewsSource. Length of retrieved corpus is 20.} 8 | \description{ 9 | WebCorpus retrieved from Yahoo! News for the search term "Microsoft" 10 | through the YahooNewsSource. Length of retrieved corpus is 20. 11 | } 12 | \examples{ 13 | #Data set has been generated as follows: 14 | \dontrun{ 15 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 16 | } 17 | } 18 | \author{ 19 | Mario Annau 20 | } 21 | \keyword{data} 22 | 23 | -------------------------------------------------------------------------------- /man/parse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/parser.R 3 | \name{parse} 4 | \alias{parse} 5 | \title{Wrapper/Convenience function to ensure right encoding for different Platforms} 6 | \usage{ 7 | parse(..., asText = TRUE, type = c("XML", "HTML", "JSON")) 8 | } 9 | \arguments{ 10 | \item{...}{arguments to be passed to specified parser function} 11 | 12 | \item{asText}{defines if input should be treated as text/character, default to TRUE} 13 | 14 | \item{type}{either "XML", "HTML" or "JSON". Defaults to "XML"} 15 | } 16 | \description{ 17 | Depending on specified type one of the following parser functions is called: 18 | \describe{ 19 | \item{XML}{\code{\link{xmlInternalTreeParse}}} 20 | \item{HTML}{\code{\link{htmlTreeParse}}} 21 | \item{JSON}{\code{\link{fromJSON}}} 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/LiberationSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{LiberationSource} 4 | \alias{LiberationSource} 5 | \alias{readLiberationSource} 6 | \title{Get news data from french Liberation News Paper (\url{http://rss.liberation.fr/rss}).} 7 | \usage{ 8 | LiberationSource(query = "latest", ...) 
9 | } 10 | \arguments{ 11 | \item{query}{feed to be retrieved, defaults to 'latest'} 12 | 13 | \item{...}{additional parameters to \code{\link{WebSource}}} 14 | } 15 | \value{ 16 | WebXMLSource 17 | } 18 | \description{ 19 | Get news data from french Liberation News Paper (\url{http://rss.liberation.fr/rss}). 20 | } 21 | \examples{ 22 | \dontrun{ 23 | corpus <- WebCorpus(LiberationSource("latest")) 24 | } 25 | } 26 | \author{ 27 | Mario Annau 28 | } 29 | \seealso{ 30 | \code{\link{WebSource}} 31 | } 32 | 33 | -------------------------------------------------------------------------------- /man/YahooInplaySource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{YahooInplaySource} 4 | \alias{YahooInplaySource} 5 | \alias{readYahooInplay} 6 | \title{Get News from Yahoo Inplay.} 7 | \usage{ 8 | YahooInplaySource(...) 9 | } 10 | \arguments{ 11 | \item{...}{additional parameters to \code{\link{WebSource}}} 12 | } 13 | \value{ 14 | WebHTMLSource 15 | } 16 | \description{ 17 | Yahoo Inplay lists a range of company news provided by Briefing.com. Since Yahoo Inplay 18 | does not provide a structured XML news feed, content is parsed directly from the HTML page. 19 | Therefore, no further Source parameters can be specified. The number of feed items per 20 | request can vary substantially. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | corpus <- WebCorpus(YahooInplaySource()) 25 | } 26 | } 27 | \author{ 28 | Mario Annau 29 | } 30 | 31 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tm.plugin.webmining 2 | Version: 1.3.2 3 | Date: 2015-09-10 4 | Title: Retrieve Structured, Textual Data from Various Web Sources 5 | Authors@R: c(person("Mario", "Annau", role = c("aut", "cre"), 6 | email = "mario.annau@gmail.com")) 7 | Depends: 8 | R (>= 3.1.0) 9 | Imports: 10 | NLP (>= 0.1-2), 11 | tm (>= 0.6), 12 | boilerpipeR, 13 | RCurl, 14 | XML, 15 | RJSONIO 16 | Suggests: 17 | testthat 18 | Description: Facilitate text retrieval from feed 19 | formats like XML (RSS, ATOM) and JSON. Also direct retrieval from 20 | HTML is supported. As most (news) feeds only incorporate small 21 | fractions of the original text tm.plugin.webmining even retrieves 22 | and extracts the text of the original text source. 
23 | License: GPL-3 24 | URL: https://github.com/mannau/tm.plugin.webmining 25 | BugReports: https://github.com/mannau/tm.plugin.webmining/issues 26 | -------------------------------------------------------------------------------- /man/removeNonASCII.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/transform.R 3 | \name{removeNonASCII} 4 | \alias{removeNonASCII} 5 | \alias{removeNonASCII.PlainTextDocument} 6 | \title{Remove non-ASCII characters from Text.} 7 | \usage{ 8 | removeNonASCII(x, fields = c("Content", "Heading", "Description"), 9 | from = "UTF-8", to = "ASCII//TRANSLIT") 10 | } 11 | \arguments{ 12 | \item{x}{object of PlainTextDocument class} 13 | 14 | \item{fields}{specifies fields to be converted, defaults to fields = c("Content", "Heading", "Description")} 15 | 16 | \item{from}{specifies encoding from which conversion should be done, defaults to "UTF-8"} 17 | 18 | \item{to}{speciefies target encoding, defaults to "ASCII//TRANSLIT"} 19 | } 20 | \description{ 21 | This is a helper function to generate package data 22 | without non-ASCII character and omit the warning at R CMD check. 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/YahooNewsSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{YahooNewsSource} 4 | \alias{YahooNewsSource} 5 | \alias{readYahooHTML} 6 | \title{Get news data from Yahoo! News (\url{https://news.search.yahoo.com/search/}).} 7 | \usage{ 8 | YahooNewsSource(query, params = list(p = query), ...) 9 | } 10 | \arguments{ 11 | \item{query}{words to be searched in Yahoo News, multiple words must be separated by '+'} 12 | 13 | \item{params,}{additional query parameters, see \url{http://developer.yahoo.com/rss/}} 14 | 15 | \item{...}{additional parameters to \code{\link{WebSource}}} 16 | } 17 | \value{ 18 | WebXMLSource 19 | } 20 | \description{ 21 | Currently, only a maximum of 10 items can be retrieved. 
22 | } 23 | \examples{ 24 | \dontrun{ 25 | corpus <- WebCorpus(YahooNewsSource("Microsoft")) 26 | } 27 | } 28 | \author{ 29 | Mario Annau 30 | } 31 | \seealso{ 32 | \code{\link{WebSource}} 33 | } 34 | 35 | -------------------------------------------------------------------------------- /vignettes/tables/sources.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{l|r|l|c|l} 2 | \textbf{Source Name} & \textbf{Items} & \textbf{URL} & \textbf{Auth} & 3 | \textbf{Format}\\ 4 | \hline \class{GoogleBlogSearchSource} & 100 & 5 | \url{http://www.google.com/blogsearch} & - & RSS\\ 6 | \class{GoogleFinanceSource} & 20 & \url{http://www.google.com/finance} & - & 7 | RSS\\ 8 | \class{GoogleNewsSource} & 100 & \url{http://news.google.com} & - & RSS\\ 9 | \class{NYTimesSource} & 100 & \url{http://api.nytimes.com} & x & JSON\\ 10 | \class{ReutersNewsSource} & 20 & \url{http://www.reuters.com/tools/rss} & - & 11 | ATOM\\ 12 | %\class{TwitterSource} & 1500 & \url{http://search.twitter.com/api} & - & ATOM\\ 13 | \class{YahooFinanceSource} & 20 & \url{http://finance.yahoo.com} & - & RSS\\ 14 | \class{YahooInplaySource} & 100+ & 15 | \url{http://finance.yahoo.com/marketupdate/inplay} & - & HTML\\ 16 | \class{YahooNewsSource} & 20 & \url{http://news.search.yahoo.com/rss} & - & RSS 17 | \end{tabular} -------------------------------------------------------------------------------- /R/parser.R: -------------------------------------------------------------------------------- 1 | #' @title Wrapper/Convenience function to ensure right encoding for different Platforms 2 | #' @description Depending on specified type one of the following parser functions is called: 3 | #' \describe{ 4 | #' \item{XML}{\code{\link{xmlInternalTreeParse}}} 5 | #' \item{HTML}{\code{\link{htmlTreeParse}}} 6 | #' \item{JSON}{\code{\link{fromJSON}}} 7 | #' } 8 | #' @param ... arguments to be passed to specified parser function 9 | #' @param asText defines if input should be treated as text/character, default to TRUE 10 | #' @param type either "XML", "HTML" or "JSON". Defaults to "XML" 11 | #' @export 12 | parse <- function(..., asText = TRUE, type = c("XML", "HTML", "JSON")){ 13 | parsetype <- match.arg(type) 14 | encoding <- switch(.Platform$OS.type, 15 | unix = "UTF-8", 16 | windows = "latin1") 17 | parser <- switch(parsetype, 18 | XML = xmlInternalTreeParse, 19 | HTML = htmlTreeParse, 20 | JSON = fromJSON) 21 | parser(..., encoding = encoding, asText = asText) 22 | } 23 | -------------------------------------------------------------------------------- /man/GoogleNewsSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{GoogleNewsSource} 4 | \alias{GoogleNewsSource} 5 | \title{Get feed data from Google News Search \url{http://news.google.com/}} 6 | \usage{ 7 | GoogleNewsSource(query, params = list(hl = "en", q = query, ie = "utf-8", num 8 | = 30, output = "rss"), ...) 9 | } 10 | \arguments{ 11 | \item{query}{Google News Search query} 12 | 13 | \item{params,}{additional query parameters} 14 | 15 | \item{...}{additional parameters to \code{\link{WebSource}}} 16 | } 17 | \value{ 18 | WebXMLSource 19 | } 20 | \description{ 21 | Google News Search is one of the most popular news aggregators on the web. News 22 | can be retrieved for any customized user query. Up to 30 can be retrieved per 23 | request. 
24 | } 25 | \examples{ 26 | \dontrun{ 27 | corpus <- WebCorpus(GoogleNewsSource("Microsoft")) 28 | } 29 | } 30 | \author{ 31 | Mario Annau 32 | } 33 | \seealso{ 34 | \code{\link{WebSource}} 35 | } 36 | 37 | -------------------------------------------------------------------------------- /man/feedquery.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/feedquery.R 3 | \name{feedquery} 4 | \alias{feedquery} 5 | \title{Buildup string for feedquery.} 6 | \usage{ 7 | feedquery(url, params) 8 | } 9 | \arguments{ 10 | \item{url}{character specifying feed url} 11 | 12 | \item{params}{list which contains feed parameters, e.g. list(param1="value1", param2="value2")} 13 | } 14 | \description{ 15 | Function has partly been taken from \code{\link[RCurl]{getForm}} function. 16 | Generally, a feed query is a string built up as follows: \cr 17 | \code{<url>?<param1=value1>&<param2=value2>&...&<paramN=valueN>} \cr 18 | By specifying a feed url and parameter--value pairs (as list) we can easily 19 | generate a feed query in R. 20 | } 21 | \examples{ 22 | \dontrun{ 23 | feedquery(url = "http://dummy.com", 24 | params = list(param1 = "value1", param2 = "value2")) 25 | } 26 | } 27 | \author{ 28 | Mario Annau 29 | } 30 | \seealso{ 31 | \code{\link{xmlNode}} \code{\link{getForm}} 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/extractHTMLStrip.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/extract.R 3 | \name{extractHTMLStrip} 4 | \alias{extractHTMLStrip} 5 | \title{Simply strip HTML Tags from Document} 6 | \usage{ 7 | extractHTMLStrip(url, asText = TRUE, encoding, ...) 8 | } 9 | \arguments{ 10 | \item{url}{character, url or filename} 11 | 12 | \item{asText}{specifies if url parameter is a \code{character}, defaults to TRUE} 13 | 14 | \item{encoding}{specifies local encoding to be used, depending on platform} 15 | 16 | \item{...}{Additional parameters for \code{\link{htmlTreeParse}}} 17 | } 18 | \description{ 19 | \code{extractHTMLStrip} parses an url, character or filename, reads the DOM 20 | tree, removes all HTML tags in the tree and outputs the source text without 21 | markup. 22 | } 23 | \note{ 24 | Input text should be enclosed in 'TEXT' tags to ensure correct 25 | DOM parsing (issue especially under .Platform$os.type = 'windows') 26 | } 27 | \author{ 28 | Mario Annau 29 | } 30 | \seealso{ 31 | \code{\link{xmlNode}} 32 | 33 | \code{\link{htmlTreeParse}} \code{\link{encloseHTML}} 34 | } 35 | 36 | -------------------------------------------------------------------------------- /man/YahooFinanceSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{YahooFinanceSource} 4 | \alias{YahooFinanceSource} 5 | \alias{readYahoo} 6 | \title{Get feed data from Yahoo! Finance.} 7 | \usage{ 8 | YahooFinanceSource(query, params = list(s = query, region = "US", lang = 9 | "en-US"), ...)
10 | } 11 | \arguments{ 12 | \item{query}{ticker symbols of companies to be searched for, see \url{http://finance.yahoo.com/lookup}.} 13 | 14 | \item{params}{additional query parameters, see \url{http://developer.yahoo.com/rss/}} 15 | 16 | \item{...}{additional parameters to \code{\link{WebSource}}} 17 | } 18 | \value{ 19 | WebXMLSource 20 | } 21 | \description{ 22 | Yahoo! Finance is a popular site which provides financial news and information. It is a large source 23 | for historical price data as well as financial news. Using the typical Yahoo! Finance ticker symbol, 24 | news items can easily be retrieved. However, the maximum number of items is 20. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | corpus <- WebCorpus(YahooFinanceSource("MSFT")) 29 | } 30 | } 31 | \author{ 32 | Mario Annau 33 | } 34 | \seealso{ 35 | \code{\link{WebSource}} 36 | } 37 | 38 | -------------------------------------------------------------------------------- /man/GoogleFinanceSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{GoogleFinanceSource} 4 | \alias{GoogleFinanceSource} 5 | \alias{readGoogle} 6 | \title{Get feed Meta Data from Google Finance.} 7 | \usage{ 8 | GoogleFinanceSource(query, params = list(hl = "en", q = query, ie = "utf-8", 9 | start = 0, num = 20, output = "rss"), ...) 10 | } 11 | \arguments{ 12 | \item{query}{ticker symbols of companies to be searched for, see \url{http://www.google.com/finance}. 13 | Please note that Google ticker symbols need to be prefixed with the exchange name, e.g. NASDAQ:MSFT} 14 | 15 | \item{params}{additional query parameters} 16 | 17 | \item{...}{additional parameters to \code{\link{WebSource}}} 18 | } 19 | \value{ 20 | WebXMLSource 21 | } 22 | \description{ 23 | Google Finance provides business and enterprise headlines for many companies. Coverage is 24 | particularly strong for US markets. However, only up to 20 feed items can be retrieved. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | corpus <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 29 | } 30 | } 31 | \author{ 32 | Mario Annau 33 | } 34 | \seealso{ 35 | \code{\link{WebSource}} 36 | } 37 | 38 | -------------------------------------------------------------------------------- /man/ReutersNewsSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{ReutersNewsSource} 4 | \alias{ReutersNewsSource} 5 | \alias{readReutersNews} 6 | \title{Get feed data from Reuters News RSS feed channels.} 7 | \usage{ 8 | ReutersNewsSource(query = "businessNews", ...) 9 | } 10 | \arguments{ 11 | \item{query}{Reuters News RSS Feed, see \url{http://www.reuters.com/tools/rss} for a list of all feeds provided. Note that only the string after 'http://feeds.reuters.com/reuters/' must be given. Defaults to 'businessNews'.} 12 | 13 | \item{...}{additional parameters to \code{\link{WebSource}}} 14 | } 15 | \value{ 16 | WebXMLSource 17 | } 18 | \description{ 19 | Reuters provides numerous feed channels (\url{http://www.reuters.com/tools/rss}) which can be retrieved through RSS 20 | feeds. Only up to 25 items can be retrieved---therefore an alternative retrieval 21 | through the Google Reader API (\code{\link{GoogleReaderSource}}) could be considered.
22 | } 23 | \examples{ 24 | \dontrun{ 25 | corpus <- WebCorpus(ReutersNewsSource("businessNews")) 26 | } 27 | } 28 | \author{ 29 | Mario Annau 30 | } 31 | \seealso{ 32 | \code{\link{WebSource}} 33 | } 34 | 35 | -------------------------------------------------------------------------------- /man/readWeb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/reader.R 3 | \name{readWeb} 4 | \alias{json_content} 5 | \alias{readWeb} 6 | \alias{readWebHTML} 7 | \alias{readWebJSON} 8 | \alias{readWebXML} 9 | \title{Read content from WebXMLSource/WebHTMLSource/WebJSONSource.} 10 | \usage{ 11 | readWeb(spec, doc, parser, contentparser, freeFUN = NULL) 12 | } 13 | \arguments{ 14 | \item{spec}{specification of content reader} 15 | 16 | \item{doc}{document to be parsed} 17 | 18 | \item{parser}{parser function to be used} 19 | 20 | \item{contentparser}{content parser function to be used, see also \code{tm:::xml_content} or \code{json_content}} 21 | 22 | \item{freeFUN}{function to free memory from parsed object (actually only relevant for XML and HTML trees)} 23 | } 24 | \value{ 25 | FunctionGenerator 26 | } 27 | \description{ 28 | \code{readWeb} is a FunctionGenerator which specifies content retrieval from a \code{\link{WebSource}} 29 | content elements. Currently, it is defined for XML, HTML and JSON feeds through \code{readWebXML}, 30 | \code{readWebHTML} and \code{readWebJSON}. Also content parsers (\code{xml_content}, \code{json_content}) 31 | need to be defined. 32 | } 33 | 34 | -------------------------------------------------------------------------------- /man/corpus.update.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/corpus.R 3 | \name{corpus.update} 4 | \alias{corpus.update} 5 | \alias{corpus.update.WebCorpus} 6 | \title{Update/Extend \code{\link{WebCorpus}} with new feed items.} 7 | \usage{ 8 | corpus.update(x, ...) 9 | } 10 | \arguments{ 11 | \item{x}{object of type \code{\link{WebCorpus}}} 12 | 13 | \item{...}{\describe{ 14 | \item{fieldname}{name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID"} 15 | \item{retryempty}{specifies if empty corpus elements should be downloaded again, defaults to TRUE} 16 | \item{...}{additional parameters to \code{\link{Corpus}} function} 17 | }} 18 | } 19 | \description{ 20 | The \code{corpus.update} method ensures, that the original 21 | \code{\link{WebCorpus}} feed sources are downloaded and checked against 22 | already included \code{TextDocument}s. Based on the \code{ID} included 23 | in the \code{TextDocument}'s meta data, only new feed elements are 24 | downloaded and added to the \code{\link{WebCorpus}}. 25 | All relevant information regariding the original source feeds are stored 26 | in the \code{\link{WebCorpus}}' meta data (\code{\link[tm]{meta}}). 27 | } 28 | 29 | -------------------------------------------------------------------------------- /R/feedquery.R: -------------------------------------------------------------------------------- 1 | #' @title Buildup string for feedquery. 2 | #' @description Function has partly been taken from \code{\link[RCurl]{getForm}} function. 
3 | #' Generally, a feed query is a string built up as follows: \cr 4 | #' \code{<url>?<param1=value1>&<param2=value2>&...&<paramN=valueN>} \cr 5 | #' By specifying a feed url and parameter--value pairs (as list) we can easily 6 | #' generate a feed query in R. 7 | #' @author Mario Annau 8 | #' @param url character specifying feed url 9 | #' @param params list which contains feed parameters, e.g. list(param1="value1", param2="value2") 10 | #' @seealso \code{\link{xmlNode}} \code{\link{getForm}} 11 | #' @examples 12 | #' \dontrun{ 13 | #' feedquery(url = "http://dummy.com", 14 | #' params = list(param1 = "value1", param2 = "value2")) 15 | #' } 16 | #' @export 17 | #' @importFrom RCurl curlEscape 18 | feedquery <- 19 | function(url, params){ 20 | els <- lapply(names(params), function(n) { 21 | paste(n, curlEscape(params[[n]]), sep = "=") 22 | }) 23 | names(els) <- names(params) 24 | 25 | feeds <- "" 26 | for(i in names(els)){ 27 | if(feeds[1] == ""){ 28 | sep = "" 29 | } 30 | else{ 31 | sep = "&" 32 | } 33 | feeds <- paste(feeds, els[[i]], sep = sep) 34 | } 35 | 36 | feeds <- paste(url, feeds, sep = "?") 37 | return(feeds) 38 | } -------------------------------------------------------------------------------- /man/extractContentDOM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/extract.R 3 | \name{extractContentDOM} 4 | \alias{assignValues} 5 | \alias{calcDensity} 6 | \alias{extractContentDOM} 7 | \alias{getMainText} 8 | \alias{removeTags} 9 | \title{Extract Main HTML Content from DOM} 10 | \usage{ 11 | extractContentDOM(url, threshold, asText = TRUE, ...) 12 | } 13 | \arguments{ 14 | \item{url}{character, url or filename} 15 | 16 | \item{threshold}{threshold for extraction, defaults to 0.5} 17 | 18 | \item{asText}{boolean, specifies if url should be interpreted as character} 19 | 20 | \item{...}{Additional Parameters to \code{\link{htmlTreeParse}}} 21 | } 22 | \description{ 23 | Function extracts main HTML Content using its Document Object Model. 24 | Idea comes basically from the fact, that main content of an HTML Document 25 | is in a subnode of the HTML DOM Tree with a high text-to-tag ratio. 26 | Internally, this function also calls 27 | \code{assignValues}, \code{calcDensity}, \code{getMainText} 28 | and \code{removeTags}. 29 | } 30 | \author{ 31 | Mario Annau 32 | } 33 | \references{ 34 | \url{http://www.elias.cn/En/ExtMainText}, 35 | \url{http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/} 36 | \cite{Gupta et al., DOM-based Content Extraction of HTML Documents},\url{http://www2003.org/cdrom/papers/refereed/p583/p583-gupta.html} 37 | } 38 | \seealso{ 39 | \code{\link{xmlNode}} 40 | } 41 | 42 | -------------------------------------------------------------------------------- /man/WebCorpus.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/corpus.R 3 | \name{WebCorpus} 4 | \alias{WebCorpus} 5 | \title{WebCorpus constructor function.} 6 | \usage{ 7 | WebCorpus(x, readerControl = list(reader = reader(x), language = "en"), 8 | postFUN = x$postFUN, retryEmpty = TRUE, ...)
9 | } 10 | \arguments{ 11 | \item{x}{object of type Source, see also \code{\link{Corpus}}} 12 | 13 | \item{readerControl}{specifies reader to be used for \code{Source}, defaults to 14 | list(reader = x$DefaultReader, language = "en"} 15 | 16 | \item{postFUN}{function to be applied to WebCorpus after web retrieval has been completed, 17 | defaults to x$PostFUN} 18 | 19 | \item{retryEmpty}{specifies if retrieval for empty content elements should be repeated, 20 | defaults to TRUE} 21 | 22 | \item{...}{additional parameters for Corpus function (actually Corpus reader)} 23 | } 24 | \description{ 25 | \code{WebCorpus} adds further methods and meta data to \code{\link[tm]{Corpus}} and therefore 26 | constructs a derived class of \code{\link[tm]{Corpus}}. Most importantly, \code{WebCorpus} 27 | calls \code{$PostFUN} on the generated \code{WebCorpus}, which retrieves the main content 28 | for most implemented \code{WebSource}s. Thus it enables an efficient retrieval of new feed items 29 | (\code{\link{corpus.update}}). All additional WebCorpus fields are added to \code{tm$meta} 30 | like \code{$source}, \code{$readerControl} and \code{$postFUN}. 31 | } 32 | 33 | -------------------------------------------------------------------------------- /tests/testthat/test-source-yahooinplay.R: -------------------------------------------------------------------------------- 1 | context("YahooInPlaySource") 2 | 3 | test_that("YahooInPlaySource",{ 4 | 5 | minlengthcorp <- 1 6 | 7 | testcorp <- WebCorpus(YahooInplaySource()) 8 | lengthcorp <- length(testcorp) 9 | # Check Corpus object 10 | expect_that(length(testcorp) >= minlengthcorp, is_true()) 11 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 12 | 13 | # Check Content 14 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 15 | contentlength <- sapply(testcorp, function(x) 16 | if( length(content(x)) < 1) 0 else nchar(content(x))) 17 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 18 | expect_that(contentratio > 0.5, is_true()) 19 | 20 | # Check Meta Data 21 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 22 | #FIXME: Date should be fixed 23 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "character")), is_true()) 24 | 25 | heading <- lapply(testcorp, function(x) meta(x, "heading")[1]) 26 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 27 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 28 | 29 | id <- lapply(testcorp, function(x) meta(x, "id")[1]) 30 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 31 | expect_that(all(sapply(id, nchar) > 0), is_true()) 32 | 33 | testcorp <- testcorp[1:length(minlengthcorp)] 34 | # TODO: test should be re-activated again 35 | #testcorp <- corpus.update(testcorp) 36 | #expect_that(length(testcorp) >= lengthcorp, is_true()) 37 | 38 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 39 | }) 40 | 41 | -------------------------------------------------------------------------------- /R/transform.R: -------------------------------------------------------------------------------- 1 | #' @title Enclose Text Content in HTML tags 2 | #' @description Simple helper function which encloses text content of character 3 | #' (or \code{\link[tm]{TextDocument}}) in HTML-tags. 
That way, HTML 4 | #' content can be parsed more easily by \code{\link[XML]{htmlTreeParse}} 5 | #' @param x object of PlainTextDocument class 6 | #' @export 7 | #' @aliases encloseHTML.PlainTextDocument encloseHTML.character 8 | encloseHTML <- function(x) UseMethod("encloseHTML", x) 9 | 10 | #' @importFrom NLP content<- 11 | #' @noRd 12 | #' @export 13 | # FIXME: Could be done easier?? 14 | encloseHTML.PlainTextDocument <- function(x){ 15 | content(x) <- sprintf("<html>%s</html>", x) 16 | x 17 | } 18 | 19 | #' @title Remove non-ASCII characters from Text. 20 | #' @description This is a helper function to generate package data 21 | #' without non-ASCII characters and omit the warning at R CMD check. 22 | #' @param x object of PlainTextDocument class 23 | #' @param fields specifies fields to be converted, defaults to fields = c("Content", "Heading", "Description") 24 | #' @param from specifies encoding from which conversion should be done, defaults to "UTF-8" 25 | #' @param to specifies target encoding, defaults to "ASCII//TRANSLIT" 26 | #' @export 27 | #' @aliases removeNonASCII.PlainTextDocument 28 | removeNonASCII <- function(x, fields = c("Content", "Heading", "Description"), from = "UTF-8", to = "ASCII//TRANSLIT") 29 | UseMethod("removeNonASCII", x) 30 | 31 | #' @noRd 32 | #' @export 33 | removeNonASCII.PlainTextDocument <- function(x, fields = c("Content", "Heading", "Description"), from = "UTF-8", to = "ASCII//TRANSLIT"){ 34 | if("Content" %in% fields){ 35 | content(x) <- iconv(x, from, to) 36 | } 37 | for(fn in setdiff(fields, "Content")){ 38 | meta(x, fn) <- iconv(meta(x, fn), from, to) 39 | } 40 | x 41 | } -------------------------------------------------------------------------------- /tests/testthat/test-source-reutersnews.R: -------------------------------------------------------------------------------- 1 | context("ReutersNewsSource") 2 | 3 | test_that("ReutersNewsSource",{ 4 | 5 | lengthcorp <- 20 6 | 7 | testcorp <- WebCorpus(ReutersNewsSource("businessNews")) 8 | # Check Corpus object 9 | expect_that(length(testcorp), equals(lengthcorp)) 10 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 11 | 12 | # Check Content 13 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 14 | contentlength <- sapply(testcorp, function(x) 15 | if( length(content(x)) < 1) 0 else nchar(content(x))) 16 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 17 | expect_that(contentratio > 0.5, is_true()) 18 | 19 | # Check Meta Data 20 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 21 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 22 | 23 | description <- lapply(testcorp, function(x) meta(x, "description")) 24 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 25 | 26 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 27 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 28 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 29 | 30 | id <- lapply(testcorp, function(x) meta(x, "id")) 31 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 32 | expect_that(all(sapply(id, nchar) > 0), is_true()) 33 | 34 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 35 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 36 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 37 | 38 | testcorp <- testcorp[1:5] 39 | testcorp <-
corpus.update(testcorp) 40 | expect_that(length(testcorp) >= lengthcorp, is_true()) 41 | 42 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 43 | }) 44 | 45 | -------------------------------------------------------------------------------- /tests/testthat/test-source-googlefinance.R: -------------------------------------------------------------------------------- 1 | context("GoogleFinanceSource") 2 | 3 | test_that("GoogleFinanceSource",{ 4 | 5 | lengthcorp <- 20 6 | 7 | testcorp <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 8 | # Check Corpus object 9 | expect_that(length(testcorp), equals(lengthcorp)) 10 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 11 | 12 | # Check Content 13 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 14 | contentlength <- sapply(testcorp, function(x) 15 | if( length(content(x)) < 1) 0 else nchar(content(x))) 16 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 17 | expect_that(contentratio > 0.5, is_true()) 18 | 19 | # Check Meta Data 20 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 21 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 22 | 23 | description <- lapply(testcorp, function(x) meta(x, "description")) 24 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 25 | 26 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 27 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 28 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 29 | 30 | id <- lapply(testcorp, function(x) meta(x, "id")) 31 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 32 | expect_that(all(sapply(id, nchar) > 0), is_true()) 33 | 34 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 35 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 36 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 37 | 38 | testcorp <- testcorp[1:10] 39 | testcorp <- corpus.update(testcorp) 40 | expect_that(length(testcorp) >= lengthcorp, is_true()) 41 | 42 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 43 | }) 44 | 45 | -------------------------------------------------------------------------------- /man/WebSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{WebSource} 4 | \alias{WebSource} 5 | \title{Read Web Content and respective Link Content from feedurls.} 6 | \usage{ 7 | WebSource(feedurls, class = "WebXMLSource", reader, parser, 8 | encoding = "UTF-8", curlOpts = curlOptions(followlocation = TRUE, 9 | maxconnects = 5, maxredirs = 20, timeout = 30, connecttimeout = 30, 10 | ssl.verifyhost = FALSE, ssl.verifypeer = FALSE), postFUN = NULL, 11 | retrieveFeedURL = TRUE, ...) 
12 | } 13 | \arguments{ 14 | \item{feedurls}{urls from feeds to be retrieved} 15 | 16 | \item{class}{class label to be assigned to \code{Source} object, defaults to "WebXMLSource"} 17 | 18 | \item{reader}{function to be used to read content, see also \code{\link{readWeb}}} 19 | 20 | \item{parser}{function to be used to split feed content into chunks, returns list of content elements} 21 | 22 | \item{encoding}{specifies default encoding, defaults to 'UTF-8'} 23 | 24 | \item{curlOpts}{a named list or CURLOptions object identifying the curl options for the handle. Type \code{listCurlOptions()} for all Curl options available.} 25 | 26 | \item{postFUN}{function saved in WebSource object and called to retrieve full text content from feed urls} 27 | 28 | \item{retrieveFeedURL}{logical; Specify if feedurls should be downloaded first.} 29 | 30 | \item{...}{additional parameters passed to \code{WebSource} object/structure} 31 | } 32 | \value{ 33 | WebSource 34 | } 35 | \description{ 36 | WebSource is derived from \code{\link[tm]{Source}}. In addition to calling the 37 | base \code{\link[tm]{Source}} constructor function it also retrieves the specified 38 | feedurls and pre--parses the content with the parser function. 39 | The fields \code{$Content}, \code{$Feedurls} \code{$Parser} and \code{$CurlOpts} are finally 40 | added to the \code{Source} object. 41 | } 42 | \author{ 43 | Mario Annau 44 | } 45 | 46 | -------------------------------------------------------------------------------- /tests/testthat/test-source-yahoonews.R: -------------------------------------------------------------------------------- 1 | context("YahooNewsSource") 2 | 3 | test_that("YahooNewsSource",{ 4 | 5 | lengthcorp <- 10 6 | 7 | testcorp <- WebCorpus(YahooNewsSource("Microsoft")) 8 | # Check Corpus object 9 | expect_that(length(testcorp), equals(lengthcorp)) 10 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 11 | 12 | # Check Content 13 | #FIXME: No content is retrieved 14 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 15 | contentlength <- sapply(testcorp, function(x) 16 | if( length(content(x)) < 1) 0 else nchar(content(x))) 17 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 18 | expect_that(contentratio > 0.5, is_true()) 19 | 20 | # Check Meta Data 21 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 22 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 23 | 24 | description <- lapply(testcorp, function(x) meta(x, "description")) 25 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 26 | 27 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 28 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 29 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 30 | 31 | id <- lapply(testcorp, function(x) meta(x, "id")) 32 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 33 | expect_that(all(sapply(id, nchar) > 0), is_true()) 34 | 35 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 36 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 37 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 38 | 39 | testcorp <- testcorp[1:10] 40 | testcorp <- corpus.update(testcorp) 41 | expect_that(length(testcorp) >= lengthcorp, is_true()) 42 | 43 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 44 | }) 45 | 46 | 
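The source-specific tests above (and those that follow) all share the same retrieve, inspect and update pattern. Outside the test harness, that workflow looks roughly as follows; this is a sketch only, assuming network access and a currently reachable feed:

```r
library(tm)
library(tm.plugin.webmining)

# Retrieve a corpus, check how many documents carry extracted main content,
# then pull in feed items published after the initial retrieval.
corpus <- WebCorpus(YahooFinanceSource("MSFT"))

contentlength <- sapply(corpus, function(d) sum(nchar(content(d))))
contentratio  <- mean(contentlength > 0)   # share of non-empty documents

meta(corpus[[1]], "heading")               # per-document meta data
corpus <- corpus.update(corpus)            # same call the tests exercise
```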
-------------------------------------------------------------------------------- /tests/testthat/test-source-googlenews.R: -------------------------------------------------------------------------------- 1 | context("GoogleNewsSource") 2 | 3 | test_that("GoogleNewsSource",{ 4 | 5 | lengthcorp <- 30 6 | query <- "Microsoft" 7 | 8 | testcorp <- WebCorpus(GoogleNewsSource(query, 9 | params = list(hl = "en", q = query, ie = "utf-8", 10 | num = lengthcorp, output = "rss"))) 11 | # Check Corpus object 12 | expect_that(length(testcorp), equals(lengthcorp)) 13 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 14 | 15 | # Check Content 16 | contentlength <- sapply(testcorp, function(x) 17 | if( length(content(x)) < 1) 0 else nchar(content(x))) 18 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 19 | expect_that(contentratio > 0.5, is_true()) 20 | 21 | # Check Meta Data 22 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 23 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 24 | 25 | description <- lapply(testcorp, function(x) meta(x, "description")) 26 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 27 | 28 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 29 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 30 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 31 | 32 | id <- lapply(testcorp, function(x) meta(x, "id")) 33 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 34 | expect_that(all(sapply(id, nchar) > 0), is_true()) 35 | 36 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 37 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 38 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 39 | 40 | testcorp <- testcorp[1:10] 41 | testcorp <- corpus.update(testcorp) 42 | expect_that(length(testcorp) >= lengthcorp, is_true()) 43 | 44 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 45 | }) 46 | 47 | -------------------------------------------------------------------------------- /tests/testthat/test-source-yahoofinance.R: -------------------------------------------------------------------------------- 1 | context("YahooFinanceSource") 2 | 3 | test_that("YahooFinanceSource",{ 4 | 5 | lengthcorp <- 20 6 | 7 | testcorp <- WebCorpus(YahooFinanceSource("MSFT")) 8 | # Check Corpus object 9 | #FIXME: Content in Yahoo Finance is not retrieved 10 | expect_that(length(testcorp), equals(lengthcorp)) 11 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 12 | 13 | 14 | 15 | # Check Content 16 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 17 | contentlength <- sapply(testcorp, function(x) 18 | if( length(content(x)) < 1) 0 else nchar(content(x))) 19 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 20 | expect_that(contentratio > 0.5, is_true()) 21 | 22 | # Check Meta Data 23 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 24 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 25 | 26 | description <- lapply(testcorp, function(x) meta(x, "description")) 27 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 28 | 29 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 30 | expect_that(all(sapply(heading, function(x) class(x)[1] == 
"character")), is_true()) 31 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 32 | 33 | id <- lapply(testcorp, function(x) meta(x, "id")) 34 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 35 | expect_that(all(sapply(id, nchar) > 0), is_true()) 36 | 37 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 38 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 39 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 40 | 41 | testcorp <- testcorp[1:10] 42 | testcorp <- corpus.update(testcorp) 43 | expect_that(length(testcorp) >= lengthcorp, is_true()) 44 | 45 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 46 | }) 47 | 48 | -------------------------------------------------------------------------------- /man/tm.plugin.webmining-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/tm.plugin.webmining-package.R 3 | \docType{package} 4 | \name{tm.plugin.webmining-package} 5 | \alias{tm.plugin.webmining} 6 | \alias{tm.plugin.webmining-package} 7 | \alias{webmining} 8 | \title{Retrieve structured, textual data from various web sources} 9 | \description{ 10 | tm.plugin.webmining facilitates the retrieval of textual data through various 11 | web feed formats like XML and JSON. Also direct retrieval from HTML 12 | is supported. As most (news) feeds only incorporate small fractions 13 | of the original text tm.plugin.webmining goes a step further and even 14 | retrieves and extracts the text of the original text source. 15 | Generally, the retrieval procedure can be described as a two--step process: 16 | \describe{ 17 | \item{Meta Retrieval}{In a first step, all relevant meta feeds are retrieved. 18 | From these feeds all relevant meta data items are extracted. 19 | } 20 | \item{Content Retrieval}{In a second step the relevant source content is retrieved. 21 | Using the \code{boilerpipeR} package even the main content of \code{HTML} pages can 22 | be extracted. 
23 | }} 24 | } 25 | \examples{ 26 | \dontrun{ 27 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 28 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft")) 29 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid)) 30 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews")) 31 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT")) 32 | yahooinplay <- WebCorpus(YahooInplaySource()) 33 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 34 | liberation <- WebCorpus(LiberationSource("latest")) 35 | } 36 | } 37 | \author{ 38 | Mario Annau \email{mario.annau@gmail} 39 | } 40 | \seealso{ 41 | \code{\link{WebCorpus}} \code{\link{GoogleFinanceSource}} \code{\link{GoogleNewsSource}} \code{\link{NYTimesSource}} \code{\link{ReutersNewsSource}} \code{\link{YahooFinanceSource}} \code{\link{YahooInplaySource}} \code{\link{YahooNewsSource}} 42 | } 43 | \keyword{package} 44 | 45 | -------------------------------------------------------------------------------- /man/NYTimesSource.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/source.R 3 | \name{NYTimesSource} 4 | \alias{NYTimesSource} 5 | \alias{readNYTimes} 6 | \title{Get feed data from NYTimes Article Search (\url{http://developer.nytimes.com/docs/read/article_search_api_v2}).} 7 | \usage{ 8 | NYTimesSource(query, n = 100, appid, sleep = 1, params = list(format = 9 | "json", q = query, page = 0:(ceiling(n/10) - 1), `api-key` = appid), 10 | curlOpts = curlOptions(followlocation = TRUE, maxconnects = 10, maxredirs = 11 | 10, timeout = 30, connecttimeout = 30), ...) 12 | } 13 | \arguments{ 14 | \item{query}{character specifying query to be used to search NYTimes articles} 15 | 16 | \item{n}{number of items, defaults to 100} 17 | 18 | \item{appid}{Developer App id to be used, obtained from \url{http://developer.nytimes.com/}} 19 | 20 | \item{sleep}{integer; Seconds to sleep between feed retrieval.} 21 | 22 | \item{params}{additional query parameters, specified as list, see \url{http://developer.nytimes.com/docs/read/article_search_api}} 23 | 24 | \item{curlOpts}{CURLOptions; RCurl options used for feed retrieval.} 25 | 26 | \item{...}{additional parameters to \code{\link{WebSource}}} 27 | } 28 | \description{ 29 | Excerpt from the website: "With the NYTimes Article Search API, you can search New York Times articles 30 | from 1981 to today, retrieving headlines, abstracts, lead paragraphs, links to associated multimedia 31 | and other article metadata. Along with standard keyword searching, the API also offers faceted searching. 32 | The available facets include Times-specific fields such as sections, taxonomic classifiers and controlled 33 | vocabulary terms (names of people, organizations and geographic locations)." 34 | Feed retrieval is limited to 1000 items (or 100 pages). 
35 | } 36 | \examples{ 37 | \dontrun{ 38 | #nytimes_appid needs to be specified 39 | corpus <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid)) 40 | } 41 | } 42 | \author{ 43 | Mario Annau 44 | } 45 | \seealso{ 46 | \code{\link{WebSource}}, \code{\link{readNYTimes}} 47 | } 48 | 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2 (4.1.1): do not edit by hand 2 | 3 | S3method("[",WebCorpus) 4 | S3method(corpus.update,WebCorpus) 5 | S3method(encloseHTML,PlainTextDocument) 6 | S3method(extract,PlainTextDocument) 7 | S3method(getElem,WebJSONSource) 8 | S3method(getElem,WebXMLSource) 9 | S3method(getEmpty,WebCorpus) 10 | S3method(removeNonASCII,PlainTextDocument) 11 | S3method(source.update,WebXMLSource) 12 | export(GoogleFinanceSource) 13 | export(GoogleNewsSource) 14 | export(LiberationSource) 15 | export(NYTimesSource) 16 | export(ReutersNewsSource) 17 | export(WebCorpus) 18 | export(WebSource) 19 | export(YahooFinanceSource) 20 | export(YahooInplaySource) 21 | export(YahooNewsSource) 22 | export(corpus.update) 23 | export(encloseHTML) 24 | export(extract) 25 | export(extractContentDOM) 26 | export(extractHTMLStrip) 27 | export(feedquery) 28 | export(getEmpty) 29 | export(getLinkContent) 30 | export(json_content) 31 | export(parse) 32 | export(readGoogle) 33 | export(readLiberationSource) 34 | export(readNYTimes) 35 | export(readReutersNews) 36 | export(readWeb) 37 | export(readWebHTML) 38 | export(readWebJSON) 39 | export(readWebXML) 40 | export(readYahoo) 41 | export(readYahooHTML) 42 | export(readYahooInplay) 43 | export(removeNonASCII) 44 | export(removeTags) 45 | export(source.update) 46 | export(trimWhiteSpaces) 47 | importFrom(NLP,"content<-") 48 | importFrom(NLP,"meta<-") 49 | importFrom(NLP,content) 50 | importFrom(NLP,meta) 51 | importFrom(RCurl,curlEscape) 52 | importFrom(RCurl,curlOptions) 53 | importFrom(RCurl,getURL) 54 | importFrom(RJSONIO,fromJSON) 55 | importFrom(XML,addAttributes) 56 | importFrom(XML,free) 57 | importFrom(XML,getNodeSet) 58 | importFrom(XML,htmlTreeParse) 59 | importFrom(XML,newXMLNamespace) 60 | importFrom(XML,removeNodes) 61 | importFrom(XML,saveXML) 62 | importFrom(XML,toString.XMLNode) 63 | importFrom(XML,xmlApply) 64 | importFrom(XML,xmlChildren) 65 | importFrom(XML,xmlInternalTreeParse) 66 | importFrom(XML,xmlValue) 67 | importFrom(XML,xpathSApply) 68 | importFrom(boilerpipeR,ArticleExtractor) 69 | importFrom(tm,Corpus) 70 | importFrom(tm,FunctionGenerator) 71 | importFrom(tm,PlainTextDocument) 72 | importFrom(tm,SimpleSource) 73 | importFrom(tm,eoi) 74 | importFrom(tm,getElem) 75 | importFrom(tm,reader) 76 | importFrom(tm,stepNext) 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tm.plugin.webmining 2 | [![Build Status](https://travis-ci.org/mannau/tm.plugin.webmining.svg?branch=master)](https://travis-ci.org/mannau/tm.plugin.webmining) [![codecov.io](http://codecov.io/github/mannau/tm.plugin.webmining/coverage.svg?branch=master)](http://codecov.io/github/mannau/tm.plugin.webmining?branch=master) [![License](http://img.shields.io/badge/license-GPL%20%28%3E=%203%29-blue.svg?style=flat)](http://www.gnu.org/licenses/gpl-3.0.html) 3 | 4 | **tm.plugin.webmining** is an R-package which facilitates text retrieval from feed formats like XML (RSS, ATOM) and JSON. 
Also direct retrieval from HTML is supported. As most (news) feeds incorporate only small fractions of the original text, **tm.plugin.webmining** also retrieves and extracts the text from the original source. 5 | 6 | ## Install 7 | To install the [latest version from CRAN](http://cran.r-project.org/web/packages/tm.plugin.webmining/index.html) simply run 8 | ```r 9 | install.packages("tm.plugin.webmining") 10 | ``` 11 | 12 | Using the **devtools** package you can easily install the latest development version of **tm.plugin.webmining** from GitHub with 13 | 14 | ```r 15 | library(devtools) 16 | install_github("mannau/tm.plugin.webmining") 17 | ``` 18 | 19 | Windows users need to use the following command to install the **boilerpipeR** dependency from GitHub: 20 | 21 | ```r 22 | library(devtools) 23 | install_github("mannau/boilerpipeR", args = "--no-multiarch") 24 | ``` 25 | 26 | ## Usage 27 | The next snippet shows how to download and extract the main text from all supported sources as WebCorpus objects, including a rich set of metadata like *Author*, *DateTimeStamp* or *Source*: 28 | 29 | ```r 30 | library(tm.plugin.webmining) 31 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 32 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft")) 33 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = "")) 34 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews")) 35 | #twitter <- WebCorpus(TwitterSource("Microsoft")) -> not supported yet 36 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT")) 37 | yahooinplay <- WebCorpus(YahooInplaySource()) 38 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 39 | liberation <- WebCorpus(LiberationSource("latest")) 40 | ``` 41 | 42 | ## License 43 | **tm.plugin.webmining** is released under the [GNU General Public License Version 3](http://www.gnu.org/copyleft/gpl.html). 44 | -------------------------------------------------------------------------------- /tests/testthat/test-source-nytimes.R: -------------------------------------------------------------------------------- 1 | context("NYTimesSource") 2 | 3 | data(nytimes_appid) 4 | 5 | test_that("NYTimesSource",{ 6 | 7 | lengthcorp <- 200 8 | 9 | if(!exists(as.character(substitute(nytimes_appid)))){ 10 | cat("No Variable nytimes_appid provided. 
Skipping Test...\n") 11 | return() 12 | } 13 | 14 | testcorp <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid, n = lengthcorp)) 15 | # Check Corpus object 16 | expect_that(length(testcorp), equals(lengthcorp)) 17 | expect_that(class(testcorp), equals(c("WebCorpus","VCorpus","Corpus"))) 18 | 19 | # Check Content 20 | #expect_that(all(sapply(testcorp, nchar) > 0), is_true()) 21 | contentlength <- sapply(testcorp, function(x) 22 | if( length(content(x)) < 1) 0 else nchar(content(x))) 23 | contentratio <- length(which(contentlength > 0)) / length(testcorp) 24 | expect_that(contentratio > 0.5, is_true()) 25 | 26 | # Check Meta Data 27 | datetimestamp <- lapply(testcorp, function(x) meta(x, "datetimestamp")) 28 | expect_that(all(sapply(datetimestamp, function(x) class(x)[1] == "POSIXlt")), is_true()) 29 | 30 | description <- lapply(testcorp, function(x) meta(x, "description")) 31 | expect_that(all(sapply(description, function(x) class(x)[1] == "character")), is_true()) 32 | expect_that(all(sapply(description, nchar) > 0), is_true()) 33 | 34 | heading <- lapply(testcorp, function(x) meta(x, "heading")) 35 | expect_that(all(sapply(heading, function(x) class(x)[1] == "character")), is_true()) 36 | expect_that(all(sapply(heading, nchar) > 0), is_true()) 37 | 38 | id <- lapply(testcorp, function(x) meta(x, "id")) 39 | expect_that(all(sapply(id, function(x) class(x)[1] == "character")), is_true()) 40 | expect_that(all(sapply(id, nchar) > 0), is_true()) 41 | 42 | language <- lapply(testcorp, function(x) meta(x, "language")) 43 | expect_that(all(sapply(language, function(x) class(x)[1] == "character")), is_true()) 44 | expect_that(all(sapply(language, nchar) > 0), is_true()) 45 | 46 | origin <- lapply(testcorp, function(x) meta(x, "origin")) 47 | expect_that(all(sapply(origin, function(x) class(x)[1] == "character")), is_true()) 48 | expect_that(all(sapply(origin, nchar) > 0), is_true()) 49 | 50 | testcorp <- testcorp[1:10] 51 | testcorp <- corpus.update(testcorp) 52 | expect_that(length(testcorp) >= lengthcorp, is_true()) 53 | 54 | cat(" | Contentratio: ", sprintf("%.0f%%", contentratio * 100)) 55 | }) 56 | 57 | -------------------------------------------------------------------------------- /man/getLinkContent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2 (4.1.1): do not edit by hand 2 | % Please edit documentation in R/getLinkContent.R 3 | \name{getLinkContent} 4 | \alias{getLinkContent} 5 | \title{Get main content for corpus items, specified by links.} 6 | \usage{ 7 | getLinkContent(corpus, links = sapply(corpus, meta, "origin"), 8 | timeout.request = 30, chunksize = 20, verbose = getOption("verbose"), 9 | curlOpts = curlOptions(verbose = FALSE, followlocation = TRUE, maxconnects = 10 | 5, maxredirs = 20, timeout = timeout.request, connecttimeout = 11 | timeout.request, ssl.verifyhost = FALSE, ssl.verifypeer = FALSE, useragent = 12 | "R", cookiejar = tempfile()), retry.empty = 3, sleep.time = 3, 13 | extractor = ArticleExtractor, .encoding = integer(), ...) 
14 | } 15 | \arguments{ 16 | \item{corpus}{object of class \code{\link[tm]{Corpus}} for which link content should be downloaded} 17 | 18 | \item{links}{character vector specifyinig links to be used for download, defaults to 19 | sapply(corpus, meta, "Origin")} 20 | 21 | \item{timeout.request}{timeout (in seconds) to be used for connections/requests, defaults to 30} 22 | 23 | \item{chunksize}{Size of download chunks to be used for parallel retrieval, defaults to 20} 24 | 25 | \item{verbose}{Specifies if retrieval info should be printed, defaults to getOption("verbose")} 26 | 27 | \item{curlOpts}{curl options to be passed to \code{\link{getURL}}} 28 | 29 | \item{retry.empty}{Specifies number of times empty content sites should be retried, defaults to 3} 30 | 31 | \item{sleep.time}{Sleep time to be used between chunked download, defaults to 3 (seconds)} 32 | 33 | \item{extractor}{Extractor to be used for content extraction, defaults to extractContentDOM} 34 | 35 | \item{.encoding}{encoding to be used for \code{\link{getURL}}, defaults to integer() (=autodetect)} 36 | 37 | \item{...}{additional parameters to \code{\link{getURL}}} 38 | } 39 | \value{ 40 | corpus including downloaded link content 41 | } 42 | \description{ 43 | \code{getLinkContent} downloads and extracts content from weblinks for \code{\link[tm]{Corpus}} objects. 44 | Typically it is integrated and called as a post-processing function (field:\code{$postFUN}) for most \code{\link{WebSource}} 45 | objects. \code{getLinkContent} implements content download in chunks which has been proven to be a stabler approach for 46 | large content requests. 47 | } 48 | \seealso{ 49 | \code{\link{WebSource}} \code{\link[RCurl]{getURL}} \code{\link[boilerpipeR]{Extractor}} 50 | } 51 | 52 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | Rscript := $(shell whereis Rscript) --vanilla -e 2 | R := $(shell whereis R) 3 | 4 | PKG_VERSION := $(shell grep -i ^version DESCRIPTION | cut -d : -d \ -f 2) 5 | PKG_NAME := $(shell grep -i ^package DESCRIPTION | cut -d : -d \ -f 2) 6 | 7 | #DATA_FILES := $(wildcard data/*.rda) 8 | R_FILES := $(wildcard R/*.R) 9 | TEST_FILES := $(wildcard tests/*.R) $(wildcard tests/testthat/*.R) 10 | #ALL_SRC_FILES := $(wildcard src/*.cpp) $(wildcard src/*.h) src/Makevars 11 | #SRC_FILES := $(filter-out src/RcppExports.cpp, $(ALL_SRC_FILES)) 12 | #HEADER_FILES := $(wildcard src/*.h) 13 | #RCPPEXPORTS := src/RcppExports.cpp R/RcppExports.R 14 | ROXYGENFILES := $(wildcard man/*.Rd) NAMESPACE 15 | PKG_FILES := DESCRIPTION $(ROXYGENFILES) $(R_FILES) $(TEST_FILES) 16 | #OBJECTS := $(wildcard src/*.o) $(wildcard src/*.o-*) $(wildcard src/*.dll) $(wildcard src/*.so) $(wildcard src/*.rds) 17 | CHECKPATH := $(PKG_NAME).Rcheck 18 | CHECKLOG := `cat $(CHECKPATH)/00check.log` 19 | 20 | .PHONY: all build build-cran check check-cran manual install clean compileAttributes 21 | 22 | all: 23 | install 24 | 25 | build: $(PKG_NAME)_$(PKG_VERSION).tar.gz 26 | 27 | build-cran: 28 | @make clean 29 | @make roxygen 30 | @cp tests/testthat.R tests/testthat.R.temp 31 | @cp tests/testthat.R.cran tests/testthat.R 32 | $(R) CMD build --resave-data . 33 | @cp tests/testthat.R.temp tests/testthat.R 34 | @rm tests/testthat.R.temp 35 | 36 | $(PKG_NAME)_$(PKG_VERSION).tar.gz: $(PKG_FILES) 37 | @make roxygen 38 | $(R) CMD build --resave-data --no-build-vignettes . 
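# Illustrative usage of the targets above (a sketch assuming GNU make and a
# working R toolchain on the PATH; adjust to your environment):
#   make build       # roxygenize and build the source tarball
#   make check       # run R CMD check without manual and multiarch
#   make check-cran  # swap in tests/testthat.R.cran and check --as-cran
#   make install     # build and install the package locally
#   make manual      # render the PDF reference manual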
39 | 40 | roxygen: $(R_FILES) 41 | $(Rscript) 'library(roxygen2); roxygenize()' 42 | 43 | check: $(PKG_NAME)_$(PKG_VERSION).tar.gz 44 | @rm -rf $(CHECKPATH) 45 | $(R) CMD check --no-multiarch --no-manual --no-clean $(PKG_NAME)_$(PKG_VERSION).tar.gz 46 | 47 | check-cran: 48 | @make build-cran 49 | @rm -rf $(CHECKPATH) 50 | $(R) CMD check --as-cran --no-clean $(PKG_NAME)_$(PKG_VERSION).tar.gz 51 | 52 | 00check.log: check 53 | @mv $(CHECKPATH)\\00check.log . 54 | @rm -rf $(CHECKPATH) 55 | 56 | manual: $(PKG_NAME)-manual.pdf 57 | 58 | $(PKG_NAME)-manual.pdf: $(ROXYGENFILES) 59 | $(R) CMD Rd2pdf --no-preview -o $(PKG_NAME)-manual.pdf . 60 | 61 | install: $(PKG_NAME)_$(PKG_VERSION).tar.gz 62 | $(R) CMD INSTALL --no-multiarch --byte-compile $(PKG_NAME)_$(PKG_VERSION).tar.gz 63 | 64 | clean: 65 | @rm -rf $(wildcard *.Rcheck) 66 | @rm -f $(wildcard *.tar.gz) 67 | @echo '*** PACKAGE CLEANUP COMPLETE ***' 68 | -------------------------------------------------------------------------------- /R/tm.plugin.webmining-package.R: -------------------------------------------------------------------------------- 1 | #' tm.plugin.webmining facilitates the retrieval of textual data through various 2 | #' web feed formats like XML and JSON. Also direct retrieval from HTML 3 | #' is supported. As most (news) feeds only incorporate small fractions 4 | #' of the original text tm.plugin.webmining goes a step further and even 5 | #' retrieves and extracts the text of the original text source. 6 | #' Generally, the retrieval procedure can be described as a two--step process: 7 | #' \describe{ 8 | #' \item{Meta Retrieval}{In a first step, all relevant meta feeds are retrieved. 9 | #' From these feeds all relevant meta data items are extracted. 10 | #' } 11 | #' \item{Content Retrieval}{In a second step the relevant source content is retrieved. 12 | #' Using the \code{boilerpipeR} package even the main content of \code{HTML} pages can 13 | #' be extracted. 14 | #' }} 15 | #' 16 | #' @name tm.plugin.webmining-package 17 | #' @aliases tm.plugin.webmining webmining 18 | #' @docType package 19 | #' @title Retrieve structured, textual data from various web sources 20 | #' @author Mario Annau \email{mario.annau@@gmail} 21 | #' @keywords package 22 | #' @seealso \code{\link{WebCorpus}} \code{\link{GoogleFinanceSource}} \code{\link{GoogleNewsSource}} \code{\link{NYTimesSource}} \code{\link{ReutersNewsSource}} \code{\link{YahooFinanceSource}} \code{\link{YahooInplaySource}} \code{\link{YahooNewsSource}} 23 | #' @examples 24 | #' \dontrun{ 25 | #' googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 26 | #' googlenews <- WebCorpus(GoogleNewsSource("Microsoft")) 27 | #' nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid)) 28 | #' reutersnews <- WebCorpus(ReutersNewsSource("businessNews")) 29 | #' yahoofinance <- WebCorpus(YahooFinanceSource("MSFT")) 30 | #' yahooinplay <- WebCorpus(YahooInplaySource()) 31 | #' yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 32 | #' liberation <- WebCorpus(LiberationSource("latest")) 33 | #' } 34 | NULL 35 | 36 | #' WebCorpus retrieved from Yahoo! News for the search term "Microsoft" 37 | #' through the YahooNewsSource. Length of retrieved corpus is 20. 38 | #' @name yahoonews 39 | #' @docType data 40 | #' @author Mario Annau 41 | #' @keywords data 42 | #' @examples 43 | #' #Data set has been generated as follows: 44 | #' \dontrun{ 45 | #' yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 46 | #' } 47 | NULL 48 | 49 | #' AppID for the NYtimes-API. 
50 | #' 51 | #' USED ONLY FOR PACKAGE TESTING. PLEASE DOWNLOAD YOUR OWN KEY AT \url{http://developer.nytimes.com/}!!! 52 | #' @name nytimes_appid 53 | #' @docType data 54 | #' @author Mario Annau 55 | #' @keywords data 56 | NULL 57 | -------------------------------------------------------------------------------- /R/getLinkContent.R: -------------------------------------------------------------------------------- 1 | #' @title Get main content for corpus items, specified by links. 2 | #' @description \code{getLinkContent} downloads and extracts content from weblinks for \code{\link[tm]{Corpus}} objects. 3 | #' Typically it is integrated and called as a post-processing function (field:\code{$postFUN}) for most \code{\link{WebSource}} 4 | #' objects. \code{getLinkContent} implements content download in chunks which has been proven to be a stabler approach for 5 | #' large content requests. 6 | #' @param corpus object of class \code{\link[tm]{Corpus}} for which link content should be downloaded 7 | #' @param links character vector specifyinig links to be used for download, defaults to 8 | #' sapply(corpus, meta, "Origin") 9 | #' @param timeout.request timeout (in seconds) to be used for connections/requests, defaults to 30 10 | #' @param curlOpts curl options to be passed to \code{\link{getURL}} 11 | #' @param chunksize Size of download chunks to be used for parallel retrieval, defaults to 20 12 | #' @param verbose Specifies if retrieval info should be printed, defaults to getOption("verbose") 13 | #' @param retry.empty Specifies number of times empty content sites should be retried, defaults to 3 14 | #' @param sleep.time Sleep time to be used between chunked download, defaults to 3 (seconds) 15 | #' @param extractor Extractor to be used for content extraction, defaults to extractContentDOM 16 | #' @param ... 
additional parameters to \code{\link{getURL}} 17 | #' @param .encoding encoding to be used for \code{\link{getURL}}, defaults to integer() (=autodetect) 18 | #' @return corpus including downloaded link content 19 | #' @seealso \code{\link{WebSource}} \code{\link[RCurl]{getURL}} \code{\link[boilerpipeR]{Extractor}} 20 | #' @importFrom NLP content 21 | #' @importFrom RCurl getURL 22 | #' @export 23 | getLinkContent <- function(corpus, links = sapply(corpus, meta, "origin"), 24 | timeout.request = 30, chunksize = 20, verbose = getOption("verbose"), 25 | curlOpts = curlOptions(verbose = FALSE, 26 | followlocation = TRUE, 27 | maxconnects = 5, 28 | maxredirs = 20, 29 | timeout = timeout.request, 30 | connecttimeout = timeout.request, 31 | ssl.verifyhost=FALSE, 32 | ssl.verifypeer = FALSE, 33 | useragent = "R", 34 | cookiejar = tempfile()), 35 | retry.empty = 3, 36 | sleep.time = 3, 37 | extractor = ArticleExtractor, 38 | .encoding = integer(), 39 | ...){ 40 | 41 | if(length(corpus) != length(links)) 42 | stop("Corpus length not equal to links length\n") 43 | 44 | #content_urls <- unlist(sapply(content_parsed, linkreader)) 45 | if(verbose){ 46 | cat("Starting URL Download ...\n") 47 | } 48 | retries <- 0 49 | while(any(empty <- sapply(corpus, function(x) identical(content(x), character(0)))) & (retries <= retry.empty)){ 50 | retries <- retries + 1 51 | emptycontent.ids <- which(empty) 52 | 53 | if(verbose){ 54 | cat("Run ", retries, ", retrieving ", length(emptycontent.ids), " content items\n") 55 | } 56 | 57 | #for(cstart in seq(from = 1, to = length(links), by = chunksize)){ 58 | for(cstart in seq(from = 1, to = length(emptycontent.ids), by = chunksize)){ 59 | if(sleep.time > 0){ 60 | if(verbose){ 61 | cat("Sleeping ", sleep.time, " seconds...\n") 62 | } 63 | Sys.sleep(sleep.time) 64 | } 65 | 66 | cend <- min(cstart[1] + chunksize-1, length(emptycontent.ids)) 67 | chunk.ids <- emptycontent.ids[cstart:cend] 68 | chunk <- links[chunk.ids] 69 | 70 | # TODO Enable chunk download 71 | content <- tryCatch({ 72 | getURL(chunk, .opts = curlOpts, .encoding = .encoding, ...) 73 | }, 74 | error=function(e){ 75 | print(e) 76 | # TODO: Check if single retrieval part is really necessary 77 | cat("\nError on retrieval, single retrieval fallback... \n") 78 | content <- list() 79 | for(i in 1:length(chunk)){ 80 | content[[i]] <- tryCatch({ 81 | getURL(chunk[i], .opts = curlOpts, .encoding = .encoding, ...) 82 | },error = function(f) { 83 | print(f) 84 | ""}) 85 | } 86 | #cat("Done\n") 87 | do.call(c, content)}) 88 | 89 | 90 | # Extract Content 91 | extract <- sapply(content, extractor) 92 | 93 | # Put Content Into Corpus 94 | for(i in 1:length(chunk.ids)){ 95 | cid <- chunk.ids[i] 96 | content(corpus[[cid]]) <- extract[i] 97 | 98 | } 99 | if(verbose){ 100 | progress <- floor(cend/length(links)*100) 101 | cat(paste(progress, "% (",cend,"/",length(emptycontent.ids), ") ", Sys.time(), "\n",sep = "")) 102 | } 103 | } 104 | } 105 | corpus 106 | } -------------------------------------------------------------------------------- /R/corpus.R: -------------------------------------------------------------------------------- 1 | #' @title WebCorpus constructor function. 2 | #' @description \code{WebCorpus} adds further methods and meta data to \code{\link[tm]{Corpus}} and therefore 3 | #' constructs a derived class of \code{\link[tm]{Corpus}}. Most importantly, \code{WebCorpus} 4 | #' calls \code{$PostFUN} on the generated \code{WebCorpus}, which retrieves the main content 5 | #' for most implemented \code{WebSource}s. 
Thus it enables an efficient retrieval of new feed items 6 | #' (\code{\link{corpus.update}}). All additional WebCorpus fields are added to \code{tm$meta} 7 | #' like \code{$source}, \code{$readerControl} and \code{$postFUN}. 8 | #' @param x object of type Source, see also \code{\link{Corpus}} 9 | #' @param readerControl specifies reader to be used for \code{Source}, defaults to 10 | #' list(reader = x$DefaultReader, language = "en" 11 | #' @param postFUN function to be applied to WebCorpus after web retrieval has been completed, 12 | #' defaults to x$PostFUN 13 | #' @param retryEmpty specifies if retrieval for empty content elements should be repeated, 14 | #' defaults to TRUE 15 | #' @param ... additional parameters for Corpus function (actually Corpus reader) 16 | #' @importFrom tm Corpus reader getElem stepNext eoi SimpleSource 17 | #' @export 18 | WebCorpus <- function(x, readerControl = list(reader = reader(x), language = "en"), 19 | postFUN = x$postFUN, retryEmpty = TRUE, ...) 20 | { 21 | stopifnot(inherits(x, "WebSource")) 22 | 23 | readerControl <- prepareReader(readerControl, reader(x)) 24 | 25 | if (is.function(readerControl$init)) 26 | readerControl$init() 27 | 28 | if (is.function(readerControl$exit)) 29 | on.exit(readerControl$exit()) 30 | 31 | tdl <- vector("list", length(x)) 32 | counter <- 1 33 | while (!eoi(x)) { 34 | x <- stepNext(x) 35 | elem <- getElem(x) 36 | doc <- readerControl$reader(elem, 37 | readerControl$language, 38 | as.character(counter)) 39 | tdl[[counter]] <- doc 40 | counter <- counter + 1 41 | } 42 | 43 | corpus <- structure(list(content = tdl, 44 | meta = CorpusMeta(source = x, readerControl = readerControl, postFUN = postFUN), 45 | dmeta = data.frame(row.names = seq_along(tdl))), 46 | class = c("WebCorpus", "VCorpus", "Corpus")) 47 | if(retryEmpty){ 48 | corpus <- getEmpty(corpus) 49 | } 50 | corpus 51 | } 52 | 53 | # TODO: Tell Ingo to export CorpusMeta 54 | CorpusMeta <- 55 | function(..., meta = NULL) 56 | { 57 | if (is.null(meta)) 58 | meta <- list(...) 59 | 60 | stopifnot(is.list(meta)) 61 | 62 | structure(meta, class = "CorpusMeta") 63 | } 64 | 65 | # TODO: Tell Ingo to export prepareReader 66 | prepareReader <- 67 | function(readerControl, reader = NULL, ...) 68 | { 69 | if (is.null(readerControl$reader)) 70 | readerControl$reader <- reader 71 | if (inherits(readerControl$reader, "FunctionGenerator")) 72 | readerControl$reader <- readerControl$reader(...) 73 | if (is.null(readerControl$language)) 74 | readerControl$language <- "en" 75 | readerControl 76 | } 77 | 78 | 79 | #' @noRd 80 | #' @export 81 | `[.WebCorpus` <- function(x, i) { 82 | if (missing(i)) return(x) 83 | corpus <- NextMethod("[") 84 | class(corpus) <- c("WebCorpus", class(corpus)) 85 | corpus 86 | } 87 | 88 | #' @title Update/Extend \code{\link{WebCorpus}} with new feed items. 89 | #' @description The \code{corpus.update} method ensures, that the original 90 | #' \code{\link{WebCorpus}} feed sources are downloaded and checked against 91 | #' already included \code{TextDocument}s. Based on the \code{ID} included 92 | #' in the \code{TextDocument}'s meta data, only new feed elements are 93 | #' downloaded and added to the \code{\link{WebCorpus}}. 94 | #' All relevant information regariding the original source feeds are stored 95 | #' in the \code{\link{WebCorpus}}' meta data (\code{\link[tm]{meta}}). 96 | #' @param x object of type \code{\link{WebCorpus}} 97 | #' @param ... 
98 | #' \describe{ 99 | #' \item{fieldname}{name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID"} 100 | #' \item{retryempty}{specifies if empty corpus elements should be downloaded again, defaults to TRUE} 101 | #' \item{...}{additional parameters to \code{\link{Corpus}} function} 102 | #' } 103 | #' @export corpus.update 104 | #' @aliases corpus.update.WebCorpus 105 | corpus.update <- function(x, ...){ 106 | UseMethod("corpus.update", x) 107 | } 108 | 109 | #' Update/Extend \code{\link{WebCorpus}} with new feed items. 110 | #' @param x \code{\link{WebCorpus}} 111 | #' @param fieldname name of \code{\link{Corpus}} field name to be used as ID, defaults to "ID" 112 | #' @param retryempty specifies if empty corpus elements should be downloaded again, defaults to TRUE 113 | #' @param ... additional parameters to \code{\link{Corpus}} function 114 | #' @importFrom tm Corpus 115 | #' @importFrom NLP meta 116 | #' @noRd 117 | #' @export 118 | corpus.update.WebCorpus <- 119 | function(x, fieldname = "id", retryempty = TRUE, verbose = FALSE, ...) { 120 | cm <- x$meta 121 | 122 | newsource <- source.update(cm$source) 123 | 124 | #WebCorpus 125 | newcorpus <- WebCorpus(newsource, readerControl = cm$MetaData$ReaderControl, 126 | retryEmpty = FALSE, ...) 127 | #intersect on ID 128 | id_old <- sapply(x, meta, fieldname) 129 | if(any(sapply(id_old, length) == 0)) 130 | stop(paste("Not all elements in corpus to update have field '", fieldname, "' defined", sep = "")) 131 | 132 | id_new <- sapply(newcorpus, meta, fieldname) 133 | if(any(sapply(id_new, length) == 0)) 134 | stop(paste("Not all elements in corpus to update have field '", fieldname, "' defined", sep = "")) 135 | 136 | newcorpus <- newcorpus[!id_new %in% id_old] 137 | 138 | if(length(newcorpus) > 0){ 139 | if(!is.null(cm$postFUN)){ 140 | newcorpus <- cm$postFUN(newcorpus) 141 | } 142 | corpus <- c(x, newcorpus) 143 | #attr(corpus, "CMetaData") <- CMetaData(x) 144 | class(corpus) <- c("WebCorpus", class(corpus)) 145 | }else{ 146 | corpus <- x 147 | } 148 | 149 | if(retryempty){ 150 | corpus <- getEmpty(corpus) 151 | } 152 | 153 | if(verbose){ 154 | cat(length(newcorpus), " corpus items added.\n") 155 | } 156 | 157 | corpus 158 | } 159 | 160 | 161 | #' @title Retrieve Empty Corpus Elements through \code{$postFUN}. 162 | #' @description Retrieve content of all empty (textlength equals zero) corpus elements. If 163 | #' corpus element is empty, \code{$postFUN} is called (specified in \code{\link{meta}}) 164 | #' @param x object of type \code{\link{WebCorpus}} 165 | #' @param ... additional parameters to PostFUN 166 | #' @seealso \code{\link{WebCorpus}} 167 | #' @export getEmpty 168 | #' @aliases getEmpty.WebCorpus 169 | getEmpty <- function(x, ...){ 170 | UseMethod("getEmpty", x) 171 | } 172 | 173 | 174 | 175 | #' @importFrom NLP content 176 | #' @noRd 177 | #' @export 178 | getEmpty.WebCorpus <- 179 | function(x, nChar = 0, ...){ 180 | cm <- x$meta 181 | noContent <- which(sapply(x, function(y){ 182 | cy <- content(y) 183 | if(length(cy) == 0L) 0 184 | else nchar(content(y)) 185 | }) <= nChar) 186 | if(length(noContent) > 0){ 187 | corp_nocontent <- x[noContent] 188 | if(!is.null(cm$postFUN)){ 189 | corp_nocontent <- cm$postFUN(corp_nocontent, ...) 
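  # Illustrative usage of this retry mechanism (comments only; a sketch, not
  # part of the original code): items whose download failed can be fetched
  # again later via the exported generic, e.g.
  #   corp <- WebCorpus(YahooFinanceSource("MSFT"), retryEmpty = FALSE)
  #   corp <- getEmpty(corp)   # re-runs the stored postFUN on empty items only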
190 | } 191 | # TODO: stupid construct because direct assignment of corpus does not work 192 | for(i in 1:length(noContent)){ 193 | x[[noContent[i]]] <- corp_nocontent[[i]] 194 | } 195 | } 196 | x 197 | } 198 | 199 | 200 | -------------------------------------------------------------------------------- /vignettes/ShortIntro.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{article} 2 | \usepackage{Sweave} 3 | \usepackage[margin=2cm]{geometry} 4 | \usepackage[round]{natbib} 5 | \usepackage{url} 6 | \usepackage{hyperref} 7 | \usepackage{listings} 8 | 9 | \let\code=\texttt 10 | \newcommand{\acronym}[1]{\textsc{#1}} 11 | \newcommand{\class}[1]{\mbox{\textsf{#1}}} 12 | \newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}} 13 | \newcommand{\proglang}[1]{\textsf{#1}} 14 | \newcommand{\fkt}[1]{\code{#1()}} 15 | \newcommand{\todo}[1]{\begin{center}\code{}\end{center}} 16 | \newcommand{\field}[1]{\code{\$#1}} 17 | 18 | \sloppy 19 | %% \VignetteIndexEntry{Introduction to the tm.plugin.webmining Package} 20 | \SweaveOpts{prefix.string=webmining} 21 | \SweaveOpts{include=FALSE} 22 | 23 | 24 | \begin{document} 25 | 26 | <>= 27 | library(tm) 28 | library(tm.plugin.webmining) 29 | data(yahoonews) 30 | options(width = 60) 31 | @ 32 | 33 | \title{Short Introduction to \pkg{tm.plugin.webmining}} 34 | \author{Mario Annau\\ 35 | \texttt{mario.annau@gmail.com}} 36 | 37 | \maketitle 38 | 39 | \abstract{ 40 | This vignette gives a short introduction to \pkg{tm.plugin.webmining} which 41 | facilitates the retrieval of textual data from the web. The main focus of 42 | \pkg{tm.plugin.webmining} is the retrieval of web content from structured news 43 | feeds in the \proglang{XML} (\proglang{RSS}, \proglang{ATOM}) and 44 | \proglang{JSON} format. Additionally, retrieval and extraction of 45 | \proglang{HTML} documents is implemented. Numerous data sources are currently 46 | supported through public feeds/APIs, including Google-- and Yahoo! News, 47 | Reuters and the New York Times. 48 | } 49 | 50 | 51 | \section{Getting Started} 52 | After package installation we make the functionality of 53 | \pkg{tm.plugin.webmining} available through 54 | 55 | <>= 56 | library(tm) 57 | library(tm.plugin.webmining) 58 | @ 59 | 60 | \pkg{tm.plugin.webmining} depends on numerous packages, most 61 | importantly \pkg{tm} by \cite{hornik:Feinerer+Hornik+Meyer:2008} for text 62 | mining capabilities and data structures. 63 | \pkg{RCurl} functions are used for web data retrieval and \pkg{XML} for the 64 | extraction of \proglang{XML}/\proglang{HTML} based feeds. 65 | As a first experiment, we can retrieve a \class{(Web-)Corpus} using data from 66 | Yahoo! News and the search query \code{"Microsoft"}: 67 | 68 | <>= 69 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 70 | @ 71 | 72 | Users already familiar with \pkg{tm} 73 | will notice the different function call \fkt{WebCorpus} for corpus construction. Like 74 | \pkg{tm}'s \fkt{Corpus} constructor it takes a \class{(Web-)Source} object as 75 | input and constructs a \class{(Web-)Corpus} object. 76 | A Review of the object's \fkt{class} 77 | 78 | <>= 79 | class(yahoonews) 80 | @ 81 | 82 | reveals, that \class{WebCorpus} is directly derived from \class{Corpus} and adds 83 | further functionality to it. It can therefore be used like a "normal" 84 | \class{Corpus} using \pkg{tm}'s text mining capabilities. 
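As an illustrative sketch (not evaluated here), a typical follow--up analysis
with \pkg{tm} could therefore look as follows:

<<eval=FALSE>>=
yahoonews.clean <- tm_map(yahoonews, content_transformer(tolower))
tdm <- TermDocumentMatrix(yahoonews.clean,
    control = list(removePunctuation = TRUE, stopwords = TRUE))
findFreqTerms(tdm, lowfreq = 5)
@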
85 | 86 | <>= 87 | yahoonews 88 | @ 89 | 90 | Under the hood, a call of \fkt{YahooNewsSource} retrieves a data feed from 91 | Yahoo! News and pre--parses its contents. 92 | Subsequently, \fkt{WebCorpus} extracts (meta--)data from the \class{WebSource} 93 | object and also downloads and extracts the actual main content 94 | of the news item (most commonly an \proglang{HTML}--Webpage). 95 | In effect, it implements a two--step procedure to 96 | 97 | \begin{enumerate} 98 | \item Download meta data from the feed (through \class{WebSource}) 99 | \item Download and extract main content for the feed item (through 100 | \class{WebCorpus}) 101 | \end{enumerate} 102 | 103 | These procedures ensure that the resulting \class{WebCorpus} not only includes 104 | a rich set of meta data but also the full main text content for text mining 105 | purposes. An examination of the meta data for the first element in the corpus 106 | is shown below. 107 | 108 | <>= 109 | # Little hack to restrict output width 110 | meta(yahoonews[[1]], "description") <- 111 | paste(substring(meta(yahoonews[[1]], "description"), 1, 70), "...", sep = "") 112 | meta(yahoonews[[1]], "id") <- 113 | paste(substring(meta(yahoonews[[1]], "id"), 1, 70), "...", sep = "") 114 | meta(yahoonews[[1]], "origin") <- 115 | paste(substring(meta(yahoonews[[1]], "origin"), 1, 70), "...", sep = "") 116 | @ 117 | <>= 118 | meta(yahoonews[[1]]) 119 | @ 120 | 121 | For a Yahoo! News \class{TextDocument} we get useful meta--data like 122 | \code{DateTimeStamp}, \code{Description}, \code{Heading}, \code{ID} and 123 | \code{Origin}. The main content, as specified in the \code{Origin} of a 124 | \class{TextDocument} can be examined as follows (shortened for output): 125 | 126 | <>= 127 | # Little hack to restrict output length 128 | content(yahoonews[[1]]) <- 129 | paste(substring(yahoonews[[1]], 1, 100), "...", sep = "") 130 | @ 131 | <>= 132 | yahoonews[[1]] 133 | @ 134 | 135 | It has been extracted from an unstructured \proglang{HTML} page and freed from 136 | ads and sidebar content by \pkg{boilerpipeR}'s \fkt{DefaultExtractor}. To view the 137 | entire corpus main content also consider \fkt{inspect} (output omitted): 138 | 139 | <>= 140 | inspect(yahoonews) 141 | @ 142 | 143 | \section{Implemented Sources} 144 | \begin{table}[t] 145 | \begin{center} 146 | \input{tables/sources} 147 | \end{center} 148 | \caption{Overview of implemented \class{WebSources} listing the maximum number 149 | of items per feed, a descriptive URL, if authentification is necessary (x 150 | for yes) and the feed format.} 151 | \label{tab:sources} 152 | \end{table} 153 | 154 | All currently implemented (web--)sources are listed on Table~\ref{tab:sources}. 155 | The following commands show, how to use the implemented Sources. If available, 156 | the search query/stock ticker \code{Microsoft} has been used. Since Reuters News 157 | only offers a predefined number of channels we selected \code{businessNews}. 158 | 159 | <>= 160 | googlefinance <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 161 | googlenews <- WebCorpus(GoogleNewsSource("Microsoft")) 162 | nytimes <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid)) 163 | reutersnews <- WebCorpus(ReutersNewsSource("businessNews")) 164 | yahoofinance <- WebCorpus(YahooFinanceSource("MSFT")) 165 | yahooinplay <- WebCorpus(YahooInplaySource()) 166 | yahoonews <- WebCorpus(YahooNewsSource("Microsoft")) 167 | @ 168 | 169 | \section{Extending/Updating Corpora} 170 | Most data feeds only contain 20--100 feed items. 
A text corpus of such a small 171 | size may not be sufficient for text mining purposes. For that reason, 172 | the \fkt{corpus.update} method has been implemented. In a nutshell, it first 173 | downloads a feed's meta data, checks which items are new (as determined by the meta--data 174 | ID field) and finally downloads the main content of new web documents. Since 175 | most time of \class{WebCorpus} construction is spend downloading the main content of 176 | corpus items, this procedures ensures a more efficient and faster 177 | \class{WebCorpus}--update. \\ 178 | The Yahoo! News corpus can now simply be updated: 179 | 180 | <>= 181 | yahoonews <- corpus.update(yahoonews) 182 | @ 183 | 184 | To continously update a \class{WebCorpus} a scheduled task/cron job could be set 185 | up which runs \fkt{corpus.update} in a script. 186 | \newpage 187 | 188 | \section{Conclusion} 189 | This vignette has given a short introduction to \pkg{tm.plugin.webmining}, a 190 | package to retrieve textual data from the web. Although 191 | \pkg{tm.plugin.webmining} has been tested for the retrieval of 10000+ items per 192 | feed it is generally not recommended to start massive feed downloads due to 193 | memory-- and \pkg{RCurl} restrictions. For this purpose, web scraping 194 | frameworks like Scrapy (\url{scrapy.org}), Heritrix (\url{crawler.archive.org}) 195 | or Nutch (\url{nutch.apache.org}) are much better suited. 196 | \\ 197 | Keeping these issues in mind, \pkg{tm.plugin.webmining} is well suited for the 198 | retrieval and processing of small to medium sized text corpora. By using the 199 | full meta data and textual contents, quite interesting text mining experiments 200 | can be done using the full capabilities of the \pkg{tm} package. 201 | 202 | 203 | \bibliographystyle{plainnat} 204 | \bibliography{references} 205 | 206 | 207 | 208 | \end{document} 209 | -------------------------------------------------------------------------------- /R/extract.R: -------------------------------------------------------------------------------- 1 | #' @title Extract main content from \code{TextDocument}s. 2 | #' @description Use implemented extraction functions (through boilerpipeR) to extract main content from 3 | #' \code{TextDocument}s. 4 | #' @param x PlainTextDocument 5 | #' @param extractor default extraction function to be used, defaults to \code{\link{extractContentDOM}} 6 | #' @param ... additional parameters to extractor function 7 | #' @export 8 | #' @aliases extract.PlainTextDocument 9 | extract <- function(x, extractor, ...) UseMethod("extract", x) 10 | 11 | 12 | #' Extract Main Content from Text Documents 13 | #' Use implemented extraction functions (through boilerpipeR) to extract main content from 14 | #' \code{TextDocument}s. 15 | #' @param x PlainTextDocument 16 | #' @param extractor default extraction function to be used, defaults to \code{\link{extractContentDOM}} 17 | #' @param ... additional parameters to extractor function 18 | #' @importFrom NLP content 19 | #' @noRd 20 | #' @export 21 | extract.PlainTextDocument <- function(x, extractor = extractContentDOM, ...){ 22 | content(x) <- tryCatch(extractor(x, ...), 23 | error = function(e){ 24 | warning(e) 25 | content(x) 26 | }) 27 | x 28 | } 29 | 30 | #' @title Simply strip HTML Tags from Document 31 | #' @description \code{extractHTMLStrip} parses an url, character or filename, reads the DOM 32 | #' tree, removes all HTML tags in the tree and outputs the source text without 33 | #' markup. 
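#' @examples
#' # Illustrative sketch only (not taken from the original documentation):
#' # strip the markup from a small in-memory HTML fragment.
#' \dontrun{
#' extractHTMLStrip("<html><body><p>Some <b>bold</b> text.</p></body></html>",
#'     asText = TRUE)
#' }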
34 | #' @author Mario Annau 35 | #' @param url character, url or filename 36 | #' @param asText specifies if url parameter is a \code{character}, defaults to TRUE 37 | #' @param encoding specifies local encoding to be used, depending on platform 38 | #' @param ... Additional parameters for \code{\link{htmlTreeParse}} 39 | #' @seealso \code{\link{xmlNode}} 40 | #' @importFrom XML htmlTreeParse toString.XMLNode xmlChildren xmlValue free 41 | #' @seealso \code{\link{htmlTreeParse}} \code{\link{encloseHTML}} 42 | #' @note Input text should be enclosed in 'TEXT' tags to ensure correct 43 | #' DOM parsing (issue especially under .Platform$os.type = 'windows') 44 | #' @export 45 | extractHTMLStrip <- 46 | function(url, asText = TRUE, encoding, ...){ 47 | if(missing(encoding)){ 48 | encoding <- switch(.Platform$OS.type, 49 | unix = "UTF-8", 50 | windows = "latin1") 51 | } 52 | 53 | if(url == ""){ 54 | return("") 55 | } 56 | 57 | parseerror <- capture.output(tree <- htmlTreeParse(url, asText = asText, 58 | useInternalNodes = TRUE, encoding = encoding, ...)) 59 | 60 | children <- xmlChildren(tree) 61 | children <- children[!sapply(children, function(x) 62 | grepl(" mintextlen) & (dens[,1] < threshold)], assignValues, FUN, ...) 167 | return(t) 168 | 169 | } 170 | #' Get Main Text from Annotated HTML Tree 171 | #' Main Text is obtained from Tree -Subnode where threshold > threshold and 172 | #' textlength is at maximum 173 | #' @author Mario Annau 174 | #' @param xml object of class xmlNode 175 | #' @param threshold minimum threshold needed to be considered 176 | #' @seealso \code{\link{extractContentDOM}}, \code{\link{xmlNode}} 177 | #' @importFrom XML xpathSApply 178 | #' @importFrom XML xmlValue 179 | #' @noRd 180 | getMainText <- 181 | function(xml, threshold){ 182 | # FIXME: Hack because of roxygen2 bug (dot replaced by comma): 183 | if(missing(threshold)){ 184 | threshold <- 0.5 185 | } 186 | 187 | textlen <- as.numeric( xpathSApply(xml, path = "//attribute::textlen")) 188 | dens <- as.numeric( xpathSApply(xml, path = "//attribute::dens")) 189 | 190 | textlen[dens < threshold] <- 0 191 | idxmaintext <- which(textlen == max(textlen)) 192 | if(max(textlen) == 0){ 193 | return("") 194 | } 195 | 196 | content <- xpathSApply(xml, path = paste("//*[@textlen][@dens]",sep = ""))[[idxmaintext]] 197 | 198 | cleancontent <- xmlValue(content) 199 | cleancontent <- trimWhiteSpaces(cleancontent) 200 | 201 | return(cleancontent) 202 | } 203 | 204 | #' Remove specified tags from (XML) Document Tree. 205 | #' Tags and all of its inner content will be removed. 206 | #' @author Mario Annau 207 | #' @param xmldoc xmlDoc object of class xmlDoc 208 | #' @param tags character vector which specifies tags to remove 209 | #' @seealso \code{\link{extractContentDOM}} 210 | #' @export 211 | #' @importFrom XML getNodeSet 212 | #' @importFrom XML removeNodes 213 | #' @noRd 214 | removeTags <- 215 | function(xmldoc, tags){ 216 | #remove scripts tags 217 | xquery <- paste("//", tags, sep = "", collapse = " | ") 218 | scripts <- getNodeSet(xmldoc, path = xquery) 219 | ret <- removeNodes(scripts , free = rep(FALSE, length(scripts))) 220 | removeTags <- xmldoc 221 | } 222 | 223 | 224 | -------------------------------------------------------------------------------- /R/reader.R: -------------------------------------------------------------------------------- 1 | #' @title Read content from WebXMLSource/WebHTMLSource/WebJSONSource. 
2 | #' @description \code{readWeb} is a FunctionGenerator which specifies content retrieval from a \code{\link{WebSource}} 3 | #' content elements. Currently, it is defined for XML, HTML and JSON feeds through \code{readWebXML}, 4 | #' \code{readWebHTML} and \code{readWebJSON}. Also content parsers (\code{xml_content}, \code{json_content}) 5 | #' need to be defined. 6 | #' @param spec specification of content reader 7 | #' @param doc document to be parsed 8 | #' @param parser parser function to be used 9 | #' @param contentparser content parser function to be used, see also \code{tm:::xml_content} or \code{json_content} 10 | #' @param freeFUN function to free memory from parsed object (actually only relevant for XML and HTML trees) 11 | #' @return FunctionGenerator 12 | #' @importFrom tm FunctionGenerator PlainTextDocument 13 | #' @aliases readWebXML readWebHTML readWebJSON json_content 14 | #' @export 15 | readWeb <- FunctionGenerator(function(spec, doc, parser, contentparser, freeFUN = NULL) { 16 | 17 | parser <- parser 18 | contentparser <- contentparser 19 | freeFUN <- freeFUN 20 | spec <- spec 21 | doc <- doc 22 | 23 | function(elem, language, id) { 24 | tree <- parser(elem$content) 25 | 26 | ###Set Content 27 | content(doc) <- if ("content" %in% names(spec)){ 28 | content <- contentparser(tree, spec[["content"]]) 29 | } 30 | else{ 31 | character(0) 32 | } 33 | 34 | for (n in setdiff(names(spec), "content")){ 35 | meta(doc, n) <- contentparser(tree, spec[[n]]) 36 | } 37 | 38 | if(!is.null(freeFUN)){ 39 | freeFUN(tree) 40 | } 41 | doc 42 | } 43 | }) 44 | 45 | #' Read content from WebXMLSource 46 | #' @param ... additional parameters to \code{\link{readWeb}} 47 | #' @export 48 | #' @importFrom XML xmlInternalTreeParse free 49 | #' @noRd 50 | readWebXML <- function(...){ 51 | parser <- function(x){ 52 | #XML::xmlInternalTreeParse(x, asText = TRUE) 53 | parse(x, type = "XML") 54 | } 55 | contentparser <- xml_content 56 | freeFUN <- free 57 | readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...) 58 | } 59 | 60 | #' Read content from WebHTMLSource 61 | #' @param ... additional parameters to \code{\link{readWeb}} 62 | #' @export 63 | #' @importFrom XML htmlTreeParse free 64 | #' @noRd 65 | readWebHTML <- function(...){ 66 | #parser <- function(x) XML::htmlTreeParse(x, asText = TRUE, useInternalNodes = TRUE) 67 | parser <- function(x) parse(x, type = "HTML", useInternalNodes = TRUE) 68 | contentparser <- function(x, cspec) xml_content(x, cspec) 69 | freeFUN <- free 70 | readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...) 71 | } 72 | 73 | #' Read content from WebJSONSource 74 | #' @param ... additional parameters to \code{\link{readWeb}} 75 | #' @export 76 | #' @noRd 77 | readWebJSON <- function(...){ 78 | parser <- function(x) identity(x) 79 | contentparser <- function(x, cspec) json_content(x, cspec) 80 | freeFUN <- rm 81 | readWeb(parser = parser, contentparser = contentparser, freeFUN = freeFUN, ...) 
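  # Note (illustrative comments only, no executable code): readers generated
  # by readWeb are driven by a 'spec' list that maps document fields to
  # parser rules, for instance
  #   spec = list(heading = list("node", "//title"),
  #               id      = list("node", "//guid"))
  # as used by the readGoogle/readYahoo readers defined below.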
82 | } 83 | 84 | #' Read content from XMLSource 85 | #' @param doc list object from which content should be retrieved 86 | #' @param spec list field name as character 87 | #' @noRd 88 | #' @importFrom XML xmlValue 89 | xml_content <- function(doc, spec) { 90 | type <- spec[[1]] 91 | fun <- switch(type, 92 | node = XML::xmlValue, 93 | attribute = identity) 94 | 95 | if (identical(type, "unevaluated")) 96 | spec[[2]] 97 | else if (identical(type, "function") && is.function(spec[[2]])) 98 | spec[[2]](doc) 99 | else 100 | as.character(sapply(XML::getNodeSet(doc, spec[[2]]), fun)) 101 | } 102 | 103 | #' Read content from JSONSource 104 | #' @param doc list object from which content should be retrieved 105 | #' @param spec list field name as character 106 | #' @export 107 | #' @noRd 108 | json_content <- 109 | function (doc, spec) 110 | { 111 | type <- spec[[1]] 112 | fun <- switch(type, field = identity, node = identity) 113 | if (identical(type, "unevaluated")) 114 | spec[[2]] 115 | else if (identical(type, "function") && is.function(spec[[2]])) 116 | spec[[2]](doc) 117 | else{ 118 | as.character(sapply(doc[[spec[[2]]]], 119 | fun)) 120 | } 121 | } 122 | 123 | #' Read content from NYTimesSource 124 | #' @noRd 125 | #' @export 126 | readNYTimes <- readWebJSON(spec = list( 127 | author = list("field", c("byline", "original")), 128 | description = list("field", "snippet"), 129 | datetimestamp = list("function", function(node) 130 | strptime(node[["pub_date"]], 131 | format = "%Y-%m-%dT%H:%M:%SZ", 132 | tz = "EST")), 133 | heading = list("field", c("headline", "main")), 134 | origin = list("field", "web_url"), 135 | language = list("unevaluated", "en"), 136 | id = list("field", "_id")), 137 | doc = PlainTextDocument()) 138 | 139 | #' Read content from Google...Source 140 | #' @importFrom XML getNodeSet xmlValue 141 | #' @importFrom NLP meta<- 142 | #' @noRd 143 | #' @export 144 | readGoogle <- readWebXML(spec = list( 145 | heading = list("node", "//title"), 146 | datetimestamp = list("function", function(node){ 147 | loc <- Sys.getlocale("LC_TIME") 148 | Sys.setlocale("LC_TIME", "C") 149 | val <- sapply(getNodeSet(node, "//pubDate"), xmlValue) 150 | time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT") 151 | Sys.setlocale("LC_TIME", loc) 152 | time 153 | }), 154 | origin = list("node", "//link"), 155 | description = list("function", function(node){ 156 | val <- sapply(getNodeSet(node, "//item/description"), xmlValue) 157 | extractHTMLStrip(sprintf("%s", val), asText = T) 158 | }), 159 | id = list("node", "//guid")), 160 | doc = PlainTextDocument()) 161 | 162 | #' Read content from Yahoo RSS Source 163 | #' @importFrom XML getNodeSet xmlValue 164 | #' @seealso \code{\link{YahooFinanceSource}} 165 | #' @noRd 166 | #' @export 167 | readYahoo <- readWebXML(spec = list( 168 | heading = list("node", "//title"), 169 | datetimestamp = list("function", function(node){ 170 | loc <- Sys.getlocale("LC_TIME") 171 | Sys.setlocale("LC_TIME", "C") 172 | val <- sapply(getNodeSet(node, "//pubDate"), xmlValue) 173 | time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT") 174 | Sys.setlocale("LC_TIME", loc) 175 | time 176 | }), 177 | origin = list("node", "//link"), 178 | description = list("node", "//item/description"), 179 | id = list("node", "//guid")), 180 | doc = PlainTextDocument()) 181 | 182 | #' Read content from Yahoo HTML Source 183 | #' @importFrom XML getNodeSet xmlValue 184 | #' @seealso \code{\link{YahooNewsSource}} 185 | #' @noRd 186 | #' @export 187 | readYahooHTML <- 
readWebHTML(spec = list( 188 | heading = list("node", "//div[@class='compTitle']/h3[@class='title']/a"), 189 | datetimestamp = list("function", function(node){ 190 | loc <- Sys.getlocale("LC_TIME") 191 | Sys.setlocale("LC_TIME", "C") 192 | val <- sapply(getNodeSet(node, "//span[@class='tri fc-2nd ml-10']"), xmlValue) 193 | time <- strptime(val, format = "%b %d %H:%M %p",tz = "GMT") 194 | Sys.setlocale("LC_TIME", loc) 195 | time 196 | }), 197 | origin = list("attribute", "//div[@class='compTitle']/h3[@class='title']/a/@href"), 198 | author = list("node", "//span[@class='cite']"), 199 | description = list("node", "//div[@class='compText']/p"), 200 | id = list("attribute", "//div[@class='compTitle']/h3[@class='title']/a/@href")), 201 | doc = PlainTextDocument()) 202 | 203 | #' Read content from YahooInplaySource 204 | #' @importFrom XML getNodeSet xmlValue 205 | #' @noRd 206 | #' @export 207 | readYahooInplay <- readWebHTML(spec = list( 208 | heading = list("node", "//b[1]"), 209 | id = list("node", "//b[1]"), 210 | content = list("node", "//p"), 211 | datetimestamp = list("function", function(node){ 212 | val <- unlist(getNodeSet(node, "//b[1]", fun = xmlValue)) 213 | substr(val, 1, regexpr("\\s", val)-1) 214 | }), 215 | ticker = list("node", "//p/b/a")), 216 | doc = PlainTextDocument()) 217 | 218 | 219 | 220 | 221 | #' Read content from ReutersNewsSource 222 | #' @importFrom XML getNodeSet xmlValue 223 | #' @noRd 224 | #' @export 225 | readReutersNews <- readWebXML(spec = list( 226 | heading = list("node", "//title"), 227 | datetimestamp = list("function", function(node){ 228 | loc <- Sys.getlocale("LC_TIME") 229 | Sys.setlocale("LC_TIME", "C") 230 | val <- sapply(getNodeSet(node, "//pubDate"), xmlValue) 231 | time <- strptime(val,format = "%a, %d %b %Y %H:%M:%S",tz = "GMT") 232 | Sys.setlocale("LC_TIME", loc) 233 | time 234 | }), 235 | origin = list("node", "//link"), 236 | description = list("function", function(node){ 237 | val <- sapply(getNodeSet(node, "//item/description"), xmlValue) 238 | extractHTMLStrip(sprintf("%s", val), asText = T) 239 | }), 240 | id = list("node", "//guid"), 241 | category = list("node", "//category")), 242 | doc = PlainTextDocument()) 243 | 244 | #' Read content from LiberationSource 245 | #' @importFrom XML getNodeSet xmlValue 246 | #' @importFrom NLP meta<- 247 | #' @noRd 248 | #' @export 249 | readLiberationSource <- readWebXML(spec = list( 250 | heading = list("node", "//title"), 251 | datetimestamp = list("function", function(node){ 252 | loc <- Sys.getlocale("LC_TIME") 253 | Sys.setlocale("LC_TIME", "C") 254 | val <- sapply(getNodeSet(node, "//updated"), xmlValue) 255 | time <- strptime(val, format = "%Y-%m-%dT%H:%M:%S",tz = "GMT") 256 | Sys.setlocale("LC_TIME", loc) 257 | time 258 | }), 259 | origin = list("attribute", "//link[1]/@href"), 260 | author = list("node", "//author/name"), 261 | description = list("function", function(node){ 262 | val <- sapply(getNodeSet(node, "//summary"), xmlValue) 263 | extractHTMLStrip(sprintf("%s", val), asText = T) 264 | }), 265 | id = list("node", "//id"), 266 | language = list("unevaluated", "fr")), 267 | doc = PlainTextDocument()) 268 | -------------------------------------------------------------------------------- /R/source.R: -------------------------------------------------------------------------------- 1 | #' @title Read Web Content and respective Link Content from feedurls. 2 | #' @description WebSource is derived from \code{\link[tm]{Source}}. 
In addition to calling the 3 | #' base \code{\link[tm]{Source}} constructor function it also retrieves the specified 4 | #' feedurls and pre--parses the content with the parser function. 5 | #' The fields \code{$Content}, \code{$Feedurls} \code{$Parser} and \code{$CurlOpts} are finally 6 | #' added to the \code{Source} object. 7 | #' @author Mario Annau 8 | #' @param feedurls urls from feeds to be retrieved 9 | #' @param class class label to be assigned to \code{Source} object, defaults to "WebXMLSource" 10 | #' @param reader function to be used to read content, see also \code{\link{readWeb}} 11 | #' @param parser function to be used to split feed content into chunks, returns list of content elements 12 | #' @param encoding specifies default encoding, defaults to 'UTF-8' 13 | #' @param curlOpts a named list or CURLOptions object identifying the curl options for the handle. Type \code{listCurlOptions()} for all Curl options available. 14 | #' @param postFUN function saved in WebSource object and called to retrieve full text content from feed urls 15 | #' @param retrieveFeedURL logical; Specify if feedurls should be downloaded first. 16 | #' @param ... additional parameters passed to \code{WebSource} object/structure 17 | #' @return WebSource 18 | #' @export 19 | #' @importFrom XML getNodeSet xmlValue 20 | #' @importFrom RCurl curlOptions 21 | WebSource <- function(feedurls, class = "WebXMLSource", reader, parser, encoding = "UTF-8", 22 | curlOpts = curlOptions( 23 | followlocation = TRUE, 24 | maxconnects = 5, 25 | maxredirs = 20, 26 | timeout = 30, 27 | connecttimeout = 30, 28 | ssl.verifyhost = FALSE, 29 | ssl.verifypeer = FALSE), 30 | postFUN = NULL, retrieveFeedURL = TRUE, ...){ 31 | 32 | content_raw <- NULL 33 | if(retrieveFeedURL) { 34 | content_raw <- getURL(feedurls, .opts = curlOpts) 35 | } else { 36 | content_raw <- feedurls 37 | } 38 | # Filter empty content 39 | content_raw <- content_raw[sapply(content_raw, nchar) > 0] 40 | content_parsed <- unlist(lapply(content_raw, parser), recursive = FALSE) 41 | structure(list(encoding = encoding, length = length(content_parsed), names = NA_character_, 42 | position = 0, reader = reader, content = content_parsed, feedurls = feedurls, 43 | parser = parser, curlOpts = curlOpts, postFUN = postFUN, retrieveFeedURL = retrieveFeedURL, ...), 44 | class = unique(c(class, "WebSource", "SimpleSource"))) 45 | } 46 | 47 | 48 | #' @title Update WebXMLSource/WebHTMLSource/WebJSONSource 49 | #' @description Typically, update is called from \code{link{corpus.update}} and refreshes \code{$Content} in 50 | #' Source object. 51 | #' @param x Source object to be updated 52 | #' @export source.update 53 | #' @aliases source.update.WebXMLSource source.update.WebHTMLSource source.update.WebJSONSource 54 | source.update <- function(x){ 55 | UseMethod("source.update", x) 56 | } 57 | 58 | #'update WebSource 59 | #' @noRd 60 | #' @export 61 | source.update.WebXMLSource <- 62 | source.update.WebHTMLSource <- 63 | source.update.WebJSONSource <- 64 | function(x) { 65 | content_raw <- NULL 66 | if(x$retrieveFeedURL) { 67 | content_raw <- getURL(x$feedurls, .opts = x$curlOpts) 68 | } else { 69 | content_raw <- x$feedurls 70 | } 71 | # Filter empty content 72 | content_raw <- content_raw[sapply(content_raw, nchar) > 0] 73 | 74 | content_parsed <- unlist(lapply(content_raw, x$parser), recursive = FALSE) 75 | x$content <- content_parsed 76 | x$position <- 0 77 | x 78 | } 79 | 80 | #' @title Get feed Meta Data from Google Finance. 
81 | #' @description Google Finance provides business and enterprise headlines for many companies. Coverage is 82 | #' particularly strong for US-Markets. However, only up to 20 feed items can be retrieved. 83 | #' @author Mario Annau 84 | #' @param query ticker symbols of companies to be searched for, see \url{http://www.google.com/finance}. 85 | #' Please note that Google ticker symbols need to be prefixed with the exchange name, e.g. NASDAQ:MSFT 86 | #' @param params additional query parameters 87 | #' @param ... additional parameters to \code{\link{WebSource}} 88 | #' @return WebXMLSource 89 | #' @seealso \code{\link{WebSource}} 90 | #' @export 91 | #' @examples 92 | #' \dontrun{ 93 | #' corpus <- WebCorpus(GoogleFinanceSource("NASDAQ:MSFT")) 94 | #' } 95 | #' @importFrom XML xmlInternalTreeParse 96 | #' @importFrom XML xpathSApply 97 | #' @importFrom XML getNodeSet 98 | #' @importFrom XML xmlValue 99 | #' @aliases readGoogle 100 | GoogleFinanceSource <- function(query, params = 101 | list( hl= 'en', 102 | q=query, 103 | ie='utf-8', 104 | start = 0, 105 | num = 20, 106 | output='rss'),...){ 107 | feed <- "http://www.google.com/finance/company_news" 108 | parser <- function(cr){ 109 | tree <- parse(cr, type = "XML", asText = FALSE) 110 | xpathSApply(tree, path = "//item") 111 | } 112 | fq <- feedquery(feed, params) 113 | ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readGoogle, 114 | postFUN = getLinkContent, retrieveFeedURL = FALSE,...) 115 | ws 116 | } 117 | 118 | #' @title Get feed data from Yahoo! Finance. 119 | #' @description Yahoo! Finance is a popular site which provides financial news and information. It is a large source 120 | #' for historical price data as well as financial news. Using the typical Yahoo! Finance ticker 121 | #' news items can easily be retrieved. However, the maximum number of items is 20. 122 | #' @author Mario Annau 123 | #' @param query ticker symbols of companies to be searched for, see \url{http://finance.yahoo.com/lookup}. 124 | #' @param params, additional query parameters, see \url{http://developer.yahoo.com/rss/} 125 | #' @param ... additional parameters to \code{\link{WebSource}} 126 | #' @return WebXMLSource 127 | #' @export 128 | #' @examples 129 | #' \dontrun{ 130 | #' corpus <- WebCorpus(YahooFinanceSource("MSFT")) 131 | #' } 132 | #' @seealso \code{\link{WebSource}} 133 | #' @importFrom XML xmlInternalTreeParse 134 | #' @importFrom XML xpathSApply 135 | #' @importFrom XML getNodeSet 136 | #' @importFrom XML xmlValue 137 | #' @aliases readYahoo 138 | YahooFinanceSource <- function(query, params = 139 | list( s= query, 140 | region = "US", 141 | lang = "en-US"), ...){ 142 | feed <- "https://feeds.finance.yahoo.com/rss/2.0/headline" 143 | 144 | fq <- feedquery(feed, params) 145 | parser <- function(cr){ 146 | tree <- parse(cr, type = "XML", asText = TRUE) 147 | xpathSApply(tree, path = "//item") 148 | } 149 | ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readYahoo, 150 | postFUN = getLinkContent, retrieveFeedURL = TRUE, ...) 151 | ws 152 | } 153 | 154 | #' @title Get feed data from Google News Search \url{http://news.google.com/} 155 | #' @description Google News Search is one of the most popular news aggregators on the web. News 156 | #' can be retrieved for any customized user query. Up to 30 can be retrieved per 157 | #' request. 158 | #' @author Mario Annau 159 | #' @param query Google News Search query 160 | #' @param params, additional query parameters 161 | #' @param ... 
additional parameters to \code{\link{WebSource}} 162 | #' @return WebXMLSource 163 | #' @seealso \code{\link{WebSource}} 164 | #' @export 165 | #' @examples 166 | #' \dontrun{ 167 | #' corpus <- WebCorpus(GoogleNewsSource("Microsoft")) 168 | #' } 169 | #' @importFrom XML xmlInternalTreeParse xpathSApply getNodeSet xmlValue newXMLNamespace 170 | GoogleNewsSource <- function(query, params = 171 | list( hl= 'en', 172 | q = query, 173 | ie='utf-8', 174 | num = 30, 175 | output='rss'), ...){ 176 | feed <- "http://news.google.com/news" 177 | fq <- feedquery(feed, params) 178 | parser <- function(cr){ 179 | tree <- parse(cr, type = "XML", asText = TRUE) 180 | nodes <- xpathSApply(tree, path = "//item") 181 | xmlns1 <- lapply(nodes, newXMLNamespace, "http://purl.org/dc/elements/1.1/", "dc") 182 | nodes 183 | } 184 | ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readGoogle, 185 | postFUN = getLinkContent, retrieveFeedURL = TRUE, ...) 186 | ws 187 | } 188 | 189 | #' @title Get feed data from Reuters News RSS feed channels. Reuters provides numerous feed 190 | #' @description channels (\url{http://www.reuters.com/tools/rss}) which can be retrieved through RSS 191 | #' feeds. Only up to 25 items can be retrieved---therefore an alternative retrieval 192 | #' through the Google Reader API (\code{link{GoogleReaderSource}}) could be considered. 193 | #' @author Mario Annau 194 | #' @param query Reuters News RSS Feed, see \url{http://www.reuters.com/tools/rss} for a list of all feeds provided. Note that only string after 'http://feeds.reuters.com/reuters/' must be given. Defaults to 'businessNews'. 195 | #' @param ... additional parameters to \code{\link{WebSource}} 196 | #' @return WebXMLSource 197 | #' @seealso \code{\link{WebSource}} 198 | #' @export 199 | #' @examples 200 | #' \dontrun{ 201 | #' corpus <- WebCorpus(ReutersNewsSource("businessNews")) 202 | #' } 203 | #' @importFrom XML xmlInternalTreeParse xpathSApply getNodeSet xmlValue newXMLNamespace 204 | #' @aliases readReutersNews 205 | ReutersNewsSource <- function(query = 'businessNews', ...){ 206 | feed <- "http://feeds.reuters.com/reuters" 207 | 208 | fq <- paste(feed, query, sep = "/") 209 | parser <- function(cr){ 210 | tree <- parse(cr, type = "XML") 211 | nodes <- xpathSApply(tree, path = "//item") 212 | xmlns1 <- lapply(nodes, newXMLNamespace, "http://rssnamespace.org/feedburner/ext/1.0", "feedburner") 213 | nodes 214 | } 215 | 216 | ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readReutersNews, 217 | postFUN = getLinkContent, ...) 218 | ws 219 | } 220 | 221 | #' @title Get news data from Yahoo! News (\url{https://news.search.yahoo.com/search/}). 222 | #' @description Currently, only a maximum of 10 items can be retrieved. 223 | #' @author Mario Annau 224 | #' @param query words to be searched in Yahoo News, multiple words must be separated by '+' 225 | #' @param params, additional query parameters, see \url{http://developer.yahoo.com/rss/} 226 | #' @param ... 
221 | #' @title Get news data from Yahoo! News (\url{https://news.search.yahoo.com/search/}).
222 | #' @description Currently, only a maximum of 10 items can be retrieved.
223 | #' @author Mario Annau
224 | #' @param query words to be searched for in Yahoo! News; multiple words must be separated by '+'
225 | #' @param params additional query parameters, see \url{http://developer.yahoo.com/rss/}
226 | #' @param ... additional parameters to \code{\link{WebSource}}
227 | #' @return WebXMLSource
228 | #' @export
229 | #' @examples
230 | #' \dontrun{
231 | #' corpus <- WebCorpus(YahooNewsSource("Microsoft"))
232 | #' }
233 | #' @seealso \code{\link{WebSource}}
234 | #' @importFrom XML xmlInternalTreeParse
235 | #' @importFrom XML xpathSApply
236 | #' @importFrom XML getNodeSet
237 | #' @importFrom XML xmlValue
238 | #' @aliases readYahooHTML
239 | YahooNewsSource <- function(query, params =
240 |     list(p = query), ...){
241 |   feed <- "https://news.search.yahoo.com/search"
242 |   fq <- feedquery(feed, params)
243 |   parser <- function(cr){
244 |     tree <- parse(cr, type = "HTML", useInternalNodes = TRUE)
245 |     xpathSApply(tree, path = "//div[contains(@class, 'NewsArticle')]")
246 |   }
247 |   ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readYahooHTML,
248 |                   postFUN = getLinkContent, ...)
249 |   ws
250 | }
251 | 
252 | 
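Unlike the RSS-based sources, YahooNewsSource() scrapes the HTML result page and selects one node per article teaser via XPath. The same idea on a tiny self-contained snippet, with XML::htmlParse standing in for the package-internal parse() wrapper used above:

library(XML)

html <- "<html><body>
  <div class='NewsArticle'><a href='http://example.com/1'>Item one</a></div>
  <div class='NewsArticle'><a href='http://example.com/2'>Item two</a></div>
</body></html>"

tree  <- htmlParse(html, asText = TRUE)
nodes <- xpathSApply(tree, "//div[contains(@class, 'NewsArticle')]")
length(nodes)             # 2: one node per article teaser
sapply(nodes, xmlValue)   # "Item one" "Item two"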
253 | #' @title Get feed data from NYTimes Article Search (\url{http://developer.nytimes.com/docs/read/article_search_api_v2}).
254 | #' @description Excerpt from the website: "With the NYTimes Article Search API, you can search New York Times articles
255 | #' from 1981 to today, retrieving headlines, abstracts, lead paragraphs, links to associated multimedia
256 | #' and other article metadata. Along with standard keyword searching, the API also offers faceted searching.
257 | #' The available facets include Times-specific fields such as sections, taxonomic classifiers and controlled
258 | #' vocabulary terms (names of people, organizations and geographic locations)."
259 | #' Feed retrieval is limited to 1000 items (or 100 pages).
260 | #' @author Mario Annau
261 | #' @param query character specifying the query used to search NYTimes articles
262 | #' @param n number of items, defaults to 100
263 | #' @param sleep integer; seconds to sleep between feed retrievals
264 | #' @param curlOpts CURLOptions; RCurl options used for feed retrieval
265 | #' @param appid developer app id to be used, obtained from \url{http://developer.nytimes.com/}
266 | #' @param params additional query parameters, specified as a list, see \url{http://developer.nytimes.com/docs/read/article_search_api}
267 | #' @param ... additional parameters to \code{\link{WebSource}}
268 | #' @seealso \code{\link{WebSource}}, \code{\link{readNYTimes}}
269 | #' @export
270 | #' @examples
271 | #' \dontrun{
272 | #' # nytimes_appid needs to be specified
273 | #' corpus <- WebCorpus(NYTimesSource("Microsoft", appid = nytimes_appid))
274 | #' }
275 | #' @return WebJSONSource
276 | #' @importFrom RJSONIO fromJSON
277 | #' @importFrom boilerpipeR ArticleExtractor
278 | #' @aliases readNYTimes
279 | NYTimesSource <- function(query, n = 100, appid,
280 |     sleep = 1, params =
281 |     list(format = "json",
282 |          q = query,
283 |          page = 0:(ceiling(n/10)-1),
284 |          "api-key" = appid),
285 |     curlOpts = curlOptions(followlocation = TRUE,
286 |                            maxconnects = 10,
287 |                            maxredirs = 10,
288 |                            timeout = 30,
289 |                            connecttimeout = 30), ...){
290 |   feed <- "http://api.nytimes.com/svc/search/v2/articlesearch.json"
291 |   fq <- feedquery(feed, params)
292 | 
293 |   parser <- function(cr){
294 |     json <- parse(cr, type = "JSON")
295 |     json$response$docs
296 |   }
297 | 
298 |   count <- 10   # fetch the per-page feed URLs in batches of 10
299 |   start <- seq(1, length(fq), by = count)
300 |   end <- if(n < count) length(fq) else seq(count, length(fq), length.out = length(start))
301 | 
302 |   feedcontent <- sapply(1:length(start), function(i) {
303 |     fcontent <- getURL(fq[start[i]:end[i]], .opts = curlOpts)
304 |     Sys.sleep(sleep)   # pause between batches
305 |     fcontent
306 |   })
307 | 
308 |   ws <- WebSource(feedurls = feedcontent, class = "WebJSONSource", parser = parser, reader = readNYTimes,
309 |                   postFUN = getLinkContent, retrieveFeedURL = FALSE, ...)
310 | 
311 |   ws
312 | }
313 | 
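The Article Search API returns 10 documents per page, which is where the default page = 0:(ceiling(n/10)-1) above comes from: zero-based page indices, one feed URL per page, fetched in batches of ten URLs with a pause in between. A quick check of that arithmetic:

n <- 25
pages <- 0:(ceiling(n / 10) - 1)
pages           # 0 1 2, i.e. three requests cover up to 30 documents
length(pages)   # 3, which equals ceiling(25 / 10)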
314 | #' @title Get news from Yahoo! Inplay.
315 | #' @description Yahoo! Inplay lists a range of company news provided by Briefing.com. Since Yahoo! Inplay
316 | #' does not provide a structured XML news feed, content is parsed directly from the HTML page.
317 | #' Therefore, no further source parameters can be specified. The number of feed items per
318 | #' request can vary substantially.
319 | #' @author Mario Annau
320 | #' @param ... additional parameters to \code{\link{WebSource}}
321 | #' @return WebHTMLSource
322 | #' @export
323 | #' @examples
324 | #' \dontrun{
325 | #' corpus <- WebCorpus(YahooInplaySource())
326 | #' }
327 | #' @importFrom XML htmlTreeParse
328 | #' @importFrom XML xpathSApply
329 | #' @aliases readYahooInplay
330 | YahooInplaySource <- function(...){
331 |   url <- "http://finance.yahoo.com/marketupdate/inplay"
332 |   parser <- function(cr){
333 |     tree <- parse(cr, useInternalNodes = TRUE, type = "HTML")
334 |     xp_expr <- "//div[@class= 'body yom-art-content clearfix']/p"
335 |     xpathSApply(tree, xp_expr)
336 |   }
337 | 
338 |   ws <- WebSource(feedurls = url, class = "WebHTMLSource", parser = parser, reader = readYahooInplay, ...)
339 |   ws
340 | }
341 | 
342 | #' @title Get news data from the French newspaper Liberation (\url{http://rss.liberation.fr/rss}).
343 | #' @author Mario Annau
344 | #' @param query feed to be retrieved, defaults to 'latest'
345 | #' @param ... additional parameters to \code{\link{WebSource}}
346 | #' @return WebXMLSource
347 | #' @export
348 | #' @examples
349 | #' \dontrun{
350 | #' corpus <- WebCorpus(LiberationSource("latest"))
351 | #' }
352 | #' @seealso \code{\link{WebSource}}
353 | #' @importFrom XML xmlInternalTreeParse
354 | #' @importFrom XML xpathSApply
355 | #' @importFrom XML getNodeSet
356 | #' @importFrom XML xmlValue
357 | #' @aliases readLiberationSource
358 | LiberationSource <- function(query = "latest", ...){
359 |   fq <- paste("http://rss.liberation.fr/rss", query, sep = "/")
360 |   parser <- function(cr){
361 |     tree <- parse(cr, type = "XML", useInternalNodes = TRUE)
362 |     namespaces <- c(ns = "http://www.w3.org/2005/Atom")
363 |     xpathSApply(tree, "//ns:entry", namespaces = namespaces)
364 |   }
365 |   ws <- WebSource(feedurls = fq, class = "WebXMLSource", parser = parser, reader = readLiberationSource,
366 |                   postFUN = getLinkContent, retrieveFeedURL = TRUE, ...)
367 |   ws
368 | }
369 | 
370 | #' @importFrom XML saveXML
371 | #' @noRd
372 | #' @export
373 | getElem.WebXMLSource <-
374 | getElem.WebHTMLSource <- function(x) {
375 |   list(content = saveXML(x$content[[x$position]]), linkcontent = NULL, uri = NULL)
376 | }
377 | 
378 | #' @noRd
379 | #' @export
380 | getElem.WebJSONSource <- function(x) {
381 |   list(content = x$content[[x$position]], linkcontent = NULL, uri = NULL)
382 | }
383 | 
-------------------------------------------------------------------------------- /vignettes/references.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{kohlschuetter:webextract, 2 | abstract = {{In addition to the actual content Web pages consist of navigational elements, templates, and advertisements. This boilerplate text typically is not related to the main content, may deteriorate search precision and thus needs to be detected properly. In this paper, we analyze a small set of shallow text features for classifying the individual text elements in a Web page. We compare the approach to complex, state-of-the-art techniques and show that competitive accuracy can be achieved, at almost no cost. Moreover, we derive a simple and plausible stochastic model for describing the boilerplate creation process. With the help of our model, we also quantify the impact of boilerplate removal to retrieval performance and show significant improvements over the baseline. Finally, we extend the principled approach by straight-forward heuristics, achieving a remarkable detection accuracy.}}, 3 | address = {New York, NY, USA}, 4 | author = {Kohlsch\"{u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang}, 5 | booktitle = {Proceedings of the third ACM international conference on Web search and data mining}, 6 | citeulike-article-id = {8241255}, 7 | citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1718542}, 8 | citeulike-linkout-1 = {http://dx.doi.org/10.1145/1718487.1718542}, 9 | doi = {10.1145/1718487.1718542}, 10 | isbn = {978-1-60558-889-6}, 11 | location = {New York, New York, USA}, 12 | pages = {441--450}, 13 | posted-at = {2010-11-23 07:02:43}, 14 | priority = {2}, 15 | publisher = {ACM}, 16 | series = {WSDM '10}, 17 | title = {{Boilerplate detection using shallow text features}}, 18 | url = {http://code.google.com/p/boilerpipe/}, 19 | year = {2010} 20 | } 21 | 22 | 23 | 24 | 25 | @inproceedings{Goog:MapReduce, 26 | abstract = {MapReduce is a programming model and an associated implementation for processing and generating large data sets.
Users specify a \_map\_ function that processes a key/value pair to generate a set of intermediate key/value pairs, and a \_reduce\_ function that merges all intermediate values associated with the same intermediate key. Many real world tasks are expressible in this model, as shown in the paper.

Programs written in this functional style are automatically parallelized and executed on a large cluster of commodity machines. The run-time system takes care of the details of partitioning the input data, scheduling the program's execution across a set of machines, handling machine failures, and managing the required inter- machine communication. This allows programmers without any experience with parallel and distributed systems to easily utilize the resources of a large distributed system.

Our implementation of MapReduce runs on a large cluster of commodity machines and is highly scalable: a typical MapReduce computation processes many terabytes of data on thousands of machines. Programmers find the system easy to use: hundreds of MapReduce programs have been implemented and upwards of one thousand MapReduce jobs are executed on Google's clusters every day.

}, 27 | author = {Dean, Jeffrey and Ghemawat, Sanjay}, 28 | citeulike-article-id = {430834}, 29 | citeulike-linkout-0 = {http://www.usenix.org/events/osdi04/tech/dean.html}, 30 | journal = {OSDI '04}, 31 | keywords = {cluster, google, parallel}, 32 | pages = {137--150}, 33 | posted-at = {2008-03-27 02:27:59}, 34 | priority = {3}, 35 | title = {MapReduce: Simplified Data Processing on Large Clusters}, 36 | url = {http://www.usenix.org/events/osdi04/tech/dean.html}, 37 | year = {2008}, 38 | booktitle = {MapReduce: Simplified Data Processing on Large Clusters} 39 | } 40 | 41 | @ARTICLE{Pang+Lee:08b, 42 | author = {Bo Pang and Lillian Lee}, 43 | title = {Opinion mining and sentiment analysis}, 44 | journal = {Foundations and Trends in Information Retrieval}, 45 | year = {2008}, 46 | volume = {2}, 47 | pages = {1--135}, 48 | number = {1-2} 49 | } 50 | 51 | 52 | @article{Msft:MapReduce, 53 | abstract = {Google's MapReduce programming model serves for processing large data sets in a massively parallel manner. We deliver the first rigorous description of the model including its advancement as Google's domain-specific language Sawzall. To this end, we reverse-engineer the seminal papers on MapReduce and Sawzall, and we capture our findings as an executable specification. We also identify and resolve some obscurities in the informal presentation given in the seminal papers. We use typed functional programming (specifically Haskell) as a tool for design recovery and executable specification. Our development comprises three components: (i) the basic program skeleton that underlies MapReduce computations; (ii) the opportunities for parallelism in executing MapReduce computations; (iii) the fundamental characteristics of Sawzall's aggregators as an advancement of the MapReduce approach. 
Our development does not formalize the more implementational aspects of an actual, distributed execution of MapReduce computations.}, 54 | author = {Lammel, Ralf}, 55 | citeulike-article-id = {2152671}, 56 | citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1290549.1290812}, 57 | citeulike-linkout-1 = {http://dx.doi.org/10.1016/j.scico.2007.07.001}, 58 | citeulike-linkout-2 = {http://linkinghub.elsevier.com/retrieve/pii/S0167642307001281}, 59 | citeulike-linkout-3 = {http://www.sciencedirect.com/science/article/B6V17-4P718HK-1/2/77f5109e6e40c6c24df92250b314c2f1}, 60 | doi = {10.1016/j.scico.2007.07.001}, 61 | journal = {Science of Computer Programming}, 62 | month = {January}, 63 | number = {1}, 64 | pages = {1--30}, 65 | posted-at = {2009-09-08 04:26:54}, 66 | priority = {2}, 67 | title = {Google's MapReduce programming model -- Revisited}, 68 | url = {http://dx.doi.org/10.1016/j.scico.2007.07.001}, 69 | volume = {70}, 70 | year = {2008} 71 | } 72 | 73 | @inproceedings{Theu:RHadoop, 74 | author = {Theussl, Stefan}, 75 | booktitle = {Computational Finance and Financial Engineering, Second R/Rmetrics User and Developer Workshop}, 76 | year = {2009}, 77 | month = {June}, 78 | address = {Meielisalp, Lake Thune, Switzerland}, 79 | keywords = {cluster, google, parallel}, 80 | title = {Simple Parallel Computing in R Using Hadoop}, 81 | url = {http://www.rmetrics.org/Meielisalp2009/Presentations/Theussl1.pdf} 82 | } 83 | 84 | 85 | @inproceedings{Theu:RParallel, 86 | author = {Theussl, Stefan}, 87 | booktitle = {Computational Finance and Financial Engineering, Second R/Rmetrics User and Developer Workshop}, 88 | year = {2008}, 89 | month = {June}, 90 | address = {Meielisalp, Lake Thune, Switzerland}, 91 | keywords = {cluster, parallel, r}, 92 | title = {Getting the most out of your CPUs: Parallel computing strategies in R}, 93 | url = {http://www.rmetrics.org/Meielisalp2008/Presentations/Theussl1.pdf}, 94 | lastchecked = {\today} 95 | } 96 | 97 | @webpage{Feinerer:TM, 98 | author = {Feinerer,Ingo}, 99 | title = "tm: Text Mining Package", 100 | url = "http://cran.r-project.org/web/packages/tm/index.html", 101 | lastchecked = {\today} 102 | } 103 | 104 | @article{Bharat:Rank, 105 | author = {Bharat, Krishna and Mihaila, George A.}, 106 | title = {When experts agree: using non-affiliated experts to rank popular topics}, 107 | journal = {ACM Trans. Inf. 
Syst.}, 108 | volume = {20}, 109 | issue = {1}, 110 | month = {January}, 111 | year = {2002}, 112 | issn = {1046-8188}, 113 | pages = {47--58}, 114 | numpages = {12}, 115 | url = {http://doi.acm.org/10.1145/503104.503107}, 116 | doi = {http://doi.acm.org/10.1145/503104.503107}, 117 | acmid = {503107}, 118 | publisher = {ACM}, 119 | address = {New York, NY, USA}, 120 | keywords = {WWW search, authorities, connectivity, host affiliation, link analysis, ranking, topic experts}, 121 | } 122 | 123 | 124 | @webpage{Apache:Hadoop, 125 | author = {Apache, Software Foundation}, 126 | title = "Hadoop", 127 | url = "http://hadoop.apache.org/", 128 | year = 2011, 129 | lastchecked = {\today} 130 | } 131 | 132 | @webpage{Spotlight, 133 | author = {Reuters Labs}, 134 | title = "Reuters Spotlight", 135 | url = "http://spotlight.reuters.com", 136 | year = 2011, 137 | lastchecked = {\today} 138 | } 139 | 140 | @webpage{SK:RGrowth, 141 | author = {Reader SK, Revolution Analytics}, 142 | title = "R's exponential package growth, ctd.", 143 | url = "http://blog.revolutionanalytics.com/2010/01/rs-exponential-package-growth-ctd.html", 144 | year = 2010, 145 | month = 1, 146 | day = 7, 147 | lastchecked = {\today} 148 | } 149 | 150 | @article{fama:EMH, 151 | author = {Fama, Eugene F.}, 152 | citeulike-article-id = {1571390}, 153 | citeulike-linkout-0 = {http://dx.doi.org/10.2307/2350752}, 154 | citeulike-linkout-1 = {http://www.jstor.org/stable/2350752}, 155 | doi = {10.2307/2350752}, 156 | issn = {00219398}, 157 | journal = {The Journal of Business}, 158 | keywords = {behavior, stock-market}, 159 | number = {1}, 160 | pages = {34--105}, 161 | posted-at = {2008-09-23 23:37:46}, 162 | priority = {2}, 163 | publisher = {The University of Chicago Press}, 164 | title = {{The Behavior of Stock-Market Prices}}, 165 | url = {http://dx.doi.org/10.2307/2350752}, 166 | volume = {38}, 167 | year = {1965} 168 | } 169 | 170 | @article{fama:EMH2, 171 | author = {Fama, Eugene F.}, 172 | citeulike-article-id = {1485929}, 173 | citeulike-linkout-0 = {http://dx.doi.org/10.2307/2325486}, 174 | citeulike-linkout-1 = {http://www.jstor.org/stable/2325486}, 175 | doi = {10.2307/2325486}, 176 | issn = {00221082}, 177 | journal = {The Journal of Finance}, 178 | keywords = {depaper, efficient, hypothesis, market}, 179 | number = {2}, 180 | pages = {383--417}, 181 | posted-at = {2008-05-25 20:16:01}, 182 | priority = {2}, 183 | publisher = {Blackwell Publishing for the American Finance Association}, 184 | title = {{Efficient Capital Markets: A Review of Theory and Empirical Work}}, 185 | url = {http://dx.doi.org/10.2307/2325486}, 186 | volume = {25}, 187 | year = {1970} 188 | } 189 | 190 | @book{PangLee:Opinion, 191 | abstract = {{An important part of our information-gathering behavior has always been to find out what other people think. With the growing availability and popularity of opinion-rich resources such as online review sites and personal blogs, new opportunities and challenges arise as people can, and do, actively use information technologies to seek out and understand the opinions of others. The sudden eruption of activity in the area of opinion mining and sentiment analysis, which deals with the computational treatment of opinion, sentiment, and subjectivity in text, has thus occurred at least in part as a direct response to the surge of interest in new systems that deal directly with opinions as a first-class object. 
Opinion Mining and Sentiment Analysis covers techniques and approaches that promise to directly enable opinion-oriented information-seeking systems. The focus is on methods that seek to address the new challenges raised by sentiment-aware applications, as compared to those that are already present in more traditional fact-based analysis. The survey includes an enumeration of the various applications, a look at general challenges and discusses categorization, extraction and summarization. Finally, it moves beyond just the technical issues, devoting significant attention to the broader implications that the development of opinion-oriented information-access services have: questions of privacy, vulnerability to manipulation, and whether or not reviews can have measurable economic impact. To facilitate future work, a discussion of available resources, benchmark datasets, and evaluation campaigns is also provided. Opinion Mining and Sentiment Analysis is the first such comprehensive survey of this vibrant and important research area and will be of interest to anyone with an interest in opinion-oriented information-seeking systems.}}, 192 | author = {Pang, Bo and Lee, Lillian}, 193 | citeulike-article-id = {3481153}, 194 | day = {08}, 195 | howpublished = {Paperback}, 196 | isbn = {1601981503}, 197 | keywords = {information-retrieval, review, sentiment-analysis}, 198 | month = jul, 199 | posted-at = {2009-09-20 21:20:23}, 200 | priority = {4}, 201 | publisher = {Now Publishers Inc}, 202 | title = {{Opinion Mining and Sentiment Analysis}}, 203 | url = {http://www.cs.cornell.edu/home/llee/opinion-mining-sentiment-analysis-survey.html}, 204 | year = {2008} 205 | } 206 | 207 | 208 | 209 | @article{hornik:Feinerer+Hornik+Meyer:2008, 210 | author = {Ingo Feinerer and Kurt Hornik and David Meyer}, 211 | title = {Text Mining Infrastructure in {R}}, 212 | journal = {Journal of Statistical Software}, 213 | volume = 25, 214 | number = 5, 215 | pages = {1--54}, 216 | day = 10, 217 | month = 2, 218 | year = 2008, 219 | coden = {JSSOBK}, 220 | issn = {1548-7660}, 221 | url = {http://www.jstatsoft.org/v25/i05}, 222 | accepted = {2008-02-10}, 223 | submitted = {2007-09-05}, 224 | file = {Feinerer+Hornik+Meyer_j=JSS_y=2008.pdf} 225 | 226 | } 227 | 228 | @MISC{AlpertHajaj:GoogleBigWeb, 229 | author = {Jesse Alpert and Nissan Hajaj}, 230 | title = {We knew the web was big... 
- The Official Google Blog.}, 231 | year = {2008}, 232 | month = {7}, 233 | day = {25}, 234 | url = { http://googleblog.blogspot.com/2008/07/we-knew-web-was-big.html} 235 | } 236 | 237 | @MISC{Elias:ExtMainText, 238 | author = {Jinliang Song}, 239 | title = {ExtMainText - Extract main text from html document}, 240 | year = {2010}, 241 | url = { http://www.elias.cn/En/ExtMainText} 242 | } 243 | 244 | @MISC{AIDepot:ExtractHTMLEasy, 245 | author = {alexjc}, 246 | title = {The Easy Way to Extract Useful Text from Arbitrary HTML}, 247 | year = {2007}, 248 | url = { http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/} 249 | } 250 | 251 | @MISC{Snowball:Snowball, 252 | author = "Martin Porter", 253 | title = "Snowball:Snowball", 254 | url = "http://snowball.tartarus.org", 255 | year = {2010}, 256 | lastchecked = {\today} 257 | } 258 | 259 | 260 | @MISC{YahooFinance:RSSIndesx, 261 | author = "Yahoo!, Finance", 262 | title = "RSS Feeds", 263 | url = "http://finance.yahoo.com/rssindex", 264 | year = 2011, 265 | lastchecked = {\today} 266 | } 267 | 268 | @MISC{YahooFinance:RSSAPI, 269 | author = "Yahoo!, Finance", 270 | title = "Company News RSS Feed", 271 | url = "http://developer.yahoo.com/finance/company.html", 272 | year = 2011, 273 | lastchecked = {\today} 274 | } 275 | 276 | 277 | @MISC{Gupta03dom-basedcontent, 278 | author = {Suhit Gupta and Gail Kaiser and David Neistadt and Peter Grimm}, 279 | title = {DOM-based Content Extraction of HTML Documents}, 280 | year = {2003} 281 | } 282 | 283 | @PhdThesis{Gott:ContentExtraction, 284 | author = {Thomas Gottron}, 285 | title = {Content Extraction: Identifying the Main content in HTML Documents}, 286 | school = {Johannes Gutenberg-Universität}, 287 | year = {2008}, 288 | OPTaddress = {Mainz, Germany}, 289 | } 290 | 291 | @TechReport{zhang:FinAnalysisUsingNewsPaperSurvey, 292 | author = {Wenbin Zhang and Steven Skiena}, 293 | title = {Financial Analysis Using News Data}, 294 | institution = {Department of Computer Science, Stony Brook University}, 295 | year = {2008}, 296 | address = {Stony Brook, NY 11794-4400 USA} 297 | } 298 | 299 | @book{chambers:GuidetoS, 300 | abstract = {{Here is a thorough and authoritative guide to the latest version of the S language and to its programming environment, the premier software platform for computing with data. Programming with Data describes a new and greatly extended version of S, and is written by the chief designer of the language. The book is a guide to the complete programming process, starting from simple, interactive use and continuing through ambitious software projects.S is designed for computing with data - for any project in which organizing, visualizing, summarizing, or modeling data is a central concern. Its focus is on the needs of the programmer/user, and its goal is "to turn ideas into software, quickly and faithfully." S is a functional, object-based language with a huge library of functions for all aspects of computing with data. Its long and enthusiastic use in statistics and applied fields has also led to many valuable libraries of user-written functions.The new version of S provides a powerful class/method structure, new techniques to deal with large objects, extended interfaces to other languages and files, object-based documentation compatible with HTML, and powerful new interactive programming techniques. 
This version of S underlies the S-Plus system, versions 5.0 and higher.John Chambers has been a member of the technical staff in research at Bell Laboratories since 1966. In 1977, he became the first statistician to be named a Bell Labs Fellow, cited for "pioneering contributions to the field of statistical computing." His research has touched on nearly all aspects of computing with data, but he is best known for the design of the S language. He is the author or co-author of seven books on S, on computational methods, and on graphical methods; and he is a Fellow of the American Statistical Association and the American Association for the Advancement of Science.}}, 301 | author = {Chambers, John M.}, 302 | citeulike-article-id = {699469}, 303 | citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0387985034}, 304 | citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&path=ASIN/0387985034}, 305 | citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&path=ASIN/0387985034}, 306 | citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/0387985034}, 307 | citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/0387985034/citeulike00-21}, 308 | citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0387985034}, 309 | citeulike-linkout-6 = {http://www.worldcat.org/isbn/0387985034}, 310 | citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN0387985034}, 311 | citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=0387985034\&index=books\&linkCode=qs}, 312 | citeulike-linkout-9 = {http://www.librarything.com/isbn/0387985034}, 313 | day = {19}, 314 | edition = {Corrected}, 315 | howpublished = {Paperback}, 316 | isbn = {0387985034}, 317 | month = jun, 318 | posted-at = {2006-06-17 23:54:54}, 319 | priority = {2}, 320 | publisher = {Springer}, 321 | title = {{Programming with Data: A Guide to the S Language}}, 322 | url = {http://www.worldcat.org/isbn/0387985034}, 323 | year = {1998} 324 | } 325 | 326 | @article{R:Ihaka+Gentleman:1996, 327 | author = {Ross Ihaka and Robert Gentleman}, 328 | title = {R: A Language for Data Analysis and Graphics}, 329 | journal = {Journal of Computational and Graphical Statistics}, 330 | year = 1996, 331 | volume = 5, 332 | number = 3, 333 | pages = {299--314}, 334 | url = {http://www.amstat.org/publications/jcgs/} 335 | } 336 | 337 | 338 | 339 | @article{ tetlock:MediaStockMarket, 340 | type={Accepted Paper Series}, 341 | title={{Giving Content to Investor Sentiment: The Role of Media in the Stock Market}}, 342 | author={Tetlock, Paul C. 
}, 343 | journal={Journal of Finance}, 344 | publisher={SSRN}, 345 | year = {2007}, 346 | doi={10.2139/ssrn.685145}, 347 | keywords={Investor sentiment, financial news media, content analysis, efficient markets}, 348 | location={http://ssrn.com/paper=685145}, 349 | language={English} 350 | } 351 | 352 | @inproceedings{Godbole+Srinivasaiah+Skiena:07a, 353 | author = {Namrata Godbole and Manjunath Srinivasaiah and Steven Skiena}, 354 | booktitle = {Proceedings of the International Conference on Weblogs and Social 355 | Media (ICWSM)}, 356 | interhash = {db9c97e105d4387821aa7b404cbeb04a}, 357 | intrahash = {b67e0f2a90a04960e14ea8453134ecb5}, 358 | title = {Large-Scale Sentiment Analysis for News and Blogs}, 359 | year = 2007, 360 | keywords = {analysis mining opinion sentiment}, 361 | added-at = {2009-03-18T13:40:43.000+0100}, 362 | biburl = {http://www.bibsonomy.org/bibtex/2b67e0f2a90a04960e14ea8453134ecb5/om} 363 | } 364 | 365 | @PHDTHESIS{Gottron:2008e, 366 | author = {Thomas Gottron}, 367 | title = {Content Extraction: Identifying the Main Content in HTML Documents}, 368 | school = {Johannes Gutenberg-University, Mainz}, 369 | year = {2008}, 370 | owner = {gotti}, 371 | timestamp = {2009.04.24} 372 | } 373 | 374 | 375 | @inproceedings{DBLP:conf/icwsm/ZhangS10, 376 | author = {Wenbin Zhang and 377 | Steven Skiena}, 378 | title = {Trading Strategies to Exploit Blog and News Sentiment}, 379 | booktitle = {ICWSM}, 380 | year = {2010}, 381 | ee = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM10/paper/view/1529}, 382 | crossref = {DBLP:conf/icwsm/2010}, 383 | bibsource = {DBLP, http://dblp.uni-trier.de} 384 | } 385 | 386 | @proceedings{DBLP:conf/icwsm/2010, 387 | editor = {William W. Cohen and 388 | Samuel Gosling}, 389 | title = {Proceedings of the Fourth International Conference on Weblogs 390 | and Social Media, ICWSM 2010, Washington, DC, USA, May 23-26, 391 | 2010}, 392 | booktitle = {ICWSM}, 393 | publisher = {The AAAI Press}, 394 | year = {2010}, 395 | bibsource = {DBLP, http://dblp.uni-trier.de} 396 | } 397 | 398 | @Book{NLTK, 399 | author = {Steven Bird and Ewan Klein and Edward Loper}, 400 | title = {{How people learn: Brain, mind, experience, and school}}, 401 | publisher = {O'Reilly Media}, 402 | year = 2009, 403 | address = {1005 Gravenstein Highwsay North, Sebastopol, CA 95472}, 404 | edition = {1}, 405 | url = {http://www.nltk.org/book} 406 | } 407 | 408 | @MISC{Fielding96t.berners-lee, 409 | author = {Tim Berners-Lee and R. Fielding and J. Gettys Dec and J. C. Mogul}, 410 | title = {T. Berners-Lee, MIT/LCS}, 411 | year = {1996} 412 | } 413 | 414 | 415 | @MISC{GeneralInquirer, 416 | author = {Philip Stone}, 417 | title = {The General Inquirer Home Page}, 418 | year = {2006} 419 | } 420 | 421 | 422 | 423 | @webpage{Reuters:Newsscope, 424 | author = "Thomson Reuters", 425 | title = "Newsscope", 426 | url = "http://thomsonreuters.com/products_services/financial/financial_products/event_driven_trading/newsscope_archive", 427 | lastchecked = {\today} 428 | } 429 | 430 | @webpage{RPack:RCurl, 431 | author = "Duncan Temple Lang", 432 | title = "The RCurl Package", 433 | url = "http://www.omegahat.org/RCurl/", 434 | lastchecked = {\today} 435 | } 436 | 437 | @webpage{RPack:PerformanceAnalytics, 438 | author = "Peter Carl and Brian G. 
Peterson", 439 | title = "PerformanceAnalytics: Econometric tools for performance and risk analysis", 440 | url = "http://cran.r-project.org/web/packages/PerformanceAnalytics/", 441 | year = 2010, 442 | month = 9, 443 | day = 15, 444 | lastchecked = {\today} 445 | } 446 | 447 | @Book{fPortfolio, 448 | title = {Portfolio Optimization with R/Rmetrics}, 449 | author = {Diethelm Wuertz and Yohan Chalabi and William Chen and 450 | Andrew Ellis}, 451 | year = {2010}, 452 | month = {April}, 453 | editor = {{Wuertz} and {Diethelm} and {Hanf} and {Martin}}, 454 | publisher = {Rmetrics Association & Finance Online, 455 | www.rmetrics.org}, 456 | note = {R package version 2130.80}, 457 | } 458 | 459 | @Book{Achelis:TechAnal, 460 | title = {Technical Analysis from A to Z}, 461 | author = {Steven Achelis}, 462 | year = {2000}, 463 | month = {October}, 464 | publisher = {McGraw-Hill; 2 edition}, 465 | isbn = {0071363483} 466 | } 467 | 468 | @webpage{hedgefundtwitter, 469 | author = "Jack Jordan", 470 | title = "Hedge Fund Will Track Twitter to Predict Stock Moves", 471 | url = "http://www.bloomberg.com/news/2010-12-22/hedge-fund-will-track-twitter-to-predict-stockmarket-movements.html", 472 | year = 2010, 473 | month = 12, 474 | day = 22, 475 | lastchecked = {\today} 476 | } 477 | 478 | @webpage{universalfeedparser, 479 | author = "Mark Pilgrim", 480 | title = "Universal Feed Parser", 481 | url = "http://feedparser.org/docs/", 482 | year = 2006, 483 | month = 01, 484 | day = 10, 485 | lastchecked = {\today} 486 | } 487 | 488 | @webpage{simplejson, 489 | author = "Bob Ippolito", 490 | title = "simplejson 2.1.5", 491 | url = "http://pypi.python.org/pypi/simplejson/", 492 | year = 2011, 493 | month = 04, 494 | day = 17, 495 | lastchecked = {\today} 496 | } 497 | 498 | @book{oliphant06guide, 499 | author = {Oliphant, T. E.}, 500 | booktitle = {Guide to NumPy}, 501 | citeulike-article-id = {2515650}, 502 | posted-at = {2008-03-11 16:41:13}, 503 | priority = {2}, 504 | publisher = {Trelgol Publishing}, 505 | title = {{Guide to NumPy}}, 506 | year = {2006} 507 | } 508 | 509 | @article{matplotlib, 510 | abstract = {{Matplotlib is a 2D graphics package for Python for application development, interactive scripting, and publication-quality image generation across user interfaces and operating systems.}}, 511 | address = {Los Alamitos, CA, USA}, 512 | author = {Hunter, John D.}, 513 | booktitle = {Computing in Science \& Engineering}, 514 | citeulike-article-id = {2878517}, 515 | citeulike-linkout-0 = {http://doi.ieeecomputersociety.org/10.1109/MCSE.2007.55}, 516 | citeulike-linkout-1 = {http://dx.doi.org/10.1109/MCSE.2007.55}, 517 | citeulike-linkout-2 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4160265}, 518 | doi = {10.1109/MCSE.2007.55}, 519 | issn = {1521-9615}, 520 | journal = {Computing in Science and Engineering}, 521 | keywords = {evaluation, python}, 522 | number = {3}, 523 | pages = {90--95}, 524 | posted-at = {2009-03-26 14:48:42}, 525 | priority = {2}, 526 | publisher = {IEEE Computer Society}, 527 | title = {{Matplotlib: A 2D Graphics Environment}}, 528 | url = {http://dx.doi.org/10.1109/MCSE.2007.55}, 529 | volume = {9}, 530 | year = {2007} 531 | } 532 | 533 | @inproceedings{boilerpipe, 534 | abstract = {{In addition to the actual content Web pages consist of navigational elements, templates, and advertisements. This boilerplate text typically is not related to the main content, may deteriorate search precision and thus needs to be detected properly. 
In this paper, we analyze a small set of shallow text features for classifying the individual text elements in a Web page. We compare the approach to complex, state-of-the-art techniques and show that competitive accuracy can be achieved, at almost no cost. Moreover, we derive a simple and plausible stochastic model for describing the boilerplate creation process. With the help of our model, we also quantify the impact of boilerplate removal to retrieval performance and show significant improvements over the baseline. Finally, we extend the principled approach by straight-forward heuristics, achieving a remarkable detection accuracy.}}, 535 | address = {New York, NY, USA}, 536 | author = {Kohlsch\"{u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang}, 537 | booktitle = {Proceedings of the third ACM international conference on Web search and data mining}, 538 | citeulike-article-id = {8241255}, 539 | citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1718542}, 540 | citeulike-linkout-1 = {http://dx.doi.org/10.1145/1718487.1718542}, 541 | doi = {10.1145/1718487.1718542}, 542 | isbn = {978-1-60558-889-6}, 543 | location = {New York, New York, USA}, 544 | pages = {441--450}, 545 | posted-at = {2010-11-23 07:02:43}, 546 | priority = {2}, 547 | publisher = {ACM}, 548 | series = {WSDM '10}, 549 | title = {{Boilerplate detection using shallow text features}}, 550 | url = {http://dx.doi.org/10.1145/1718487.1718542}, 551 | year = {2010} 552 | } 553 | 554 | 555 | 556 | 557 | @misc{scipy, 558 | author = {Jones, Eric and Oliphant, Travis and Peterson, Pearu and Others}, 559 | citeulike-article-id = {3398487}, 560 | citeulike-linkout-0 = {http://www.scipy.org/}, 561 | keywords = {python, scipy}, 562 | posted-at = {2009-07-23 14:10:37}, 563 | priority = {2}, 564 | title = {{SciPy: Open source scientific tools for Python}}, 565 | url = {http://www.scipy.org/}, 566 | year = {2001} 567 | } 568 | 569 | @misc{mlpy, 570 | author = {Davide Albanese and Giuseppe Jurman and Roberto Visintainer}, 571 | title = {{mlpy Documentation}}, 572 | url = {https://mlpy.fbk.eu/data/mlpy.pdf}, 573 | year = {2010} 574 | } 575 | 576 | 577 | 578 | @misc{BehaveIntro, 579 | author = "Martin Swell", 580 | title = "Introduction to Behavioural Finance", 581 | url = "http://www.behaviouralfinance.net/behavioural-finance.pdf", 582 | year = 2010, 583 | month = 4, 584 | day = 14, 585 | lastchecked = {\today} 586 | } 587 | 588 | @book{Pareto:Homo, 589 | address = {Padova}, 590 | author = {Vilfredo Pareto}, 591 | booktitle = {Manuale Di Economia Politica. Con Una Introduzione Alla Scienza Sociale}, 592 | interhash = {a91d7162f83db6f54698ade6de04f0bb}, 593 | intrahash = {a4fda646bd4bf1e8d58442b790126bb1}, 594 | pages = {404 p.}, 595 | publisher = {CEDAM}, 596 | title = {Manuale di economia politica. Con una introduzione alla scienza sociale (1974)}, 597 | year = {1906}, 598 | date-modified = {2010-02-28 21:15:22 -0500}, 599 | keywords = {economic economy political}, 600 | added-at = {2010-03-02T17:25:53.000+0100}, 601 | biburl = {http://www.bibsonomy.org/bibtex/2a4fda646bd4bf1e8d58442b790126bb1/jrennstich}, 602 | language = {Italian} 603 | } 604 | 605 | @InProceedings{Pang+Lee+Vaithyanathan:02a, 606 | author = {Bo Pang and Lillian Lee and Shivakumar Vaithyanathan}, 607 | title = {Thumbs up? 
{Sentiment} Classification using Machine Learning Techniques}, 608 | booktitle = "Proceedings of the 2002 Conference on Empirical Methods in Natural 609 | Language Processing (EMNLP)", 610 | pages = {79--86}, 611 | year = 2002 612 | } 613 | 614 | @InProceedings{Cun02b, 615 | author = {H. Cunningham and D. Maynard and K. Bontcheva and V. Tablan}, 616 | title = {{GATE: A framework and graphical development environment for robust NLP tools and applications}}, 617 | booktitle = {Proceedings of the 40th Anniversary Meeting of the 618 | Association for Computational Linguistics}, 619 | year = 2002 620 | } 621 | 622 | @InProceedings{Pang+Lee:04a, 623 | author = {Bo Pang and Lillian Lee}, 624 | title = {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts}, 625 | booktitle = "Proceedings of the ACL", 626 | year = 2004 627 | } 628 | 629 | 630 | 631 | @inproceedings{DBLP:conf/tools/Rossum97, 632 | author = {{Guido van Rossum}}, 633 | title = {A Tour of the Python Language}, 634 | booktitle = {TOOLS (23)}, 635 | year = {1997}, 636 | pages = {370}, 637 | ee = {http://doi.ieeecomputersociety.org/10.1109/TOOLS.1997.10001}, 638 | crossref = {DBLP:conf/tools/23-1997}, 639 | bibsource = {DBLP, http://dblp.uni-trier.de} 640 | } 641 | 642 | 643 | 644 | 645 | 646 | @article{penntreebank, 647 | abstract = {{this paper, we review our experience with constructing one such large annotated 648 | corpus--the Penn Treebank, a corpus consisting of over 4.5 million words of American 649 | English. During the first three-year phase of the Penn Treebank Project (1989-1992), this 650 | corpus has been annotated for part-of-speech (POS) information. In addition, over half 651 | 652 | of it has been annotated for skeletal syntactic structure. These materials are available 653 | to members of the Linguistic Data Consortium; for details, see...}}, 654 | author = {Marcus, Mitchell P. and Santorini, Beatrice and Marcinkiewicz, Mary A.}, 655 | citeulike-article-id = {1205174}, 656 | citeulike-linkout-0 = {http://acl.ldc.upenn.edu/J/J93/J93-2004.pdf}, 657 | citeulike-linkout-1 = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.9706}, 658 | journal = {Computational Linguistics}, 659 | keywords = {annotation, corpora, english, nlp, penn-treebank}, 660 | number = {2}, 661 | pages = {313--330}, 662 | posted-at = {2009-05-18 10:02:58}, 663 | priority = {2}, 664 | title = {{Building a Large Annotated Corpus of English: The Penn Treebank}}, 665 | url = {http://acl.ldc.upenn.edu/J/J93/J93-2004.pdf}, 666 | volume = {19}, 667 | year = {1994} 668 | } 669 | 670 | 671 | 672 | @misc{miningpeanut, 673 | abstract = {{The web contains a wealth of product reviews, but sifting through 674 | them is a daunting task. Ideally, an opinion mining tool would process 675 | a set of search results for a given item, generating a list of 676 | product attributes (quality, features, etc.) and aggregating opinions 677 | about each of them (poor, mixed, good). We begin by identifying 678 | the unique properties of this problem and develop a method 679 | for automatically distinguishing between positive and negative reviews. 680 | Our classifier draws on...}}, 681 | author = {Dave, D. 
and Lawrence, S.}, 682 | citeulike-article-id = {899598}, 683 | citeulike-linkout-0 = {http://citeseer.ist.psu.edu/dave03mining.html}, 684 | citeulike-linkout-1 = {http://citeseer.lcs.mit.edu/dave03mining.html}, 685 | citeulike-linkout-2 = {http://citeseer.ifi.unizh.ch/dave03mining.html}, 686 | citeulike-linkout-3 = {http://citeseer.comp.nus.edu.sg/dave03mining.html}, 687 | keywords = {blogs, lecture-8, social, web, web\_20}, 688 | posted-at = {2008-02-25 21:56:38}, 689 | priority = {2}, 690 | year = 2003, 691 | title = {{Mining the peanut gallery: opinion extraction and semantic classification of product reviews}}, 692 | url = {http://citeseer.ist.psu.edu/dave03mining.html} 693 | } 694 | 695 | 696 | 697 | @book{webdatamining, 698 | abstract = {{

Web mining aims to discover useful information and knowledge from the Web hyperlink structure, page contents, and usage data. Although Web mining uses many conventional data mining techniques, it is not purely an application of traditional data mining due to the semistructured and unstructured nature of the Web data and its heterogeneity. It has also developed many of its own algorithms and techniques.

Liu has written a comprehensive text on Web data mining. Key topics of structure mining, content mining, and usage mining are covered both in breadth and in depth. His book brings together all the essential concepts and algorithms from related areas such as data mining, machine learning, and text processing to form an authoritative and coherent text.

The book offers a rich blend of theory and practice, addressing seminal research ideas, as well as examining the technology from a practical point of view. It is suitable for students, researchers and practitioners interested in Web mining both as a learning text and a reference book. Lecturers can readily use it for classes on data mining, Web mining, and Web search. Additional teaching materials such as lecture slides, datasets, and implemented algorithms are available online.

}}, 699 | author = {Liu, Bing}, 700 | citeulike-article-id = {975464}, 701 | citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/3540378812}, 702 | citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&path=ASIN/3540378812}, 703 | citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&path=ASIN/3540378812}, 704 | citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/3540378812}, 705 | citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/3540378812/citeulike00-21}, 706 | citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/3540378812}, 707 | citeulike-linkout-6 = {http://www.worldcat.org/isbn/3540378812}, 708 | citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN3540378812}, 709 | citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=3540378812\&index=books\&linkCode=qs}, 710 | citeulike-linkout-9 = {http://www.librarything.com/isbn/3540378812}, 711 | day = {21}, 712 | edition = {1st ed. 2007. Corr. 2nd printing}, 713 | howpublished = {Hardcover}, 714 | isbn = {3540378812}, 715 | keywords = {data-mining, machine-learning}, 716 | month = jan, 717 | posted-at = {2007-10-02 19:55:48}, 718 | priority = {2}, 719 | publisher = {Springer}, 720 | title = {{Web Data Mining: Exploring Hyperlinks, Contents, and Usage Data (Data-Centric Systems and Applications)}}, 721 | url = {http://www.worldcat.org/isbn/3540378812}, 722 | year = {2009} 723 | } 724 | 725 | 726 | 727 | 728 | @webpage{opennlp, 729 | author = "Apache, Incubator", 730 | title = "openNLP", 731 | url = "http://incubator.apache.org/opennlp/", 732 | year = 2011, 733 | month = 01, 734 | day = 29, 735 | lastchecked = {\today} 736 | } 737 | 738 | @webpage{stanfordpos, 739 | author = "Stanford NLP, (The Stanford Natural Language Processing Group)", 740 | title = "Stanford Log-linear Part-Of-Speech Tagger", 741 | url = "http://nlp.stanford.edu/software/tagger.shtml", 742 | year = 2010, 743 | month = 05, 744 | day = 21, 745 | lastchecked = {\today} 746 | } 747 | 748 | @inproceedings{sentimentanalysissvm, 749 | address = {Barcelona, Spain}, 750 | author = {Mullen, Tony and Collier, Nigel}, 751 | booktitle = {Proceedings of EMNLP 2004}, 752 | citeulike-article-id = {4742195}, 753 | citeulike-linkout-0 = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.bib}, 754 | citeulike-linkout-1 = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.pdf}, 755 | editor = {Lin, Dekang and Wu, Dekai}, 756 | keywords = {detection, different, learning, machine, pmi, sentiment, sources, svm}, 757 | month = jul, 758 | pages = {412--418}, 759 | posted-at = {2009-06-04 09:21:18}, 760 | priority = {0}, 761 | publisher = {Association for Computational Linguistics}, 762 | title = {{Sentiment Analysis using Support Vector Machines with Diverse Information Sources}}, 763 | url = {http://www.aclweb.org/anthology-new/W/W04/W04-3253.bib}, 764 | year = {2004} 765 | } 766 | 767 | 768 | @webpage{moviereviews, 769 | author = "Pang, Bo and Lee, Lillian", 770 | title = "Movie Review Data", 771 | url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/", 772 | year = 2009, 773 | month = 10, 774 | day = 1, 775 | lastchecked = {\today} 776 | } 777 | 778 | @webpage{quantly:lingfranc, 779 | author = "Quantivity", 780 | title = "Algorithmic Lingua Franca", 781 | url = "http://quantivity.wordpress.com/2010/01/02/algorithmic-lingua-franca/", 782 | year = 2010, 783 | month = 1, 784 | day = 2, 785 | 
lastchecked = {\today} 786 | } 787 | 788 | @webpage{RPack:snippets, 789 | author = "Simon Urbanek", 790 | title = "Code snippets, mostly visualization-related", 791 | url = "http://www.rforge.net/snippets/", 792 | year = 2011, 793 | month = 2, 794 | day = 15, 795 | lastchecked = {\today} 796 | } 797 | 798 | 799 | 800 | 801 | @webpage{RMetrics, 802 | author = "Rmetrics, Association", 803 | title = "Rmetrics The premier open source software solution for teaching and training quantitative finance", 804 | url = "https://www.rmetrics.org/", 805 | year = 2011, 806 | month = 4, 807 | day = 6, 808 | lastchecked = {\today} 809 | } 810 | 811 | @MISC{RPack:XML, 812 | author = "Duncan Temple Lang", 813 | title = "XML: Tools for parsing and generating XML within R and S-Plus", 814 | url = "http://www.omegahat.org/RSXML", 815 | lastchecked = {\today} 816 | } 817 | 818 | 819 | @MISC{RPack:tm, 820 | author = "Ingo Feinerer", 821 | title = "tm: Text Mining Package", 822 | url = "http://tm.r-forge.r-project.org/", 823 | lastchecked = {\today} 824 | } 825 | 826 | @MISC{RPack:xts, 827 | author = "Jeffrey A. Ryan and Josh M. Ulrich", 828 | title = "xts: Extensible Time Series", 829 | url = "http://r-forge.r-project.org/projects/xts/", 830 | year = 2011, 831 | lastchecked = {\today} 832 | } 833 | 834 | @MISC{RPack:TTR, 835 | author = "Joshua Ulrich", 836 | title = "TTR: Technical Trading Rules", 837 | url = "http://cran.at.r-project.org/web/packages/TTR/TTR.pdf", 838 | year = 2010, 839 | lastchecked = {\today} 840 | } 841 | 842 | @MISC{RPack:quantmod, 843 | author = "Jeffrey A. Ryan", 844 | title = "quantmod: Quantitative Financial Modelling Framework", 845 | year = {2009}, 846 | url = "http://www.quantmod.com/", 847 | lastchecked = {\today} 848 | } 849 | 850 | @MISC{RPack:slam, 851 | author = "Kurt Hornik and David Meyer and Christian Buchta", 852 | title = "slam: Sparse Lightweight Arrays and Matrices", 853 | url = "http://cran.at.r-project.org/web/packages/slam/slam.pdf", 854 | year = 2011, 855 | lastchecked = {\today} 856 | } 857 | 858 | @MISC{RPack:zoo, 859 | author = "Achim Zeileis and Gabor Grothendieck and Felix Andrews", 860 | title = "zoo: Z's ordered observations", 861 | url = "http://r-forge.r-project.org/projects/zoo/", 862 | year = 2011, 863 | lastchecked = {\today} 864 | } 865 | 866 | @MISC{GoogleNewsArchive, 867 | author = "Google", 868 | title = "Google News Archive Search", 869 | url = "http://news.google.com/archivesearch", 870 | lastchecked = {\today} 871 | } 872 | 873 | @MISC{XML, 874 | author = "W3C", 875 | title = "Extensible Markup Language (XML) 1.0 (Fifth Edition)", 876 | url = "http://www.w3.org/TR/REC-xml/", 877 | year = 2008, 878 | month = 11, 879 | day = 26, 880 | lastchecked = {\today} 881 | } 882 | 883 | @MISC{JavaScript, 884 | author = "Mozilla", 885 | title = "JavaScript", 886 | url = "https://developer.mozilla.org/en/JavaScript#Documentation", 887 | year = 2011, 888 | lastchecked = {\today} 889 | } 890 | 891 | @MISC{RSS, 892 | author = "RSS Advisory Board", 893 | title = "RSS 2.0 Specification", 894 | url = "http://www.rssboard.org/rss-specification", 895 | year = 2002, 896 | lastchecked = {\today} 897 | } 898 | 899 | @MISC{ATOM, 900 | author = "IETF", 901 | title = "The Atom Syndication Format", 902 | url = "http://tools.ietf.org/html/rfc4287", 903 | year = 2005, 904 | lastchecked = {\today} 905 | } 906 | 907 | @MISC{JSON, 908 | author = "Douglas Crockford", 909 | title = "Introducing JSON", 910 | url = "http://www.json.org", 911 | year = 2002, 912 | lastchecked = {\today} 
913 | } 914 | 915 | 916 | @MISC{GoogleFinance, 917 | author = "Google", 918 | title = "Google Finance", 919 | url = "http://www.google.com/finance", 920 | lastchecked = {\today} 921 | } 922 | 923 | @MISC{YahooFinance, 924 | author = "Yahoo!", 925 | title = "Yahoo! Finance", 926 | url = "http://finance.yahoo.com/", 927 | lastchecked = {\today} 928 | } 929 | 930 | @mastersthesis{Hariharan04NewsMining, 931 | author = "Gurushyam Hariharan", 932 | title = "News Mining Agent for Automated Stock Trading", 933 | school = "University of Texas, Austin", 934 | year = "2004"} 935 | 936 | @book{hadoop, 937 | abstract = {{Hadoop: The Definitive Guide helps you harness the power of your data. Ideal 938 | for processing large datasets, the Apache Hadoop framework is an open source 939 | implementation of the MapReduce algorithm on which Google built its empire. 940 | This comprehensive resource demonstrates how to use Hadoop to build reliable, 941 | scalable, distributed systems: programmers will find details for analyzing 942 | large datasets, and administrators will learn how to set up and run Hadoop 943 | clusters. Complete with case studies that illustrate how Hadoop solves 944 | specific problems, this book helps you: 945 | 946 | Use the Hadoop Distributed File System (HDFS) for storing large datasets, and 947 | run distributed computations over those datasets using MapReduce Become 948 | familiar with Hadoop's data and I/O building blocks for compression, data 949 | integrity, serialization, and persistence Discover common pitfalls and 950 | advanced features for writing real-world MapReduce programs Design, build, and 951 | administer a dedicated Hadoop cluster, or run Hadoop in the cloud Use Pig, a 952 | high-level query language for large-scale data processing Take advantage of 953 | HBase, Hadoop's database for structured and semi-structured data Learn 954 | ZooKeeper, a toolkit of coordination primitives for building distributed 955 | systems 956 | 957 | If you have lots of data -- whether it's gigabytes or petabytes -- Hadoop is 958 | the perfect solution. Hadoop: The Definitive Guide is the most thorough book 959 | available on the subject. "Now you have the opportunity to learn about Hadoop 960 | from a master-not only of the technology, but also of common sense and plain 961 | talk." 
-- Doug Cutting, Hadoop Founder, Yahoo!}}, 962 | author = {White, Tom}, 963 | citeulike-article-id = {4882841}, 964 | citeulike-linkout-0 = {http://www.amazon.ca/exec/obidos/redirect?tag=citeulike09-20\&path=ASIN/0596521979}, 965 | citeulike-linkout-1 = {http://www.amazon.de/exec/obidos/redirect?tag=citeulike01-21\&path=ASIN/0596521979}, 966 | citeulike-linkout-2 = {http://www.amazon.fr/exec/obidos/redirect?tag=citeulike06-21\&path=ASIN/0596521979}, 967 | citeulike-linkout-3 = {http://www.amazon.jp/exec/obidos/ASIN/0596521979}, 968 | citeulike-linkout-4 = {http://www.amazon.co.uk/exec/obidos/ASIN/0596521979/citeulike00-21}, 969 | citeulike-linkout-5 = {http://www.amazon.com/exec/obidos/redirect?tag=citeulike07-20\&path=ASIN/0596521979}, 970 | citeulike-linkout-6 = {http://www.worldcat.org/isbn/0596521979}, 971 | citeulike-linkout-7 = {http://books.google.com/books?vid=ISBN0596521979}, 972 | citeulike-linkout-8 = {http://www.amazon.com/gp/search?keywords=0596521979\&index=books\&linkCode=qs}, 973 | citeulike-linkout-9 = {http://www.librarything.com/isbn/0596521979}, 974 | day = {05}, 975 | edition = {1}, 976 | howpublished = {Paperback}, 977 | isbn = {0596521979}, 978 | month = jun, 979 | posted-at = {2009-06-20 17:40:53}, 980 | priority = {2}, 981 | publisher = {O'Reilly Media}, 982 | title = {{Hadoop: The Definitive Guide}}, 983 | url = {http://www.worldcat.org/isbn/0596521979}, 984 | year = {2009} 985 | } 986 | 987 | --------------------------------------------------------------------------------