├── .github
    ├── .gitignore
    └── workflows
    │   ├── pkgdown.yaml
    │   └── R-CMD-check.yaml
├── vignettes
    ├── .gitignore
    └── inspect.png
├── R
    ├── sysdata.rda
    ├── zzz.R
    ├── read_cookies.R
    ├── deliver_3sat_de.R
    ├── deliver_tag24_de.R
    ├── deliver_taz_de.R
    ├── deliver_watson_ch.R
    ├── deliver_watson_de.R
    ├── deliver_epochtimes_de.R
    ├── deliver_infranken_de.R
    ├── deliver_sueddeutsche_de.R
    ├── deliver_welt_de.R
    ├── deliver_berliner_kurier_de.R
    ├── deliver_tagesschau_de.R
    ├── deliver_news_und_nachrichten_de.R
    ├── deliver_t3n_de.R
    ├── deliver_deutschlandfunk_de.R
    ├── deliver_swr3_de.R
    ├── deliver_swr_de.R
    ├── deliver_sfgate_com.R
    ├── deliver_br_de.R
    ├── deliver_shz_de.R
    ├── deliver_mdr_de.R
    ├── deliver_n-tv_de.R
    ├── deliver_news_de.R
    ├── deliver_nw_de.R
    ├── deliver_morgenpost_de.R
    ├── deliver_swrfernsehen_de.R
    ├── deliver_derstandard_at.R
    ├── deliver_newsweek_com.R
    ├── deliver_stern_de.R
    ├── deliver_tagesspiegel_de.R
    ├── deliver_kurier_at.R
    ├── deliver_srf_ch.R
    ├── deliver_wz_de.R
    ├── deliver_marketwatch_com.R
    ├── deliver_forbes_com.R
    ├── deliver_frankenpost_de.R
    ├── deliver_orf_at.R
    ├── deliver_presseportal_de.R
    ├── deliver_berliner_zeitung_de.R
    ├── deliver_kabeleins_de.R
    ├── deliver_nordkurier_de.R
    ├── deliver_swp_de.R
    ├── deliver_noz_de.R
    ├── deliver_dailymail_co_uk.R
    ├── deliver_denverpost_com.R
    ├── deliver_schwaebische_de.R
    ├── deliver_finanzen_net.R
    ├── deliver_nzz_ch.R
    ├── deliver_thueringer_allgemeine_de.R
    ├── deliver_vice_com.R
    ├── deliver_rbb24_de.R
    ├── deliver_rp_online_de.R
    ├── deliver_spiegel_de.R
    ├── deliver_volksstimme_de.R
    ├── deliver_independent_co_uk.R
    ├── deliver_bild_de.R
    ├── deliver_abendzeitung_muenchen_de.R
    ├── deliver_deutschlandfunkkultur_de.R
    ├── deliver_dnn_de.R
    ├── deliver_fr_de.R
    ├── deliver_maz_online_de.R
    ├── deliver_t_online_de.R
    ├── deliver_wsj_com.R
    ├── deliver_badische_zeitung_de.R
    ├── deliver_heise_de.R
    ├── deliver_tz_de.R
    ├── deliver_wa_de.R
    ├── deliver_fnp_de.R
    ├── deliver_hna_de.R
    ├── deliver_ruhr24_de.R
    ├── deliver_waz_de.R
    ├── deliver_echo24_de.R
    ├── deliver_manager_magazin_de.R
    ├── deliver_merkur_de.R
    ├── deliver_heidelberg24_de.R
    ├── deliver_kreiszeitung_de.R
    ├── deliver_latimes_com.R
    ├── deliver_techrepublic_com.R
    ├── deliver_augsburger_allgemeine.R
    ├── deliver_lvz_de.R
    ├── deliver_foxbusiness_com.R
    ├── deliver_saechsische_de.R
    ├── deliver_abendblatt_de.R
    ├── deliver_zeit_de.R
    ├── deliver_bnn_de.R
    ├── deliver_breakingnews_ie.R
    ├── deliver_joe_ie.R
    ├── deliver_derwesten_de.R
    ├── deliver_thesun_ie.R
    ├── deliver_freiepresse_de.R
    ├── deliver_breitbart_com.R
    ├── deliver_thecanary_co.R
    ├── deliver_yahoo_com.R
    ├── deliver_focus_de.R
    ├── deliver_haz_de.R
    ├── deliver_anotherangryvoice_blogspot_com.R
    ├── deliver_cbsnews_com.R
    ├── deliver_ac24_cz.R
    ├── deliver_nypost_com.R
    ├── deliver_telegraph_co_uk.R
    ├── deliver_businessinsider_de.R
    ├── deliver_newsflash24_de.R
    ├── deliver_suedkurier_de.R
    ├── deliver_vox_de.R
    ├── deliver_blesk_cz.R
    ├── deliver_buzzfeed_com.R
    ├── deliver_seznamzpravy_cz.R
    ├── deliver_karlsruhe_insider_de.R
    ├── deliver_wiwo_de.R
    ├── deliver_evolvepolitics_com.R
    ├── deliver_geenstijl_nl.R
    ├── deliver_jungefreiheit_de.R
    ├── deliver_nu_nl.R
    ├── deliver_ostsee_zeitung_de.R
    ├── deliver_stuttgarter_zeitung_de.R
    ├── deliver_wdr_de.R
    ├── deliver_bbc_co_uk.R
    ├── deliver_rollingstone_de.R
    ├── deliver_irishmirror_ie.R
    ├── deliver_skwawkbox_org.R
    ├── deliver_mediacourant_nl.R
    ├── deliver_newstatesman_com.R
    ├── deliver_ndr_de.R
    ├── deliver_ruhrnachrichten_de.R
    ├── deliver_thejournal_ie.R
    ├── inspect.R
    ├── deliver_nos_nl.R
    ├── deliver_metronieuws_nl.R
    ├── deliver_irishexaminer_com.R
    ├── deliver_mopo_de.R
    ├── deliver_aktualne_cz.R
    ├── deliver_idnes_cz.R
    ├── deliver_independent_ie.R
    ├── deliver_rtl_nl.R
    ├── deliver_irozhlas_cz.R
    ├── deliver_denikn_cz.R
    ├── html_search.R
    ├── deliver_novinky_cz.R
    ├── deliver_der_postillon_com.R
    ├── deliver_ksta_de.R
    ├── deliver_rte_ie.R
    ├── deliver_express_de.R
    ├── deliver_irishtimes_com.R
    ├── deliver_rnd_de.R
    ├── deliver_sky_com.R
    ├── deliver_hn_cz.R
    ├── deliver_rtl_de.R
    ├── deliver_ceskatelevize_cz.R
    ├── deliver_prosieben_de.R
    ├── deliver_lidovky_cz.R
    ├── deliver_parlamentnilisty_cz.R
    ├── deliver_huffpost_com.R
    ├── deliver_telegraaf_nl.R
    ├── deliver_nytimes_com.R
    ├── deliver_nrc_nl.R
    └── deliver_cnn_com.R
├── tests
    ├── testthat.R
    ├── spelling.R
    └── testthat
    │   ├── test-rss.R
    │   ├── test-parser.R
    │   ├── test-deliver.R
    │   └── test-misc.R
├── _pkgdown.yml
├── .gitignore
├── .Rbuildignore
├── codecov.yml
├── man
    ├── pb_read_cookies.Rd
    ├── test_parser.Rd
    ├── reexports.Rd
    ├── pb_available.Rd
    ├── pb_inspect.Rd
    ├── pb_new.Rd
    ├── pb_collect_rss.Rd
    ├── html_search.Rd
    ├── pb_deliver_paper.Rd
    ├── pb_find_rss.Rd
    └── pb_deliver.Rd
├── paperboy.Rproj
├── inst
    ├── templates
    │   └── deliver_.R
    └── WORDLIST
├── submit2cran.r
└── DESCRIPTION


/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JBGruber/paperboy/HEAD/R/sysdata.rda


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(paperboy)
3 | # Run the package's testthat suite (the test files live in tests/testthat/).
4 | test_check("paperboy")
5 | 


--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://jbgruber.github.io/paperboy/  # address of the published pkgdown site
2 | template:
3 |   bootstrap: 5  # Bootstrap major version used by the site template
4 | 
5 | 


--------------------------------------------------------------------------------
/vignettes/inspect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JBGruber/paperboy/HEAD/vignettes/inspect.png


--------------------------------------------------------------------------------
/tests/spelling.R:
--------------------------------------------------------------------------------
1 | if (requireNamespace("spelling", quietly = TRUE)) {  # do nothing when the spelling package is not installed
2 |   spelling::spell_check_test(
3 |     vignettes = TRUE,
4 |     error = FALSE,  # presumably reports findings without failing the check - confirm in ?spelling::spell_check_test
5 |     skip_on_cran = TRUE
6 |   )
7 | }
8 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | # Load hook: default the option `paperboy_verbose` to TRUE when it is unset.
2 | .onLoad <- function(libname, pkgname) {
3 |   if (is.null(getOption("paperboy_verbose"))) {
4 |     options(paperboy_verbose = TRUE)
5 |   }
6 | }
7 | paperboy.env <- new.env()  # package-level environment
8 | . <- NULL  # presumably silences R CMD check notes about `.` in pipes - confirm
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .Rdata
 4 | .httr-oauth
 5 | .DS_Store
 6 | tests/spelling.Rout.save
 7 | tests/local-files
 8 | Update_package.R
 9 | test_data.rds
10 | /doc/
11 | /Meta/
12 | docs
13 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^README\.Rmd$
 4 | ^tests/local-files$
 5 | ^\.github$
 6 | ^codecov\.yml$
 7 | ^submit2cran\.r$
 8 | ^test_data\.rds$
 9 | ^doc$
10 | ^Meta$
11 | ^_pkgdown\.yml$
12 | ^docs$
13 | ^pkgdown$
14 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false  # no Codecov comments on pull requests
 2 | 
 3 | coverage:
 4 |   status:
 5 |     project:  # status check for overall project coverage
 6 |       default:
 7 |         target: auto
 8 |         threshold: 1%
 9 |         informational: true  # report the status without failing the build
10 |     patch:  # status check for the lines touched by a change
11 |       default:
12 |         target: auto
13 |         threshold: 1%
14 |         informational: true  # report the status without failing the build
15 | 


--------------------------------------------------------------------------------
/R/read_cookies.R:
--------------------------------------------------------------------------------
 1 | #' Read in cookie file
 2 | #'
 3 | #' Deprecated in favour of \link[cookiemonster]{add_cookies}.
 4 | #'
 5 | #' @param ... not used.
 6 | #' @export
 7 | pb_read_cookies <- function(...) {
 8 |   # Stub without any functionality of its own: it only raises the deprecation notice.
 9 |   moved_note <- "this functionality has been moved to the cookiemonster package. See `?cookiemonster::add_cookies`"
10 |   .Deprecated(msg = moved_note)
11 | }
12 | 


--------------------------------------------------------------------------------
/man/pb_read_cookies.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/read_cookies.R
 3 | \name{pb_read_cookies}
 4 | \alias{pb_read_cookies}
 5 | \title{Read in cookie file}
 6 | \usage{
 7 | pb_read_cookies(...)
 8 | }
 9 | \arguments{
10 | \item{...}{not used.}
11 | }
12 | \description{
13 | Deprecated in favour of \link[cookiemonster]{add_cookies}.
14 | }
15 | 


--------------------------------------------------------------------------------
/man/test_parser.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_dev.R
 3 | \name{test_parser}
 4 | \alias{test_parser}
 5 | \title{Test a Parser}
 6 | \usage{
 7 | test_parser(test_data)
 8 | }
 9 | \arguments{
10 | \item{test_data}{A data frame of raw content.}
11 | }
12 | \value{
13 | A success or failure message.
14 | }
15 | \description{
16 | Test a parser using a data frame from \link{pb_collect}.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/reexports.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \docType{import}
 4 | \name{reexports}
 5 | \alias{reexports}
 6 | \alias{\%>\%}
 7 | \title{Objects exported from other packages}
 8 | \keyword{internal}
 9 | \description{
10 | These objects are imported from other packages. Follow the links
11 | below to see their documentation.
12 | 
13 | \describe{
14 |   \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}}
15 | }}
16 | 
17 | 


--------------------------------------------------------------------------------
/paperboy.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 
22 | UseNativePipeOperator: No
23 | 


--------------------------------------------------------------------------------
/tests/testthat/test-rss.R:
--------------------------------------------------------------------------------
 1 | test_that("rss is collected", {
 2 |   # Fetches the live NYT home-page feed, so this test needs network access.
 3 |   nyt <- pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
 4 |   expect_s3_class(nyt, "data.frame")
 5 | 
 6 |   # expect_gt() is the supported replacement for the deprecated
 7 |   # expect_more_than() expectation.
 8 |   expect_gt(nrow(nyt), 0)
 9 |   # The parsed feed must hold more than one entry and expose the three
10 |   # columns checked below.
11 |   expect_equal({
12 |     c(nrow(nyt) > 1, c("title", "link", "published") %in% colnames(nyt))
13 |   }, rep(TRUE, 4))
14 | })
15 | 
16 | test_that("rss is expanded", {
17 |   # A feed URL handed to pb_collect() must yield more than one row and 5 columns.
18 |   world <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
19 |   observed <- c(nrow(world) > 1, ncol(world))
20 |   expect_equal(observed, c(1, 5))
21 | })
22 | 


--------------------------------------------------------------------------------
/man/pb_available.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{pb_available}
 4 | \alias{pb_available}
 5 | \title{Show available parsers}
 6 | \usage{
 7 | pb_available(...)
 8 | }
 9 | \arguments{
10 | \item{...}{optionally pass URLs to check if respective parser(s) is/are available.}
11 | }
12 | \value{
13 | A character vector of supported domains.
14 | }
15 | \description{
16 | Show available parsers
17 | }
18 | \examples{
19 | pb_available()
20 | pb_available("https://edition.cnn.com/",
21 |              "https://www.nytimes.com/",
22 |              "https://www.google.com/")
23 | }
24 | 


--------------------------------------------------------------------------------
/man/pb_inspect.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/inspect.R
 3 | \name{pb_inspect}
 4 | \alias{pb_inspect}
 5 | \title{Inspect content collected with pb_collect}
 6 | \usage{
 7 | pb_inspect(x, i = 1L, host_ip = "127.0.0.1", port = httpuv::randomPort())
 8 | }
 9 | \arguments{
10 | \item{x}{a data.frame returned by \link{pb_collect}.}
11 | 
12 | \item{i}{which entry to display.}
13 | 
14 | \item{host_ip, port}{host IP and port to create the temporary web server that
15 | shows the content.}
16 | }
17 | \description{
18 | Opens a browser to display the content saved in a row of a data.frame created
19 | with \link{pb_collect}.
20 | }
21 | 


--------------------------------------------------------------------------------
/man/pb_new.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils_dev.R
 3 | \name{pb_new}
 4 | \alias{pb_new}
 5 | \title{Create new scraper}
 6 | \usage{
 7 | pb_new(np, author = "", issue = "")
 8 | }
 9 | \arguments{
10 | \item{np}{domain or a URL of the newspaper this scraper is for.}
11 | 
12 | \item{author}{who wrote it.}
13 | 
14 | \item{issue}{is there a GitHub issue?}
15 | }
16 | \description{
17 | Create new scraper
18 | }
19 | \examples{
20 | \dontrun{
21 | paperboy:::pb_new(np = "https://www.buzzfeed.com/",
22 |                   author = "[@JBGruber](https://github.com/JBGruber/)")
23 | 
24 | paperboy:::pb_new_done()
25 | }
26 | }
27 | \keyword{internal}
28 | 


--------------------------------------------------------------------------------
/man/pb_collect_rss.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rss.r
 3 | \name{pb_collect_rss}
 4 | \alias{pb_collect_rss}
 5 | \title{Collect RSS feed}
 6 | \usage{
 7 | pb_collect_rss(x, parse = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{URL(s) to RSS or Atom feed(s).}
11 | 
12 | \item{parse}{Whether the results should be parsed into a data.frame. Turn off for debugging.}
13 | 
14 | \item{...}{passed to pb_collect.}
15 | }
16 | \value{
17 | a data.frame or list
18 | }
19 | \description{
20 | Collect articles from RSS or Atom feed(s)
21 | }
22 | \examples{
23 | \dontrun{
24 | pb_collect_rss("https://www.washingtonpost.com/arcio/rss/")
25 | # works with atom feeds too
26 | pb_collect_rss("https://www.nu.nl/rss")
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/testthat/test-parser.R:
--------------------------------------------------------------------------------
 1 | # Register a test that collects the feed `rss` and runs test_parser() on it.
 2 | test_parse_rss <- function(rss) {
 3 |   test_label <- paste("test:", rss)
 4 |   test_that(desc = test_label, {
 5 |     expect_no_error({
 6 |       test_df <- pb_collect(rss, collect_rss = TRUE, verbose = FALSE, timeout = 90)
 7 |       if (!any(test_df$status < 400L)) {  # no request returned a status below 400
 8 |         stop("No data could be retrieved from the RSS feed")
 9 |       }
10 |       test_parser(test_df)
11 |     })
12 |   })
13 | }
14 | 
15 | # Opt-in parser tests: run only when PB_TEST_PARSER holds a value that
16 | # as.logical() reads as TRUE. isTRUE() turns unparseable values (NA) into
17 | # "skip" instead of letting `if` abort the whole test file.
18 | if (isTRUE(as.logical(Sys.getenv("PB_TEST_PARSER", unset = "FALSE")))) {
19 |   status <- utils::read.csv(system.file("status.csv", package = "paperboy"))
20 |   # every feed listed in status.csv except the NYT home-page feed
21 |   skipped <- "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
22 |   rss_feeds <- setdiff(na.omit(status$rss), skipped)
23 |   lapply(rss_feeds, test_parse_rss)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/html_search.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/html_search.R
 3 | \name{html_search}
 4 | \alias{html_search}
 5 | \title{Search raw html for attributes}
 6 | \usage{
 7 | html_search(html, selectors, attributes = NULL, all = TRUE, n = 1L)
 8 | }
 9 | \arguments{
10 | \item{html}{raw html}
11 | 
12 | \item{selectors}{a vector of CSS selectors to include in search.}
13 | 
14 | \item{attributes}{attributes to extract. If NULL, returns text.}
15 | 
16 | \item{all}{if TRUE, all selectors are collected. Otherwise, only the first
17 | non-empty result is used.}
18 | 
19 | \item{n}{if multiple are found, how many to return}
20 | }
21 | \value{
22 | a vector of max length n
23 | }
24 | \description{
25 | Search raw html for attributes
26 | }
27 | \keyword{internal}
28 | 


--------------------------------------------------------------------------------
/R/deliver_3sat_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.3sat_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the 3sat_de class: pulls publication time, headline
 4 |     # and body text out of the raw html stored in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     # publication time is read from the datetime attribute of <time> elements
 9 |     time_nodes <- rvest::html_elements(page, "time")
10 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))
11 | 
12 |     headline_nodes <- rvest::html_elements(page, ".main-content-details h2")
13 |     headline <- rvest::html_text(headline_nodes)
14 | 
15 |     author <- "" # no author info found
16 | 
17 |     # body paragraphs are joined into a single newline-separated string
18 |     paragraphs <- rvest::html_elements(page, ".o--post-long p")
19 |     text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
20 | 
21 |     # result variables keep their names: they are handed over as bare symbols
22 |     s_n_list(datetime, author, headline, text)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/pb_deliver_paper.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deliver.R
 3 | \name{pb_deliver_paper}
 4 | \alias{pb_deliver_paper}
 5 | \title{internal function to deliver specific newspapers}
 6 | \usage{
 7 | pb_deliver_paper(x, verbose, pb, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{Either a vector of URLs or a data.frame returned by
11 | \link{pb_collect}.}
12 | 
13 | \item{verbose}{\code{FALSE} turns deliver silent. \code{TRUE} prints status
14 | messages and a progress bar on the screen. \code{2L} turns on debug mode.
15 | If \code{NULL} will be determined from
16 | \code{getOption("paperboy_verbose")}.}
17 | 
18 | \item{pb}{a progress bar object.}
19 | 
20 | \item{...}{Passed on to \link{pb_collect}.}
21 | }
22 | \description{
23 | internal function to deliver specific newspapers
24 | }
25 | \keyword{internal}
26 | 


--------------------------------------------------------------------------------
/R/deliver_tag24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the tag24_de class: reads the article fields from
 4 |     # the first JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     article <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(article$datePublished)
18 |     headline <- article$headline
19 |     author <- toString(article$author$name)
20 |     text <- article$articleBody
21 | 
22 |     # result variables keep their names: they are handed over as bare symbols
23 |     s_n_list(datetime, author, headline, text)
24 | }
25 | 


--------------------------------------------------------------------------------
/R/deliver_taz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.taz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the taz_de class: reads the article fields from the
 4 |     # third JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text2(ld_nodes)
10 | 
11 |     # fewer than three blocks means element 3 does not exist: return
12 |     # s_n_list() without any fields
13 |     if (length(ld_json) <= 2 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     article <- jsonlite::fromJSON(ld_json[3])
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 |     text <- article$articleBody
22 | 
23 |     # result variables keep their names: they are handed over as bare symbols
24 |     s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_watson_ch.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the watson_ch class: reads the article fields from
 4 |     # the second JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Element 2 is parsed below, so at least two blocks are required; with
12 |     # fewer, ld_json[2] would be NA and fromJSON() would fail.
13 |     if (length(ld_json) < 2 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     article <- jsonlite::fromJSON(ld_json[2])
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 |     text <- article$articleBody
22 | 
23 |     # result variables keep their names: they are handed over as bare symbols
24 |     s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_watson_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the watson_de class: reads the article fields from
 4 |     # the second JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Element 2 is parsed below, so at least two blocks are required; with
12 |     # fewer, ld_json[2] would be NA and fromJSON() would fail.
13 |     if (length(ld_json) < 2 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     article <- jsonlite::fromJSON(ld_json[2])
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 |     text <- article$articleBody
22 | 
23 |     # result variables keep their names: they are handed over as bare symbols
24 |     s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_epochtimes_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the epochtimes_de class: reads the article fields
 4 |     # from the second JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Element 2 is parsed below, so at least two blocks are required; with
12 |     # fewer, ld_json[2] would be NA and fromJSON() would fail.
13 |     if (length(ld_json) < 2 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     article <- jsonlite::fromJSON(ld_json[2])
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 |     text <- article$articleBody
22 | 
23 |     # result variables keep their names: they are handed over as bare symbols
24 |     s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_infranken_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the infranken_de class: reads the article fields
 4 |     # from the first JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     article <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(article$datePublished)
18 |     headline <- article$headline
19 |     author <- toString(article$author$name)
20 |     text <- article$articleBody
21 | 
22 |     # result variables keep their names: they are handed over as bare symbols
23 |     s_n_list(datetime, author, headline, text)
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/R/deliver_sueddeutsche_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the sueddeutsche_de class: reads the article fields
 4 |     # from the first JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     article <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(article$datePublished)
18 |     headline <- article$headline
19 |     author <- toString(article$author$name)
20 |     text <- article$articleBody
21 | 
22 |     # result variables keep their names: they are handed over as bare symbols
23 |     s_n_list(datetime, author, headline, text)
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/R/deliver_welt_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the welt_de class: reads the article fields from
 4 |     # the first JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     article <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(article$datePublished)
18 |     headline <- article$headline
19 |     author <- toString(article$author$name)
20 |     # drop anything that looks like an html tag, then trim outer whitespace
21 |     body_without_tags <- gsub("<[^>]+>", "", article$articleBody)
22 |     text <- trimws(body_without_tags)
23 | 
24 |     # result variables keep their names: they are handed over as bare symbols
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_berliner_kurier_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the berliner_kurier_de class: date, headline and
 4 |     # author come from the first JSON-LD <script> block, the body text from
 5 |     # the article's html elements in x$content_raw.
 6 |     pb_tick(x, verbose, pb)
 7 |     page <- rvest::read_html(x$content_raw)
 8 | 
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # Same guard as the other JSON-LD based parsers: without a block there is
13 |     # nothing fromJSON() could parse, so return s_n_list() without any fields
14 |     # instead of raising an error.
15 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
16 |         return(s_n_list())
17 |     }
18 | 
19 |     article <- jsonlite::fromJSON(ld_json[1])
20 |     datetime <- lubridate::as_datetime(article$datePublished)
21 |     headline <- article$headline
22 |     author <- toString(article$author$name)
23 | 
24 |     # lead, paragraphs and sub-headings are joined into one newline-separated string
25 |     body_nodes <- rvest::html_elements(
26 |         page,
27 |         ".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu"
28 |     )
29 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
30 | 
31 |     # result variables keep their names: they are handed over as bare symbols
32 |     s_n_list(datetime, author, headline, text)
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_tagesschau_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the tagesschau_de class: reads the article fields
 4 |     # from the first JSON-LD <script> block of the raw html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     article <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(article$datePublished)
18 |     headline <- article$headline
19 |     author <- toString(article$author$name)
20 |     # drop anything that looks like an html tag, then trim outer whitespace
21 |     body_without_tags <- gsub("<[^>]+>", "", article$articleBody)
22 |     text <- trimws(body_without_tags)
23 | 
24 |     # result variables keep their names: they are handed over as bare symbols
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_news_und_nachrichten_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Delivery method for the news_und_nachrichten_de class: reads the article
 4 |     # fields from the first JSON-LD <script> block of the html in x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 |     page <- rvest::read_html(x$content_raw)
 7 | 
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: return s_n_list() without any fields
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # carriage returns and line feeds are removed before the JSON is parsed
17 |     single_line_json <- gsub("[\r\n]*", "", ld_json[1])
18 |     article <- jsonlite::fromJSON(single_line_json)
19 |     datetime <- lubridate::as_datetime(article$datePublished)
20 |     headline <- article$headline
21 |     # the author field is used as-is here, without a $name lookup
22 |     author <- toString(article$author)
23 |     text <- article$articleBody
24 | 
25 |     # result variables keep their names: they are handed over as bare symbols
26 |     s_n_list(datetime, author, headline, text)
27 | }
28 | 


--------------------------------------------------------------------------------
/R/deliver_t3n_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the t3n_de class: every field is taken from the first
 4 |     # JSON-LD <script> block of the page.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 |     # normalise "\r\n" line breaks first, then drop every [...] span
23 |     unix_body <- gsub("\r\n", "\n", meta$articleBody)
24 |     text <- gsub("\\[.*?\\]", "", unix_body)
25 | 
26 |     s_n_list(
27 |         datetime,
28 |         author,
29 |         headline,
30 |         text
31 |     )
32 | }
33 | 


--------------------------------------------------------------------------------
/R/deliver_deutschlandfunk_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.deutschlandfunk_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the deutschlandfunk_de class: fields are scraped from the
 4 |     # html via css selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: datetime attribute of the first <time> element
11 |     time_node <- rvest::html_element(page, "time")
12 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
13 | 
14 |     headline <- rvest::html_text(rvest::html_element(page, ".headline-title"))
15 | 
16 |     # fixed value: no article with an author could be found
17 |     author <- "deutschlandfunk.de"
18 | 
19 |     # header description, detail titles and non-italic detail text, joined by
20 |     # newlines
21 |     body_nodes <- rvest::html_elements(
22 |         page,
23 |         ".article-header-description,.article-details-text:not(.u-text-italic),.article-details-title"
24 |     )
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_swr3_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.swr3_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the swr3_de class: fields are scraped from the html via css
 4 |     # selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: datetime attribute of <time> inside .meta-top
11 |     time_nodes <- rvest::html_elements(page, ".meta-top time")
12 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))
13 | 
14 |     headline <- rvest::html_text(rvest::html_elements(page, "h1.headline"))
15 | 
16 |     # all author links are collapsed into one comma-separated string
17 |     author_nodes <- rvest::html_elements(page, ".meta-top .meta-author-name a")
18 |     author <- toString(rvest::html_text2(author_nodes))
19 | 
20 |     # lead paragraph, body paragraphs and sub-headings, joined by newlines
21 |     body_nodes <- rvest::html_elements(page, "p.lead, .bodytext p, .bodytext h2")
22 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
23 | 
24 |     s_n_list(
25 |         datetime,
26 |         author,
27 |         headline,
28 |         text
29 |     )
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_swr_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.swr_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the swr_de class: fields are scraped from the html via css
 4 |     # selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: datetime attribute of the first <time> element
11 |     time_node <- rvest::html_element(page, "time")
12 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
13 | 
14 |     headline <- rvest::html_text(rvest::html_element(page, "h1.headline"))
15 | 
16 |     # all author links are collapsed into one comma-separated string
17 |     author_nodes <- rvest::html_elements(page, ".meta-top .meta-authors .meta-author-name a")
18 |     author <- toString(rvest::html_text2(author_nodes))
19 | 
20 |     # lead, body paragraphs and sub-headings, joined by newlines
21 |     body_nodes <- rvest::html_elements(page, ".detail-body .lead, .bodytext p, .bodytext h2")
22 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
23 | 
24 |     s_n_list(
25 |         datetime,
26 |         author,
27 |         headline,
28 |         text
29 |     )
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_sfgate_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.sfgate_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # Parser for the sfgate_com class: metadata is read from the content
 5 |   # attribute of sailthru.* elements, the text from all <p> elements.
 6 |   pb_tick(x, verbose, pb)
 7 | 
 8 |   # x$content_raw holds the raw html
 9 |   page <- rvest::read_html(x$content_raw)
10 | 
11 |   # publication time
12 |   date_nodes <- rvest::html_elements(page, "[name=\"sailthru.date\"]")
13 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_nodes, "content"))
14 | 
15 |   # title
16 |   title_nodes <- rvest::html_elements(page, "[property=\"sailthru.title\"]")
17 |   headline <- rvest::html_attr(title_nodes, "content")
18 | 
19 |   # authors, collapsed into one comma-separated string
20 |   author_nodes <- rvest::html_elements(page, "[name=\"sailthru.author\"]")
21 |   author <- toString(rvest::html_attr(author_nodes, "content"))
22 | 
23 |   # every paragraph of the page, joined by newlines
24 |   paragraphs <- rvest::html_elements(page, "p")
25 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
26 | 
27 |   s_n_list(
28 |     datetime,
29 |     author,
30 |     headline,
31 |     text
32 |   )
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_br_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the br_de class: fields are scraped from the html via css
 4 |     # selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: datetime attribute of the first <time> element
11 |     time_node <- rvest::html_element(page, "time")
12 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
13 | 
14 |     headline <- rvest::html_text2(rvest::html_element(page, ".heading1"))
15 | 
16 |     author_node <- rvest::html_element(page, ".ArticleModuleTeaser_authorName__Q7ctt")
17 |     author <- toString(rvest::html_text2(author_node))
18 | 
19 |     # paragraphs and sub-headings inside the first rich-text container,
20 |     # joined by newlines
21 |     rich_text <- rvest::html_element(page, ".RichText_richText__wS9Rz.body3")
22 |     body_nodes <- rvest::html_elements(rich_text, "p, h2")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(
26 |         datetime,
27 |         author,
28 |         headline,
29 |         text
30 |     )
31 | }
32 | 


--------------------------------------------------------------------------------
/inst/templates/deliver_.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.{{newspaper}} <- function(x, verbose = NULL, pb, ...) {
 3 |   # Parser skeleton: the empty "" selectors and attributes below are blanks to fill in.
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # raw html is stored in column content_raw; it is parsed once and reused below
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: attribute of the first matching element, converted with as_datetime()
11 |   datetime <- html %>%
12 |     rvest::html_element("") %>%
13 |     rvest::html_attr("") %>%
14 |     lubridate::as_datetime()
15 | 
16 |   # headline: attribute of the first matching element
17 |   headline <- html %>%
18 |     rvest::html_element("") %>%
19 |     rvest::html_attr("")
20 | 
21 |   # author: text of the first matching element, passed through toString()
22 |   author <- html %>%
23 |     rvest::html_element("")  %>%
24 |     rvest::html_text2() %>%
25 |     toString()
26 | 
27 |   # text: text of all matching elements, joined with newlines
28 |   text <- html %>%
29 |     rvest::html_elements("") %>%
30 |     rvest::html_text2() %>%
31 |     paste(collapse = "\n")
32 | 
33 |   # the helper function safely creates a named list from objects
34 |   s_n_list(
35 |     datetime,
36 |     author,
37 |     headline,
38 |     text
39 |   )
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/R/deliver_shz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the shz_de class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # matched elements are joined by newlines
24 |     body_nodes <- rvest::html_elements(page, "p.w-600, p,h2.h4")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_mdr_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the mdr_de class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # .einleitung and .paragraph elements, joined by newlines
24 |     body_nodes <- rvest::html_elements(page, ".einleitung,.paragraph")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_n-tv_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the n_tv_de class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # .article__text elements, joined by newlines
24 |     body_nodes <- rvest::html_elements(page, ".article__text")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_news_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the news_de class: every field is taken from the first
 4 |     # JSON-LD <script> block of the page.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # clean-up order matters: drop "+++ ... +++" spans and trim, normalise
24 |     # "\r\n" line breaks, then cut from "Folgen Sie" to the end of that line
25 |     without_markers <- trimws(gsub("\\+\\+\\+.*?\\+\\+\\+", "", meta$articleBody))
26 |     unix_body <- gsub("\r\n", "\n", without_markers)
27 |     text <- gsub("Folgen Sie.*", "", unix_body)
28 | 
29 |     s_n_list(
30 |         datetime,
31 |         author,
32 |         headline,
33 |         text
34 |     )
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_nw_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the nw_de class: metadata comes from the second JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # the second block is needed, so fewer than two blocks means nothing to
13 |     # parse: return an argument-less s_n_list()
14 |     if (length(ld_json) <= 1 || isTRUE(is.na(ld_json))) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     meta <- jsonlite::fromJSON(ld_json[2])
19 | 
20 |     datetime <- lubridate::as_datetime(meta$datePublished)
21 |     headline <- meta$headline
22 |     author <- toString(meta$author$name)
23 | 
24 |     # p.em_text and h2.Zwischenzeile elements, joined by newlines
25 |     body_nodes <- rvest::html_elements(page, "p.em_text,h2.Zwischenzeile")
26 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
27 | 
28 |     s_n_list(
29 |         datetime,
30 |         author,
31 |         headline,
32 |         text
33 |     )
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_morgenpost_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the morgenpost_de class: metadata comes from the second
 4 |     # JSON-LD <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 |     # raw html is stored in column content_raw
 7 |     html <- rvest::read_html(x$content_raw)
 8 | 
 9 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
10 |     # json_txt[2] is read below, so at least two blocks are required; with a
11 |     # single block json_txt[2] would be NA and jsonlite::fromJSON() would fail
12 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
13 |         return(s_n_list())
14 |     } else {
15 |         json_df <- jsonlite::fromJSON(json_txt[2])
16 | 
17 |         datetime <- lubridate::as_datetime(json_df$datePublished)
18 |         headline <- json_df$headline
19 |         author <- toString(json_df$author$name)
20 |         # paragraphs of the article body, joined by newlines
21 |         text <- html %>%
22 |             rvest::html_elements(".article-body p") %>%
23 |             rvest::html_text2() %>%
24 |             paste(collapse = "\n")
25 | 
26 |         s_n_list(
27 |             datetime,
28 |             author,
29 |             headline,
30 |             text
31 |         )
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_swrfernsehen_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.swrfernsehen_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the swrfernsehen_de class: fields are scraped from the html
 4 |     # via css selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: datetime attribute of <time> in the meta description
11 |     time_nodes <- rvest::html_elements(page, ".meta-top .meta-description time")
12 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))
13 | 
14 |     headline <- rvest::html_text(rvest::html_elements(page, "h1.headline"))
15 | 
16 |     # all author links are collapsed into one comma-separated string
17 |     author_nodes <- rvest::html_elements(page, ".meta-top .meta-author-name a")
18 |     author <- toString(rvest::html_text2(author_nodes))
19 | 
20 |     # lead, body paragraphs and sub-headings, joined by newlines
21 |     body_nodes <- rvest::html_elements(page, ".detail-body .lead,.bodytext p,.bodytext h2")
22 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
23 | 
24 |     s_n_list(
25 |         datetime,
26 |         author,
27 |         headline,
28 |         text
29 |     )
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_derstandard_at.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.derstandard_at <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the derstandard_at class: fields are scraped from the html
 4 |     # via css selectors.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 | 
10 |     # publication time: the text of .article-meta is handed to as_datetime()
11 |     meta_nodes <- rvest::html_elements(page, ".article-meta")
12 |     datetime <- lubridate::as_datetime(rvest::html_text(meta_nodes))
13 | 
14 |     headline <- rvest::html_text(rvest::html_elements(page, "h1.article-title"))
15 | 
16 |     origin_nodes <- rvest::html_elements(page, ".article-origins")
17 |     author <- toString(rvest::html_text(origin_nodes))
18 | 
19 |     # There is a note that parts of the website are blocked
20 |     body_nodes <- rvest::html_elements(page, ".article-body p, .article-body h3")
21 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
22 | 
23 |     s_n_list(
24 |         datetime,
25 |         author,
26 |         headline,
27 |         text
28 |     )
29 | }
30 | 


--------------------------------------------------------------------------------
/R/deliver_newsweek_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | 
 3 | pb_deliver_paper.newsweek_com <- function(x, verbose = NULL, pb, ...) {
 4 | 
 5 |   # Parser for the newsweek_com class: datetime and headline come from meta
 6 |   # properties, author and text from the html body.
 7 |   pb_tick(x, verbose, pb)
 8 |   # raw html is stored in column content_raw
 9 |   html <- rvest::read_html(x$content_raw)
10 | 
11 |   # datetime
12 |   datetime <- html %>%
13 |     rvest::html_elements("[property=\"article:published_time\"]") %>%
14 |     rvest::html_attr("content") %>%
15 |     lubridate::as_datetime()
16 | 
17 |   # headline
18 |   headline <- html %>%
19 |     rvest::html_elements("[property =\"og:title\"]") %>%
20 |     rvest::html_attr("content")
21 | 
22 |   # author
23 |   author <- html %>%
24 |     rvest::html_elements("[class=\"author\"]") %>%
25 |     rvest::html_text2()
26 | 
27 |   # several matching elements are collapsed into one comma-separated string so
28 |   # that author is never a vector of length > 1; results with zero or one
29 |   # match are left untouched
30 |   if (length(author) > 1) author <- toString(author)
31 | 
32 |   # text: paragraphs inside .article-body, joined by newlines
33 |   text <- html %>%
34 |     rvest::html_elements(".article-body") %>%
35 |     rvest::html_elements("p") %>%
36 |     rvest::html_text2() %>%
37 |     paste(collapse = "\n")
38 | 
39 |   s_n_list(
40 |     datetime,
41 |     author,
42 |     headline,
43 |     text
44 |   )
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/R/deliver_stern_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the stern_de class: metadata comes from the first row of the
 4 |     # first JSON-LD <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 |     # raw html is stored in column content_raw
 7 |     html <- rvest::read_html(x$content_raw)
 8 | 
 9 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
10 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
11 |         return(s_n_list())
12 |     } else {
13 |         # parse only the first block: handing several blocks to fromJSON() at
14 |         # once makes it paste them together, which is not valid JSON
15 |         json_df <- jsonlite::fromJSON(json_txt[1])[1, ]
16 | 
17 |         datetime <- lubridate::as_datetime(json_df$datePublished)
18 |         headline <- json_df$headline
19 |         author <- toString(json_df$author$name)
20 |         # .intro and .text-element elements, joined by newlines
21 |         text <- html %>%
22 |             rvest::html_elements(".intro,.text-element") %>%
23 |             rvest::html_text2() %>%
24 |             paste(collapse = "\n")
25 | 
26 | 
27 |         s_n_list(
28 |             datetime,
29 |             author,
30 |             headline,
31 |             text
32 |         )
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_tagesspiegel_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the tagesspiegel_de class: metadata comes from the first
 4 |     # JSON-LD <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # paragraphs inside #story-elements, joined by newlines
24 |     body_nodes <- rvest::html_elements(page, "#story-elements p")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_kurier_at.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the kurier_at class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # intro and copy paragraphs, joined by newlines
24 |     body_nodes <- rvest::html_elements(page, ".headerComp-intro,.paragraph.copy")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_srf_ch.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.srf_ch <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the srf_ch class: the publication time comes from a JSON
 4 |     # string stored in an attribute of span#config__js, headline and text are
 5 |     # scraped from the html via css selectors.
 6 |     pb_tick(x, verbose, pb)
 7 |     # raw html is stored in column content_raw
 8 |     html <- rvest::read_html(x$content_raw)
 9 | 
10 |     config_json <- html %>%
11 |         rvest::html_element("span#config__js") %>%
12 |         rvest::html_attr("data-analytics-webtrekk-survey-gizmo-value-object")
13 | 
14 |     # html_attr() yields NA when the element or the attribute is missing;
15 |     # passing that NA to jsonlite::fromJSON() would raise an error and lose
16 |     # headline and text as well, so fall back to a missing datetime instead
17 |     if (isTRUE(is.na(config_json))) {
18 |         datetime <- lubridate::as_datetime(NA_character_)
19 |     } else {
20 |         json_df <- jsonlite::fromJSON(config_json)
21 |         datetime <- lubridate::as_datetime(json_df$params$content_publication_datetime)
22 |     }
23 | 
24 |     headline <- html %>%
25 |         rvest::html_elements("h1 .article-title__text") %>%
26 |         rvest::html_text()
27 | 
28 |     author <- "" # no article with author info found
29 | 
30 |     # paragraphs and sub-headings of the article content, joined by newlines
31 |     text <- html %>%
32 |         rvest::html_elements(".article-content p, .article-content h2") %>%
33 |         rvest::html_text2() %>%
34 |         paste(collapse = "\n")
35 | 
36 |     s_n_list(
37 |         datetime,
38 |         author,
39 |         headline,
40 |         text
41 |     )
42 | }
43 | 


--------------------------------------------------------------------------------
/R/deliver_wz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.wz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the wz_de class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     # this parser reads the script content with html_text2()
11 |     ld_json <- rvest::html_text2(ld_nodes)
12 | 
13 |     # no usable JSON-LD block: return an argument-less s_n_list()
14 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     meta <- jsonlite::fromJSON(ld_json[1])
19 | 
20 |     datetime <- lubridate::as_datetime(meta$datePublished)
21 |     headline <- meta$headline
22 |     author <- toString(meta$author$name)
23 | 
24 |     # richtext paragraphs and sub-headings of the article, joined by newlines
25 |     body_nodes <- rvest::html_elements(page, "article p.richtext,article h2.font-sans")
26 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
27 | 
28 |     s_n_list(
29 |         datetime,
30 |         author,
31 |         headline,
32 |         text
33 |     )
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_marketwatch_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | 
 3 | pb_deliver_paper.marketwatch_com <- function(x, verbose = NULL, pb, ...) {
 4 | 
 5 |   # Parser for the marketwatch_com class: metadata is read from the content
 6 |   # attribute of meta elements, the text from <p> elements.
 7 |   pb_tick(x, verbose, pb)
 8 | 
 9 |   # x$content_raw holds the raw html
10 |   page <- rvest::read_html(x$content_raw)
11 | 
12 |   # publication time
13 |   date_nodes <- rvest::html_elements(page, "[name=\"parsely-pub-date\"]")
14 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_nodes, "content"))
15 | 
16 |   # title
17 |   title_nodes <- rvest::html_elements(page, "[property =\"og:title\"]")
18 |   headline <- rvest::html_attr(title_nodes, "content")
19 | 
20 |   # authors, collapsed into one comma-separated string
21 |   author_nodes <- rvest::html_elements(page, "[name=\"parsely-author\"]")
22 |   author <- toString(rvest::html_attr(author_nodes, "content"))
23 | 
24 |   # paragraphs whose parent is not .bio__description, joined by newlines
25 |   paragraphs <- rvest::html_elements(page, ":not(.bio__description)>p")
26 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
27 | 
28 |   s_n_list(
29 |     datetime,
30 |     author,
31 |     headline,
32 |     text
33 |   )
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_forbes_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.forbes_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # Parser for the forbes_com class: metadata is read from the content
 5 |   # attribute of meta properties, the text from all <p> elements.
 6 |   pb_tick(x, verbose, pb)
 7 | 
 8 |   # x$content_raw holds the raw html
 9 |   page <- rvest::read_html(x$content_raw)
10 | 
11 |   # publication time
12 |   date_nodes <- rvest::html_elements(page, "[property=\"article:published\"]")
13 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_nodes, "content"))
14 | 
15 |   # title
16 |   title_nodes <- rvest::html_elements(page, "[property=\"og:title\"]")
17 |   headline <- rvest::html_attr(title_nodes, "content")
18 | 
19 |   # authors: only collapsed into one string when there is more than one
20 |   author_nodes <- rvest::html_elements(page, "[property=\"article:author\"]")
21 |   author <- rvest::html_attr(author_nodes, "content")
22 |   if (length(author) > 1) {
23 |     author <- toString(author)
24 |   }
25 | 
26 |   # every paragraph of the page, joined by newlines
27 |   paragraphs <- rvest::html_elements(page, "p")
28 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
29 | 
30 |   s_n_list(
31 |     datetime,
32 |     author,
33 |     headline,
34 |     text
35 |   )
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_frankenpost_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.frankenpost_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the frankenpost_de class: metadata comes from the first
 4 |     # JSON-LD <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 |     # raw html is stored in column content_raw
 7 |     html <- rvest::read_html(x$content_raw)
 8 | 
 9 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2()
10 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
11 |         return(s_n_list())
12 |     } else {
13 |         # parse only the first block: handing several blocks to fromJSON() at
14 |         # once makes it paste them together, which is not valid JSON
15 |         json_df <- jsonlite::fromJSON(json_txt[1])
16 | 
17 |         datetime <- lubridate::as_datetime(json_df$datePublished)
18 |         headline <- json_df$headline
19 |         author <- toString(json_df$author$name)
20 |         # paragraphs and sub-headings of the article text, joined by newlines
21 |         text <- html %>%
22 |             rvest::html_elements(".article-text p, .article-text h2") %>%
23 |             rvest::html_text2() %>%
24 |             paste(collapse = "\n")
25 | 
26 |         s_n_list(
27 |             datetime,
28 |             author,
29 |             headline,
30 |             text
31 |         )
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_orf_at.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the orf_at class: metadata comes from the first JSON-LD
 4 |     # <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # no usable JSON-LD block: return an argument-less s_n_list()
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])
18 | 
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author$name)
22 | 
23 |     # lead text, story paragraphs and sub-headings, joined by newlines
24 |     body_nodes <- rvest::html_elements(page, ".story-lead-text,.story-story p,.story-story h2")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(
28 |         datetime,
29 |         author,
30 |         headline,
31 |         text
32 |     )
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_presseportal_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for the presseportal_de class: metadata comes from the second
 4 |     # JSON-LD <script> block, the article text from the html itself.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     # x$content_raw holds the raw html
 8 |     page <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
10 |     # this parser reads the script content with html_text2()
11 |     ld_json <- rvest::html_text2(ld_nodes)
12 | 
13 |     # the second block is needed, so fewer than two blocks means nothing to
14 |     # parse: return an argument-less s_n_list()
15 |     if (length(ld_json) <= 1 || isTRUE(is.na(ld_json))) {
16 |         return(s_n_list())
17 |     }
18 | 
19 |     meta <- jsonlite::fromJSON(ld_json[2])
20 | 
21 |     datetime <- lubridate::as_datetime(meta$datePublished)
22 |     headline <- meta$headline
23 |     author <- toString(meta$author$name)
24 | 
25 |     # class-less paragraphs of article.story, joined by newlines
26 |     body_nodes <- rvest::html_elements(page, "article.story p:not([class])")
27 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
28 | 
29 |     s_n_list(
30 |         datetime,
31 |         author,
32 |         headline,
33 |         text
34 |     )
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_berliner_zeitung_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: hand back s_n_list() without arguments
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, ".article_paragraph__hXYKJ")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_kabeleins_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from the matched paragraph and heading nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_nordkurier_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from the matched title, paragraph and heading nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_swp_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from the matched header, paragraph and headline nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 |         datetime <- lubridate::as_datetime(json_df$datePublished)
19 |         headline <- json_df$headline
20 |         # several author names are collapsed into one comma-separated string
21 |         author <- toString(json_df$author$name)
22 |         text <- html %>%
23 |             rvest::html_elements(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>%
24 |             rvest::html_text2() %>%
25 |             paste(collapse = "\n")
26 | 
27 |         s_n_list(
28 |             datetime,
29 |             author,
30 |             headline,
31 |             text
32 |         )
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_noz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: hand back s_n_list() without arguments
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, "p.w-600,section.content--group p, section.content--group h2")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_dailymail_co_uk.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.dailymail_co_uk <- function(x, verbose = NULL, pb, ...) {
 3 |   # Builds the datetime/author/headline/text list from the raw html in
 4 |   # x$content_raw, reading metadata from attribute-selected elements.
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |   # publication time: "content" attribute of the published_time elements
10 |   published_nodes <- rvest::html_elements(doc, "[property=\"article:published_time\"]")
11 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_nodes, "content"))
12 | 
13 |   # headline: "content" attribute of the mol:headline elements
14 |   headline_nodes <- rvest::html_elements(doc, "[property =\"mol:headline\"]")
15 |   headline <- rvest::html_attr(headline_nodes, "content")
16 | 
17 |   # author: "content" attribute of the elements named author
18 |   author_nodes <- rvest::html_elements(doc, "[name =\"author\"]")
19 |   author <- rvest::html_attr(author_nodes, "content")
20 | 
21 |   # text: paragraphs inside the articleBody container, joined with newlines
22 |   body_nodes <- rvest::html_elements(doc, "[itemprop=\"articleBody\"]")
23 |   paragraph_nodes <- rvest::html_elements(body_nodes, "p")
24 |   text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")
25 | 
26 |   s_n_list(datetime, author, headline, text)
27 | }
28 | 


--------------------------------------------------------------------------------
/R/deliver_denverpost_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.denverpost_com <- function(x, verbose = NULL, pb, ...) {
 3 |   # Builds the datetime/author/headline/text list from the raw html in
 4 |   # x$content_raw.
 5 | 
 6 |   # advance the progress bar first
 7 |   pb_tick(x, verbose, pb)
 8 | 
 9 |   doc <- rvest::read_html(x$content_raw)
10 | 
11 |   # datetime: "datetime" attribute of the first <time> element
12 |   time_node <- rvest::html_element(doc, "time")
13 |   datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
14 | 
15 |   # headline: text of the first .dfm-title element
16 |   headline <- rvest::html_text2(rvest::html_element(doc, ".dfm-title"))
17 | 
18 |   # author: text of the first .author-name element, as a single string
19 |   author_node <- rvest::html_element(doc, ".author-name")
20 |   author <- toString(rvest::html_text2(author_node))
21 | 
22 |   # text: article-body paragraphs joined with newlines
23 |   paragraph_nodes <- rvest::html_elements(doc, ".article-body p")
24 |   text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")
25 | 
26 |   # s_n_list() assembles the result list from the four objects
27 |   s_n_list(datetime, author, headline, text)
28 | }
29 | 


--------------------------------------------------------------------------------
/R/deliver_schwaebische_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.schwaebische_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the second JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text2(ld_nodes)
10 | 
11 |     # fewer than two scripts: hand back s_n_list() without arguments
12 |     if (length(ld_json) <= 1 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[2])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, ".tw-text-title-md, p.paragraph, h2.tw-mb-4")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_finanzen_net.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from the matched paragraph and heading nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements("p.h3, .news-container__text p, .news-container__text h2") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_nzz_ch.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from the matched lead, subtitle and component nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_thueringer_allgemeine_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from paragraphs and h3 headings in .article-body.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements(".article-body p, .article-body h3") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_vice_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the second JSON-LD script on the
 5 |     # page, the text from paragraphs and h2 headings in .entry-content.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the second script is parsed below; with fewer than two scripts
13 |     # json_txt[2] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[2])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         # the h2 part previously read ".entry-content entry-content h2",
24 |         # which targets an <entry-content> element and never matched
25 |         text <- html %>%
26 |             rvest::html_elements(".entry-content.entry-content p,.entry-content.entry-content h2") %>%
27 |             rvest::html_text2() %>%
28 |             paste(collapse = "\n")
29 | 
30 |         s_n_list(
31 |             datetime,
32 |             author,
33 |             headline,
34 |             text
35 |         )
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_rbb24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rbb24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; the date is parsed out of the technical line text.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |     # reduce the line info to "dd.mm.yy HH:MM" before parsing it
10 |     info_nodes <- rvest::html_elements(doc, ".technicalline .lineinfo")
11 |     stamp <- gsub(
12 |         ".*(\\d{2}\\.\\d{2}\\.\\d{2}) \\| (\\d{2}:\\d{2}).*",
13 |         "\\1 \\2",
14 |         rvest::html_text2(info_nodes)
15 |     )
16 |     # This will not be the correct timezone
17 |     datetime <- lubridate::as_datetime(stamp, format = "%d.%m.%y %H:%M", tz = "UTC")
18 | 
19 |     headline <- rvest::html_text2(rvest::html_elements(doc, ".titletext"))
20 | 
21 |     # no article with author info found
22 |     author <- ""
23 | 
24 |     # article body: matched nodes joined with newlines
25 |     body_nodes <- rvest::html_elements(doc, ".shorttext p, .textblock p, h4.texttitle")
26 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
27 | 
28 |     s_n_list(datetime, author, headline, text)
29 | }
30 | 


--------------------------------------------------------------------------------
/R/deliver_rp_online_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: hand back s_n_list() without arguments
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, "strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_spiegel_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.spiegel_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |     # datetime: looked up through the html_search() helper
10 |     time_value <- doc %>% html_search(c("time"), c("datetime"))
11 |     datetime <- lubridate::as_datetime(time_value)
12 | 
13 |     # headline: aria-label attribute of the first <article>
14 |     article_node <- rvest::html_element(doc, "article")
15 |     headline <- rvest::html_attr(article_node, "aria-label")
16 | 
17 |     # author: content of the author meta tag, as a single string
18 |     author_node <- rvest::html_element(doc, "meta[name=\"author\"]")
19 |     author <- toString(rvest::html_attr(author_node, "content"))
20 | 
21 |     # text: text-area divs joined with newlines
22 |     text_nodes <- rvest::html_elements(doc, "div[data-area = \"text\"]")
23 |     text <- paste(rvest::html_text2(text_nodes), collapse = "\n")
24 | 
25 |     # s_n_list() assembles the result list from the four objects
26 |     s_n_list(datetime, author, headline, text)
27 | }
28 | 


--------------------------------------------------------------------------------
/R/deliver_volksstimme_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: hand back s_n_list() without arguments
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, ".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_independent_co_uk.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.independent_co_uk <- function(x, verbose = NULL, pb, ...) {
 3 |   # Builds the datetime/author/headline/text list from the raw html in
 4 |   # x$content_raw.
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |   # datetime: "content" attribute of the first date element
10 |   date_node <- rvest::html_element(doc, "[property=\"date\"]")
11 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_node, "content"))
12 | 
13 |   # headline: text of the h1 inside #articleHeader
14 |   headline <- rvest::html_text2(rvest::html_element(doc, "#articleHeader h1"))
15 | 
16 |   # author: "content" attribute of the author_name element, as one string
17 |   author_node <- rvest::html_element(doc, "[property=\"article:author_name\"]")
18 |   author <- toString(rvest::html_attr(author_node, "content"))
19 | 
20 |   # text: paragraphs inside #main joined with newlines
21 |   paragraph_nodes <- rvest::html_elements(doc, "#main p")
22 |   text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")
23 | 
24 |   # s_n_list() assembles the result list from the four objects
25 |   s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_bild_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.bild_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |     # datetime: "datetime" attribute of the first <time> element
10 |     time_node <- rvest::html_element(doc, "time")
11 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
12 | 
13 |     # headline: text of the headline elements
14 |     headline <- rvest::html_text(rvest::html_elements(doc, ".document-title__headline"))
15 | 
16 |     # author: all author names collapsed into one string
17 |     author_nodes <- rvest::html_elements(doc, ".author__name")
18 |     author <- toString(rvest::html_text(author_nodes))
19 | 
20 |     # text: article-body elements joined with newlines
21 |     body_nodes <- rvest::html_elements(doc, ".article-body")
22 |     text <- paste(rvest::html_text(body_nodes), collapse = "\n")
23 | 
24 |     # s_n_list() assembles the result list from the four objects
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_abendzeitung_muenchen_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # nothing to parse: hand back s_n_list() without arguments
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[1])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, ".artdetail_short ,.artdetail_text p,.artdetail_text h2")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_deutschlandfunkkultur_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.deutschlandfunkkultur_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 | 
 9 |     # datetime: "datetime" attribute of the first <time> element
10 |     time_node <- rvest::html_element(doc, "time")
11 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
12 | 
13 |     # headline: first node matching either title selector
14 |     headline_node <- rvest::html_element(doc, ".headline-title,.section-article-head-area-title")
15 |     headline <- rvest::html_text(headline_node)
16 | 
17 |     # author: text of the first author header node
18 |     author <- rvest::html_text(rvest::html_element(doc, ".article-header-author"))
19 | 
20 |     # text: description, title and non-italic detail nodes joined with newlines
21 |     body_nodes <- rvest::html_elements(doc, ".section-article-head-area-description,.article-header-description,.article-details-text:not(.u-text-italic),.article-details-title")
22 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
23 | 
24 |     s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_dnn_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.dnn_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the third JSON-LD script.
 5 |     pb_tick(x, verbose, pb)
 6 | 
 7 |     doc <- rvest::read_html(x$content_raw)
 8 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text2(ld_nodes)
10 | 
11 |     # fewer than three scripts: hand back s_n_list() without arguments
12 |     if (length(ld_json) <= 2 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     meta <- jsonlite::fromJSON(ld_json[3])
17 |     datetime <- lubridate::as_datetime(meta$datePublished)
18 |     headline <- meta$headline
19 |     author <- toString(meta$author$name)
20 | 
21 |     # article body: matched nodes joined with newlines
22 |     body_nodes <- rvest::html_elements(doc, ".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 p,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 h2")
23 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |     s_n_list(datetime, author, headline, text)
26 | }
27 | 


--------------------------------------------------------------------------------
/R/deliver_fr_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the mainEntity field of the
 5 |     # first JSON-LD script.
 6 |     pb_tick(x, verbose, pb)
 7 | 
 8 |     doc <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # nothing to parse: hand back s_n_list() without arguments
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     meta <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(meta$datePublished)
19 |     headline <- meta$headline
20 |     author <- toString(meta$author$name)
21 | 
22 |     # article body: lead text and paragraphs joined with newlines
23 |     body_nodes <- rvest::html_elements(doc, ".id-StoryElement-leadText,.id-StoryElement-paragraph")
24 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
25 | 
26 |     s_n_list(datetime, author, headline, text)
27 | }
28 | 


--------------------------------------------------------------------------------
/R/deliver_maz_online_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Extracts datetime, author, headline and text from the raw html in
 4 |     # x$content_raw. Metadata comes from the third JSON-LD script on the
 5 |     # page, the text from the matched header and article nodes.
 6 |     # Returns s_n_list() without arguments when that script is missing.
 7 |     pb_tick(x, verbose, pb)
 8 |     # raw html is stored in column content_raw
 9 |     html <- rvest::read_html(x$content_raw)
10 | 
11 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
12 |     # the third script is parsed below; with fewer than three scripts
13 |     # json_txt[3] would be NA and jsonlite::fromJSON() would fail
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) <= 2) {
15 |         return(s_n_list())
16 |     } else {
17 |         json_df <- jsonlite::fromJSON(json_txt[3])
18 | 
19 |         datetime <- lubridate::as_datetime(json_df$datePublished)
20 |         headline <- json_df$headline
21 |         # several author names are collapsed into one comma-separated string
22 |         author <- toString(json_df$author$name)
23 |         text <- html %>%
24 |             rvest::html_elements("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>%
25 |             rvest::html_text2() %>%
26 |             paste(collapse = "\n")
27 | 
28 |         s_n_list(
29 |             datetime,
30 |             author,
31 |             headline,
32 |             text
33 |         )
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_t_online_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Builds the datetime/author/headline/text list from the raw html in
 4 |     # x$content_raw; metadata is taken from the first row of the @graph
 5 |     # field of the first JSON-LD script.
 6 |     pb_tick(x, verbose, pb)
 7 | 
 8 |     doc <- rvest::read_html(x$content_raw)
 9 |     ld_nodes <- rvest::html_elements(doc, "script[type = \"application/ld+json\"] ")
10 |     ld_json <- rvest::html_text(ld_nodes)
11 | 
12 |     # nothing to parse: hand back s_n_list() without arguments
13 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     parsed <- jsonlite::fromJSON(ld_json[1])
18 |     meta <- parsed$`@graph`[1, ]
19 |     datetime <- lubridate::as_datetime(meta$datePublished)
20 |     headline <- meta$headline
21 |     author <- toString(meta$author[[1]]$name)
22 | 
23 |     # article body: stream-layout paragraphs joined with newlines
24 |     body_nodes <- rvest::html_elements(doc, "div[data-testid=\"ArticleBody.StreamLayout\"] p")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(datetime, author, headline, text)
28 | }
29 | 


--------------------------------------------------------------------------------
/R/deliver_wsj_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | 
 3 | pb_deliver_paper.wsj_com <- function(x, verbose = NULL, pb, ...) {
 4 |   # Builds the datetime/author/headline/text list from the raw html in
 5 |   # x$content_raw.
 6 |   pb_tick(x, verbose, pb)
 7 | 
 8 |   doc <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: first parsed value among the article.published elements
11 |   published_nodes <- rvest::html_elements(doc, "[name=\"article.published\"]")
12 |   published <- lubridate::as_datetime(rvest::html_attr(published_nodes, "content"))
13 |   datetime <- utils::head(published, 1L)
14 | 
15 |   # headline: text of the <title> elements joined with newlines
16 |   title_nodes <- rvest::html_elements(doc, "title")
17 |   headline <- paste(rvest::html_text(title_nodes), collapse = "\n")
18 | 
19 |   # author: "content" of all author elements collapsed into one string
20 |   author_nodes <- rvest::html_elements(doc, "[name=\"author\"]")
21 |   author <- toString(rvest::html_attr(author_nodes, "content"))
22 | 
23 |   # text: every paragraph except those with a footer id, joined with newlines
24 |   paragraph_nodes <- rvest::html_elements(doc, "p:not([id|=\"footer\"])")
25 |   text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")
26 | 
27 |   s_n_list(datetime, author, headline, text)
28 | }
29 | 


--------------------------------------------------------------------------------
/R/deliver_badische_zeitung_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar before doing any parsing.
 4 |     pb_tick(x, verbose, pb)
 5 | 
 6 |     # The stored page source is re-encoded from ISO-8859-1 to UTF-8 and parsed.
 7 |     utf8_source <- iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")
 8 |     page <- rvest::read_html(utf8_source)
 9 | 
10 |     # Text of every embedded JSON-LD <script> tag.
11 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
12 |     ld_json <- rvest::html_text(ld_nodes)
13 | 
14 |     # Guard clause: without JSON-LD metadata return the empty result.
15 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
16 |         return(s_n_list())
17 |     }
18 | 
19 |     # Metadata is read from the first JSON-LD block.
20 |     article <- jsonlite::fromJSON(ld_json[1])
21 |     datetime <- lubridate::as_datetime(article$datePublished)
22 |     headline <- article$headline
23 |     author <- toString(article$author)
24 | 
25 |     # Body text: the article section(s) and topic element(s), one per line.
26 |     body_nodes <- rvest::html_elements(
27 |         page,
28 |         "section[role = \"article\"], .article-site__topic"
29 |     )
30 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
31 | 
32 |     s_n_list(datetime, author, headline, text)
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_heise_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.heise_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 | 
11 |     # No JSON-LD metadata: return the empty result. The scalar, short-circuit
12 |     # `||` is used because `if` needs a single TRUE/FALSE; the vectorised `|`
13 |     # is meant for element-wise logic.
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     # Metadata is read from the first JSON-LD block.
19 |     json_df <- jsonlite::fromJSON(json_txt[1])
20 |     datetime <- lubridate::as_datetime(json_df$datePublished)
21 |     headline <- json_df$headline
22 |     author <- toString(json_df$author$name)
23 | 
24 |     # Body text: lead and article-content elements, one per line.
25 |     text <- html %>%
26 |         rvest::html_elements("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>%
27 |         rvest::html_text2() %>%
28 |         paste(collapse = "\n")
29 | 
30 |     s_n_list(
31 |         datetime,
32 |         author,
33 |         headline,
34 |         text
35 |     )
36 | }
37 | 


--------------------------------------------------------------------------------
/R/deliver_tz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, crosshead and paragraph elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_wa_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_fnp_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_hna_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_ruhr24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, crosshead and paragraph elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_waz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: the metadata is read from the second JSON-LD block, so
12 |     # pages with zero or one block (including a single NA) yield the empty
13 |     # result.
14 |     if (length(ld_json) < 2) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     article <- jsonlite::fromJSON(ld_json[2])
19 |     datetime <- lubridate::as_datetime(article$datePublished)
20 |     headline <- article$headline
21 |     author <- toString(article$author$name)
22 | 
23 |     # Body text: paragraphs and h3 headings inside .article-body, one per line.
24 |     body_nodes <- rvest::html_elements(page, ".article-body p,.article-body h3")
25 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |     s_n_list(datetime, author, headline, text)
28 | }
29 | 


--------------------------------------------------------------------------------
/R/deliver_echo24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_manager_magazin_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # From the first JSON-LD block keep only the rows typed "NewsArticle".
17 |     entries <- jsonlite::fromJSON(ld_json[1])
18 |     is_article <- entries$`@type` == "NewsArticle"
19 |     article <- entries[is_article, ]
20 | 
21 |     datetime <- lubridate::as_datetime(article$datePublished)
22 |     headline <- article$headline
23 |     author <- toString(article$author$name)
24 | 
25 |     # Body text: lead plus rich-text paragraphs and h3 headings, one per line.
26 |     body_nodes <- rvest::html_elements(page, ".leading-loose, .RichText p, .RichText h3")
27 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_merkur_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_heidelberg24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_kreiszeitung_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
 9 |     ld_json <- rvest::html_text(ld_nodes)
10 | 
11 |     # Guard clause: without JSON-LD metadata return the empty result.
12 |     if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Article metadata is read from mainEntity of the first JSON-LD block.
17 |     article <- jsonlite::fromJSON(ld_json[1])$mainEntity
18 |     datetime <- lubridate::as_datetime(article$datePublished)
19 |     headline <- article$headline
20 |     author <- toString(article$author$name)
21 | 
22 |     # Body text: lead text, paragraph and crosshead elements, one per line.
23 |     story_nodes <- rvest::html_elements(
24 |         page,
25 |         ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
26 |     )
27 |     text <- paste(rvest::html_text2(story_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_latimes_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.latimes_com <- function(x, verbose = NULL, pb, ...) {
 3 |   # Advance the progress bar, then parse the page source kept in content_raw.
 4 |   pb_tick(x, verbose, pb)
 5 |   html <- rvest::read_html(x$content_raw)
 6 | 
 7 |   # datetime: content of the article:published_time meta element(s)
 8 |   datetime <- html %>%
 9 |     rvest::html_elements("[property=\"article:published_time\"]") %>%
10 |     rvest::html_attr("content") %>%
11 |     lubridate::as_datetime()
12 | 
13 |   # headline: content of the og:title meta element(s)
14 |   headline <- html %>%
15 |     rvest::html_elements("[property=\"og:title\"]") %>%
16 |     rvest::html_attr("content")
17 | 
18 |   # author: text of the .authors element(s), comma-separated, with line breaks
19 |   # and the "By" byline prefix removed. "By" is only dropped when it starts a
20 |   # word and is followed by whitespace, an upper-case letter or the end of the
21 |   # string, so names that merely begin with it (e.g. "Byron", "Byrne") are
22 |   # kept intact; a blanket fixed-string removal would turn "Byron" into "ron".
23 |   author <- html %>%
24 |     rvest::html_elements(".authors") %>%
25 |     rvest::html_text() %>%
26 |     toString() %>%
27 |     gsub("\n", "", .) %>%
28 |     gsub("\\bBy(?=\\s|[A-Z]|$)", "", ., perl = TRUE) %>%
29 |     trimws()
30 | 
31 |   # text: direct <p> children of the article containers, one per line
32 |   text <- html %>%
33 |     rvest::html_elements(".page-article-container>p,.rich-text-body>p") %>%
34 |     rvest::html_text2() %>%
35 |     paste(collapse = "\n")
36 | 
37 |   s_n_list(
38 |     datetime,
39 |     author,
40 |     headline,
41 |     text
42 |   )
43 | }
44 | 


--------------------------------------------------------------------------------
/R/deliver_techrepublic_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.techrepublic_com <- function(x, verbose = NULL, pb, ...) {
 3 |   # Advance the progress bar, then parse the page source kept in content_raw.
 4 |   pb_tick(x, verbose, pb)
 5 |   page <- rvest::read_html(x$content_raw)
 6 | 
 7 |   # Publication time: content of the first article:published_time meta element.
 8 |   published_node <- rvest::html_element(page, "[property=\"article:published_time\"]")
 9 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
10 | 
11 |   # Headline: content of the first og:title meta element.
12 |   title_node <- rvest::html_element(page, "[property=\"og:title\"]")
13 |   headline <- rvest::html_attr(title_node, "content")
14 | 
15 |   # Author: content of the first "author" meta element, as a string.
16 |   author_node <- rvest::html_element(page, "[name=\"author\"]")
17 |   author <- toString(rvest::html_attr(author_node, "content"))
18 | 
19 |   # Text: the article summary and every <section>, one per line.
20 |   body_nodes <- rvest::html_elements(page, ".article-summary,section")
21 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
22 | 
23 |   # Hand the extracted fields to the package's list-building helper.
24 |   s_n_list(datetime, author, headline, text)
25 | }
26 | 


--------------------------------------------------------------------------------
/R/deliver_augsburger_allgemeine.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.augsburger_allgemeine_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     page <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Publication time: datetime attribute of the first <time> element.
 8 |     time_node <- rvest::html_element(page, "time")
 9 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
10 | 
11 |     # Headline: text of the first matching headline <h2>.
12 |     headline_node <- rvest::html_element(
13 |         page,
14 |         "h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife"
15 |     )
16 |     headline <- rvest::html_text(headline_node)
17 | 
18 |     # Author: text of every author link, comma-separated.
19 |     author_nodes <- rvest::html_elements(page, "a.typo-author-link")
20 |     author <- toString(rvest::html_text2(author_nodes))
21 | 
22 |     # Body text: teaser, paid-content body, subheads and small paragraphs.
23 |     # Duplicates are dropped (the teaser might be duplicated) before the
24 |     # pieces are joined with newlines.
25 |     body_nodes <- rvest::html_elements(
26 |         page,
27 |         ".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs"
28 |     )
29 |     body_parts <- unique(rvest::html_text2(body_nodes))
30 |     text <- paste(body_parts, collapse = "\n")
31 | 
32 |     s_n_list(datetime, author, headline, text)
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_lvz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 | 
11 |     # The metadata is read from the third JSON-LD block. Pages with fewer
12 |     # than three blocks yield the empty result; indexing a missing third
13 |     # element would hand NA to the JSON parser.
14 |     if (length(json_txt) < 3) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     json_df <- jsonlite::fromJSON(json_txt[3])
19 | 
20 |     datetime <- lubridate::as_datetime(json_df$datePublished)
21 |     headline <- json_df$headline
22 |     author <- toString(json_df$author$name)
23 | 
24 |     # Body text: headline and text elements inside the article body wrapper,
25 |     # one per line.
26 |     text <- html %>%
27 |         rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>%
28 |         rvest::html_text2() %>%
29 |         paste(collapse = "\n")
30 | 
31 |     s_n_list(
32 |         datetime,
33 |         author,
34 |         headline,
35 |         text
36 |     )
37 | }
38 | 


--------------------------------------------------------------------------------
/man/pb_find_rss.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rss.r
 3 | \name{pb_find_rss}
 4 | \alias{pb_find_rss}
 5 | \title{Find RSS feed on a newspaper's website}
 6 | \usage{
 7 | pb_find_rss(x, use = c("main", "suffixes", "feedly"))
 8 | }
 9 | \arguments{
10 | \item{x}{main domain of the newspaper site to check for RSS feeds.}
11 | 
12 | \item{use}{which steps to include in the search (see Details). Default is to
13 | include all.}
14 | }
15 | \value{
16 | A URL to the RSS feed(s) or NULL if nothing is found
17 | }
18 | \description{
19 | Find RSS feed on a newspaper's website
20 | }
21 | \details{
22 | Uses a three step heuristic to find RSS feeds:
23 | \enumerate{
24 | \item Scrapes the main page (without any paths) to see if the RSS feed is
25 | advertised
26 | \item Checks a number of common paths where sites put their RSS feeds
27 | \item Queries the \href{https://feedly.com/}{feedly.com} API for feeds associated
28 | with a page
29 | }
30 | }
31 | \examples{
32 | pb_find_rss("https://www.buzzfeed.com/")
33 | }
34 | \references{
35 | Approach inspired by \url{https://github.com/mediacloud/feed_seeker}
36 | }
37 | 


--------------------------------------------------------------------------------
/R/deliver_foxbusiness_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.foxbusiness_com <- function(x, verbose = NULL, pb, ...) {
 3 |   # Advance the progress bar, then parse the page source kept in content_raw.
 4 |   pb_tick(x, verbose, pb)
 5 |   page <- rvest::read_html(x$content_raw)
 6 | 
 7 |   # Publication time: content of the dcterms.created meta element(s).
 8 |   created_nodes <- rvest::html_elements(page, "[name=\"dcterms.created\"]")
 9 |   datetime <- lubridate::as_datetime(rvest::html_attr(created_nodes, "content"))
10 | 
11 |   # Headline: content of the og:title meta element(s).
12 |   title_nodes <- rvest::html_elements(page, "[property=\"og:title\"]")
13 |   headline <- rvest::html_attr(title_nodes, "content")
14 | 
15 |   # Author: byline text with every literal "By " removed, trimmed per
16 |   # element and then joined comma-separated.
17 |   byline_nodes <- rvest::html_elements(page, ".author,.author-byline")
18 |   bylines <- gsub("By ", "", rvest::html_text2(byline_nodes), fixed = TRUE)
19 |   author <- toString(trimws(bylines))
20 | 
21 |   # Text: the article content container(s), one per line.
22 |   body_nodes <- rvest::html_elements(page, ".article-content")
23 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
24 | 
25 |   s_n_list(datetime, author, headline, text)
26 | }
27 | 
28 | 
29 | # foxnews.com pages are handled by the same parser
30 | pb_deliver_paper.foxnews_com <- pb_deliver_paper.foxbusiness_com
31 | 
32 | 


--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
 1 | CMD
 2 | Codecov
 3 | Datenschutzerklärung
 4 | Guide’s
 5 | Lifecycle
 6 | Nutzungsbedingungen
 7 | POSIXct
 8 | ac
 9 | aktualne
10 | anotherangryvoice
11 | bbc
12 | blesk
13 | blogspot
14 | boston
15 | bostonglobe
16 | breitbart
17 | buzzfeed
18 | cbsileads
19 | cbslnk
20 | cbsnews
21 | ceskatelevize
22 | cnet
23 | cnn
24 | com
25 | csv
26 | cz
27 | dailymail
28 | datetime
29 | denikn
30 | doctype
31 | eu
32 | evolvepolitics
33 | faz
34 | feedly
35 | forbes
36 | foxbusiness
37 | foxnews
38 | ftw
39 | geenstijl
40 | hn
41 | huffingtonpost
42 | huffpost
43 | idnes
44 | irozhlas
45 | itemscope
46 | latimes
47 | lidovky
48 | lnk
49 | marketwatch
50 | mediacloud
51 | mediacourant
52 | metronieuws
53 | msnbc
54 | newsweek
55 | nl
56 | nos
57 | novinky
58 | nrc
59 | nypost
60 | nytimes
61 | org
62 | pagesix
63 | parlamentnilisty
64 | pb
65 | seznamzpravy
66 | sfgate
67 | skwawkbox
68 | stri
69 | stringi
70 | techrepublic
71 | telegraaf
72 | thecanary
73 | theguardian
74 | thelily
75 | thismorningwithgordondeal
76 | tibble
77 | tribpub
78 | uk
79 | un
80 | urls
81 | usatoday
82 | volkskrant
83 | washingtonpost
84 | webscraper
85 | webscraping
86 | wsj
87 | ’A
88 | 


--------------------------------------------------------------------------------
/R/deliver_saechsische_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 | 
11 |     # The metadata is read from the third JSON-LD block, so at least three
12 |     # blocks are required. (The previous check was inverted: it bailed out
13 |     # whenever two or more blocks existed and otherwise indexed a third
14 |     # element that could not be there.)
15 |     if (length(json_txt) < 3) {
16 |         return(s_n_list())
17 |     }
18 | 
19 |     json_df <- jsonlite::fromJSON(json_txt[3])
20 | 
21 |     datetime <- lubridate::as_datetime(json_df$datePublished)
22 |     headline <- json_df$headline
23 |     author <- toString(json_df$author$name)
24 | 
25 |     # Body text: text and headline elements, one per line.
26 |     text <- html %>%
27 |         rvest::html_elements(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>%
28 |         rvest::html_text2() %>%
29 |         paste(collapse = "\n")
30 | 
31 |     s_n_list(
32 |         datetime,
33 |         author,
34 |         headline,
35 |         text
36 |     )
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_abendblatt_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 | 
11 |     # The metadata is read from the second JSON-LD block. Pages with fewer
12 |     # than two blocks yield the empty result; indexing a missing second
13 |     # element would hand NA to the JSON parser.
14 |     if (length(json_txt) < 2) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     json_df <- jsonlite::fromJSON(json_txt[2])
19 | 
20 |     datetime <- lubridate::as_datetime(json_df$datePublished)
21 |     headline <- json_df$headline
22 |     author <- toString(json_df$author$name)
23 | 
24 |     # Body text: h3 headings and paragraphs inside .article-body, one per line.
25 |     text <- html %>%
26 |         rvest::html_elements(".article-body h3, .article-body p") %>%
27 |         rvest::html_text2() %>%
28 |         paste(collapse = "\n")
29 | 
30 |     s_n_list(
31 |         datetime,
32 |         author,
33 |         headline,
34 |         text
35 |     )
36 | }
37 | # rss feed includes pages that cannot be parsed because they are subpages
38 | # rss feed also includes podcast, which cannot be parsed
39 | 


--------------------------------------------------------------------------------
/R/deliver_zeit_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.zeit_de <- function(x, verbose = NULL, pb, ...) {
 3 |   # Advance the progress bar, then parse the page source kept in content_raw.
 4 |   pb_tick(x, verbose, pb)
 5 |   page <- rvest::read_html(x$content_raw)
 6 | 
 7 |   # Publication time: looked up through the package helper html_search()
 8 |   # using the selector/attribute pairs below, then parsed as a date-time.
 9 |   date_selectors <- c(
10 |     ".metadata__date>time",
11 |     "meta[name=\"date\"]"
12 |   )
13 |   date_attributes <- c("datetime", "content")
14 |   raw_date <- html_search(
15 |     page,
16 |     selectors = date_selectors,
17 |     attributes = date_attributes
18 |   )
19 |   datetime <- lubridate::as_datetime(raw_date)
20 | 
21 |   # Headline: content of the first og:title meta element.
22 |   title_node <- rvest::html_element(page, "[property=\"og:title\"]")
23 |   headline <- rvest::html_attr(title_node, "content")
24 | 
25 |   # Author: text of the first author link or metadata source element.
26 |   author_node <- rvest::html_element(page, "[rel=\"author\"],.metadata__source")
27 |   author <- toString(rvest::html_text2(author_node))
28 | 
29 |   # Text: paragraphs inside .article-body, one per line.
30 |   body_nodes <- rvest::html_elements(page, ".article-body p")
31 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
32 | 
33 |   # Hand the extracted fields to the package's list-building helper.
34 |   s_n_list(datetime, author, headline, text)
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_bnn_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Advance the progress bar, then parse the page source kept in content_raw.
 4 |     pb_tick(x, verbose, pb)
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # Text of every embedded JSON-LD <script> tag.
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 | 
11 |     # No JSON-LD metadata: return the empty result.
12 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
13 |         return(s_n_list())
14 |     }
15 | 
16 |     # Positions of the JSON-LD blocks that mention "NewsArticle".
17 |     article <- which(grepl("\"NewsArticle\"", json_txt))
18 |     if (length(article) == 0) {
19 |         return(s_n_list())
20 |     }
21 | 
22 |     # Parse only the first matching block: passing several strings at once
23 |     # would have them joined into a single, invalid JSON document.
24 |     json_df <- jsonlite::fromJSON(json_txt[article[1]])
25 | 
26 |     datetime <- lubridate::as_datetime(json_df$datePublished)
27 |     headline <- json_df$headline
28 |     author <- toString(json_df$author$name)
29 | 
30 |     # Body text: intro plus paragraphs and h2 headings of the article body,
31 |     # one per line.
32 |     text <- html %>%
33 |         rvest::html_elements(".intro,.article__body p,.article__body h2") %>%
34 |         rvest::html_text2() %>%
35 |         paste(collapse = "\n")
36 | 
37 |     s_n_list(
38 |         datetime,
39 |         author,
40 |         headline,
41 |         text
42 |     )
43 | }
44 | 


--------------------------------------------------------------------------------
/R/deliver_breakingnews_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.breakingnews_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw page kept in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # article metadata is expected as JSON in the first <script> element
11 |   data <- rvest::html_text2(rvest::html_element(html, "script"))
12 | 
13 |   # no script content: hand back an empty result
14 |   if (isTRUE(is.na(data))) {
15 |     return(s_n_list())
16 |   }
17 | 
18 |   data <- jsonlite::fromJSON(data)
19 | 
20 |   # publication time, title and author name(s) from the JSON metadata
21 |   datetime <- lubridate::as_datetime(data$datePublished)
22 |   headline <- data$headline
23 |   author <- toString(data$author$name)
24 | 
25 |   # body text: all paragraphs inside <article>, one per line
26 |   paragraphs <- rvest::html_elements(html, "article p")
27 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
28 | 
29 |   # first image url and the JSON "@type" entry
30 |   cover_image_url <- utils::head(data$image$url, 1L)
31 |   type <- data$`@type`
32 | 
33 |   s_n_list(datetime, author, headline, text, type, cover_image_url)
34 | 
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/R/deliver_joe_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.joe_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # move the progress bar forward
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # the raw page lives in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # text of the first JSON-LD element on the page
11 |   ld_node <- rvest::html_element(html, "[type=\"application/ld+json\"]")
12 |   data <- rvest::html_text2(ld_node)
13 | 
14 |   # a single NA means there is nothing to parse: return an empty result
15 |   if (isTRUE(is.na(data))) {
16 |     return(s_n_list())
17 |   }
18 | 
19 |   data <- jsonlite::fromJSON(data)
20 | 
21 |   # datetime
22 |   datetime <- lubridate::as_datetime(data$datePublished)
23 | 
24 |   # headline
25 |   headline <- data$headline
26 | 
27 |   # author
28 |   author <- toString(data$author$name)
29 | 
30 |   # text: every paragraph inside <article>, joined by newlines
31 |   text <- paste(
32 |     rvest::html_text2(rvest::html_elements(html, "article p")),
33 |     collapse = "\n"
34 |   )
35 | 
36 |   # first image url listed in the metadata
37 |   cover_image_url <- utils::head(data$image$url, 1L)
38 | 
39 |   # "@type" entry of the metadata
40 |   type <- data$`@type`
41 | 
42 |   s_n_list(datetime, author, headline, text, type, cover_image_url)
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/R/deliver_derwesten_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw page kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     ld_json <- html %>%
 8 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 |     # no JSON-LD on the page: return an empty result
11 |     if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
12 |         return(s_n_list())
13 |     }
14 | 
15 |     # metadata is read from the first row of "@graph" in the first script
16 |     graph <- jsonlite::fromJSON(ld_json[1])$`@graph`
17 |     meta <- graph[1, ]
18 |     datetime <- lubridate::as_datetime(meta$datePublished)
19 |     headline <- meta$headline
20 | 
21 |     # author names come from the page markup, not from the JSON
22 |     author_nodes <- rvest::html_elements(html, ".author.vcard .url.fn.n")
23 |     author <- toString(rvest::html_text(author_nodes))
24 | 
25 |     # lead and body paragraphs, one per line
26 |     body_nodes <- rvest::html_elements(html, ".lead p,.article-body p")
27 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
28 | 
29 |     s_n_list(datetime, author, headline, text)
30 | }
31 | 


--------------------------------------------------------------------------------
/R/deliver_thesun_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.thesun_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # tick the progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # column content_raw carries the raw html
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # metadata: text of the first JSON-LD element
11 |   data <- rvest::html_text2(
12 |     rvest::html_element(html, "[type=\"application/ld+json\"]")
13 |   )
14 | 
15 |   # without metadata the result stays empty
16 |   if (isTRUE(is.na(data))) {
17 |     return(s_n_list())
18 |   }
19 | 
20 |   data <- jsonlite::fromJSON(data)
21 | 
22 |   # fields taken straight from the parsed metadata
23 |   datetime <- lubridate::as_datetime(data$datePublished)
24 |   headline <- data$headline
25 |   author <- toString(data$author$name)
26 |   cover_image_url <- utils::head(data$image$url, 1L)
27 |   type <- data$`@type`
28 | 
29 |   # text: paragraphs inside <article>, separated by newlines
30 |   article_paragraphs <- rvest::html_elements(html, "article p")
31 |   text <- paste(rvest::html_text2(article_paragraphs), collapse = "\n")
32 | 
33 |   s_n_list(datetime, author, headline, text, type, cover_image_url)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_freiepresse_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for freiepresse.de pages: metadata is read from the JSON-LD
 4 |     # script that mentions "NewsArticle", the text from the article markup.
 5 |     #   x       - object whose `content_raw` entry holds the raw html
 6 |     #   verbose - handed to pb_tick()
 7 |     #   pb      - handed to pb_tick()
 8 |     # Returns s_n_list() of datetime, author, headline and text, or an empty
 9 |     # s_n_list() when no matching JSON-LD script is present.
10 |     pb_tick(x, verbose, pb)
11 |     # raw html is stored in column content_raw
12 |     html <- rvest::read_html(x$content_raw)
13 | 
14 |     json_txt <- html %>%
15 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
16 |         rvest::html_text()
17 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
18 |         return(s_n_list())
19 |     }
20 | 
21 |     json_txt <- json_txt[grepl("NewsArticle", json_txt)]
22 |     if (length(json_txt) == 0) {
23 |         return(s_n_list())
24 |     }
25 |     # Several scripts may contain "NewsArticle"; fromJSON() expects a single
26 |     # JSON document, so only the first match is parsed.
27 |     json_df <- jsonlite::fromJSON(json_txt[1])
28 | 
29 |     datetime <- lubridate::as_datetime(json_df$datePublished)
30 |     headline <- json_df$headline
31 |     # the author entry is used as is (no nested "name" field is read here)
32 |     author <- toString(json_df$author)
33 |     text <- html %>%
34 |         rvest::html_elements(".article__text p,.article__text h2") %>%
35 |         rvest::html_text2() %>%
36 |         paste(collapse = "\n")
37 | 
38 |     s_n_list(
39 |         datetime,
40 |         author,
41 |         headline,
42 |         text
43 |     )
44 | }
45 | 


--------------------------------------------------------------------------------
/R/deliver_breitbart_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.breitbart_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # publication time: "datetime" attribute of the first <time> element
 9 |   time_node <- rvest::html_element(html, "time")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
11 | 
12 |   # headline: content of the <title> element
13 |   headline <- rvest::html_text(rvest::html_element(html, "title"))
14 | 
15 |   # author: text of the first <address> element
16 |   author <- toString(rvest::html_text2(rvest::html_element(html, "address")))
17 | 
18 |   # text: paragraphs directly below .entry-content, one per line
19 |   paragraphs <- rvest::html_elements(html, ".entry-content>p")
20 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
21 | 
22 |   # in-text links: href of anchors placed directly inside those paragraphs
23 |   anchors <- rvest::html_elements(html, ".entry-content>p>a")
24 |   text_links <- as.list(rvest::html_attr(anchors, "href"))
25 | 
26 |   # bundle everything into a named list via the helper
27 |   s_n_list(datetime, author, headline, text, text_links)
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/R/deliver_thecanary_co.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.thecanary_co <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # column content_raw carries the raw html
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: taken from the first <time> element
 9 |   datetime <- lubridate::as_datetime(
10 |     rvest::html_attr(rvest::html_element(html, "time"), "datetime")
11 |   )
12 | 
13 |   # headline: text of the first .entry-title element
14 |   headline <- rvest::html_text(rvest::html_element(html, ".entry-title"))
15 | 
16 |   # author: text of the first .author element
17 |   author_node <- rvest::html_element(html, ".author")
18 |   author <- toString(rvest::html_text2(author_node))
19 | 
20 |   # text: direct paragraph children of .entry-content joined by newlines
21 |   text <- paste(
22 |     rvest::html_text2(rvest::html_elements(html, ".entry-content>p")),
23 |     collapse = "\n"
24 |   )
25 | 
26 |   # in-text links: targets of anchors sitting directly in those paragraphs
27 |   link_nodes <- rvest::html_elements(html, ".entry-content>p>a")
28 |   text_links <- as.list(rvest::html_attr(link_nodes, "href"))
29 | 
30 |   # the helper turns the objects into a named list
31 |   s_n_list(datetime, author, headline, text, text_links)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_yahoo_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.yahoo_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # progress bar update
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw page kept in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # metadata: first JSON-LD element located inside <article>
11 |   ld_node <- rvest::html_element(
12 |     html, "article [type=\"application/ld+json\"]"
13 |   )
14 |   data <- rvest::html_text2(ld_node)
15 | 
16 |   # nothing found: return the empty result right away
17 |   if (isTRUE(is.na(data))) {
18 |     return(s_n_list())
19 |   }
20 | 
21 |   data <- jsonlite::fromJSON(data)
22 | 
23 |   # datetime, headline and author are read from the parsed metadata
24 |   datetime <- lubridate::as_datetime(data$datePublished)
25 |   headline <- data$headline
26 |   author <- toString(data$author$name)
27 | 
28 |   # text: paragraphs inside <article>, one per line
29 |   paragraph_nodes <- rvest::html_elements(html, "article p")
30 |   text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")
31 | 
32 |   # first image url and the "@type" entry of the metadata
33 |   cover_image_url <- utils::head(data$image$url, 1L)
34 |   type <- purrr::pluck(data, "@type")
35 | 
36 |   s_n_list(datetime, author, headline, text, type, cover_image_url)
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/R/deliver_focus_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for focus.de pages. Metadata is taken from the first JSON-LD
 4 |     # script (unwrapping "@graph" when present), the text from page markup.
 5 |     #   x       - object whose `content_raw` entry holds the raw html
 6 |     #   verbose - handed to pb_tick()
 7 |     #   pb      - handed to pb_tick()
 8 |     # Returns s_n_list() of datetime, author, headline and text, or an empty
 9 |     # s_n_list() when the script holds no "NewsArticle" entry.
10 |     pb_tick(x, verbose, pb)
11 |     # raw html is stored in column content_raw
12 |     html <- rvest::read_html(x$content_raw)
13 | 
14 |     json_txt <- html %>%
15 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
16 |         rvest::html_text()
17 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
18 |         return(s_n_list())
19 |     }
20 | 
21 |     json_df <- jsonlite::fromJSON(json_txt[1])
22 |     if ("@graph" %in% names(json_df)) {
23 |         json_df <- json_df$`@graph`
24 |     }
25 | 
26 |     if (is.data.frame(json_df)) {
27 |         # Several entries are parsed into a table with one row each. Testing
28 |         # the whole "@type" column in `if ()` would fail, so the first
29 |         # NewsArticle row is selected explicitly.
30 |         is_article <- vapply(
31 |             json_df$`@type`,
32 |             function(tp) "NewsArticle" %in% tp,
33 |             logical(1)
34 |         )
35 |         if (!any(is_article)) {
36 |             return(s_n_list())
37 |         }
38 |         json_df <- json_df[which(is_article)[1], ]
39 |     } else if (!"NewsArticle" %in% unlist(json_df$`@type`)) {
40 |         # single entry of another type, or no "@type" at all
41 |         return(s_n_list())
42 |     }
43 | 
44 |     datetime <- lubridate::as_datetime(json_df$datePublished)
45 |     headline <- json_df$headline
46 |     author <- toString(json_df$author$name)
47 |     text <- html %>%
48 |         rvest::html_elements(".leadIn,.textBlock") %>%
49 |         rvest::html_text2() %>%
50 |         paste(collapse = "\n")
51 | 
52 |     s_n_list(
53 |         datetime,
54 |         author,
55 |         headline,
56 |         text
57 |     )
58 | }
59 | 


--------------------------------------------------------------------------------
/R/deliver_haz_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for haz.de pages: metadata is read from the JSON-LD script that
 4 |     # mentions "NewsArticle", the text from the article markup.
 5 |     #   x       - object whose `content_raw` entry holds the raw html
 6 |     #   verbose - handed to pb_tick()
 7 |     #   pb      - handed to pb_tick()
 8 |     # Returns s_n_list() of datetime, author, headline and text, or an empty
 9 |     # s_n_list() when no matching JSON-LD script is present.
10 |     pb_tick(x, verbose, pb)
11 |     # raw html is stored in column content_raw
12 |     html <- rvest::read_html(x$content_raw)
13 | 
14 |     json_txt <- html %>%
15 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
16 |         rvest::html_text()
17 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
18 |         return(s_n_list())
19 |     }
20 | 
21 |     json_txt <- json_txt[grepl("NewsArticle", json_txt)]
22 |     if (length(json_txt) == 0) {
23 |         return(s_n_list())
24 |     }
25 |     # The unquoted pattern can match more than one script; fromJSON() expects
26 |     # a single JSON document, so only the first match is parsed.
27 |     json_df <- jsonlite::fromJSON(json_txt[1])
28 | 
29 |     datetime <- lubridate::as_datetime(json_df$datePublished)
30 |     headline <- json_df$headline
31 |     author <- toString(json_df$author$name)
32 |     # NOTE(review): these class names look auto-generated and may change with
33 |     # a site rebuild - confirm they still match.
34 |     text <- html %>%
35 |         rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>%
36 |         rvest::html_text2() %>%
37 |         paste(collapse = "\n")
38 | 
39 |     s_n_list(
40 |         datetime,
41 |         author,
42 |         headline,
43 |         text
44 |     )
45 | }
46 | 


--------------------------------------------------------------------------------
/man/pb_deliver.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deliver.R
 3 | \name{pb_deliver}
 4 | \alias{pb_deliver}
 5 | \title{Deliver online news articles}
 6 | \usage{
 7 | pb_deliver(x, try_default = TRUE, ignore_fails = FALSE, verbose = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{Either a vector of URLs or a data.frame returned by
11 | \link{pb_collect}.}
12 | 
13 | \item{try_default}{if no parser is available, should a generic parser be used
14 | (\code{TRUE}) or should the URL be skipped (\code{FALSE})?}
15 | 
16 | \item{ignore_fails}{normally the function errors if the raw content for a
17 | URL can't be parsed. Setting to \code{TRUE} ignores all parsing errors (use
18 | with caution).}
19 | 
20 | \item{verbose}{\code{FALSE} turns deliver silent. \code{TRUE} prints status
21 | messages and a progress bar on the screen. \code{2L} turns on debug mode.
22 | If \code{NULL} will be determined from
23 | \code{getOption("paperboy_verbose")}.}
24 | 
25 | \item{...}{Passed on to \link{pb_collect}.}
26 | }
27 | \value{
28 | A data.frame (tibble) with media data and full text.
29 | }
30 | \description{
31 | This function will determine the website of the urls given to it
32 |   and call the appropriate webscraper.
33 | }
34 | 


--------------------------------------------------------------------------------
/R/deliver_anotherangryvoice_blogspot_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.anotherangryvoice_blogspot_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "title" attribute of the first .published element
 9 |   published_node <- rvest::html_element(html, ".published")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "title"))
11 | 
12 |   # headline: trimmed text of the first .entry-title element
13 |   headline <- trimws(rvest::html_text(rvest::html_element(html, ".entry-title")))
14 | 
15 |   # author: text of the first .fn element
16 |   author <- toString(rvest::html_text2(rvest::html_element(html, ".fn")))
17 | 
18 |   # text: whole content of the first .entry-content element
19 |   content_node <- rvest::html_element(html, ".entry-content")
20 |   text <- paste(rvest::html_text2(content_node), collapse = "\n")
21 | 
22 |   # in-text links: anchors nested as .entry-content > span > a
23 |   link_nodes <- rvest::html_elements(html, ".entry-content>span>a")
24 |   text_links <- as.list(rvest::html_attr(link_nodes, "href"))
25 | 
26 |   # the helper turns the objects into a named list
27 |   s_n_list(datetime, author, headline, text, text_links)
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/R/deliver_cbsnews_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.cbsnews_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: first of the "datetime" attributes found on <time> elements
 9 |   stamps <- rvest::html_attr(rvest::html_elements(html, "time"), "datetime")
10 |   datetime <- utils::head(lubridate::as_datetime(stamps), 1L)
11 | 
12 |   # headline: "content" of the og:title element(s)
13 |   headline <- rvest::html_attr(
14 |     rvest::html_elements(html, "[property=\"og:title\"]"),
15 |     "content"
16 |   )
17 | 
18 |   # author: byline text with "By " markers and line breaks removed
19 |   byline <- rvest::html_text(
20 |     rvest::html_element(html, "[class*=\"content__meta--byline\"]")
21 |   )
22 |   author <- trimws(gsub("By\\b\\s+|\n", "", byline))
23 | 
24 |   # text: direct paragraph children of .content__body, one per line
25 |   paragraphs <- rvest::html_elements(html, ".content__body>p")
26 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
27 | 
28 |   # content type: path segment following "cbsnews.com/" in the expanded url
29 |   content_type <- gsub(
30 |     ".*cbsnews.com/(.+?)/.*", "\\1", x$expanded_url, perl = TRUE
31 |   )
32 | 
33 |   # the helper turns the objects into a named list
34 |   s_n_list(datetime, author, headline, text, content_type)
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/R/deliver_ac24_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ac24_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # parse the raw page kept in column content_raw, then tick the progress bar
 5 |   html <- rvest::read_html(x$content_raw)
 6 |   pb_tick(x, verbose, pb)
 7 | 
 8 |   # datetime: "content" of the article:published_time element
 9 |   published_node <- rvest::html_element(
10 |     html, "[property=\"article:published_time\"]"
11 |   )
12 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
13 | 
14 |   # headline: text of the <title> element
15 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
16 | 
17 |   # author: text of the first .author element
18 |   author <- toString(rvest::html_text2(rvest::html_element(html, ".author")))
19 | 
20 |   # text: paragraphs below .post-content, one per line
21 |   paragraphs <- rvest::html_elements(html, ".post-content p")
22 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
23 | 
24 |   # cover image: the node is looked up once and used both as markup
25 |   # and for its "src" attribute
26 |   cover_node <- rvest::html_element(html, ".featured-image img")
27 |   cover_image_html <- as.character(cover_node)
28 |   cover_image_url <- rvest::html_attr(cover_node, "src")
29 | 
30 |   s_n_list(
31 |     datetime, author, headline, text,
32 |     cover_image_url, cover_image_html
33 |   )
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_nypost_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nypost_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "content" of the article:published_time element(s)
 9 |   published_nodes <- rvest::html_elements(
10 |     html, "[property=\"article:published_time\"]"
11 |   )
12 |   datetime <- lubridate::as_datetime(
13 |     rvest::html_attr(published_nodes, "content")
14 |   )
15 | 
16 |   # headline: "content" of the og:title element(s)
17 |   title_nodes <- rvest::html_elements(html, "[property=\"og:title\"]")
18 |   headline <- rvest::html_attr(title_nodes, "content")
19 | 
20 |   # author: all byline entries joined, with every literal "By " dropped
21 |   byline <- toString(
22 |     rvest::html_text2(rvest::html_elements(html, ".byline__author"))
23 |   )
24 |   author <- gsub("By ", "", byline, fixed = TRUE)
25 | 
26 |   # text: direct paragraph children of the content containers, one per line
27 |   paragraphs <- rvest::html_elements(
28 |     html, "[class*=\"content\"]>p,[class*=\"entry-content\"]>p"
29 |   )
30 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
31 | 
32 |   # the helper turns the objects into a named list
33 |   s_n_list(datetime, author, headline, text)
34 | 
35 | }
36 | 
37 | # pagesix.com and decider.com are handled by the very same parser
38 | pb_deliver_paper.pagesix_com <- pb_deliver_paper.nypost_com
39 | pb_deliver_paper.decider_com <- pb_deliver_paper.nypost_com
40 | 


--------------------------------------------------------------------------------
/R/deliver_telegraph_co_uk.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.telegraph_co_uk <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: value(s) returned by html_search() for the datePublished
 9 |   # selector, parsed with a fixed format; only the first one is kept
10 |   published <- html_search(
11 |     html,
12 |     "[itemprop=\"datePublished\"]",
13 |     c("content", "datetime")
14 |   )
15 |   datetime <- utils::head(
16 |     as.POSIXct(published, format = "%Y-%m-%dT%H:%M%z"),
17 |     1L
18 |   )
19 | 
20 |   # headline: "content" of the og:title element(s)
21 |   headline <- rvest::html_attr(
22 |     rvest::html_elements(html, "[property=\"og:title\"]"),
23 |     "content"
24 |   )
25 | 
26 |   # author: "content" attributes of the byline elements, joined, without a
27 |   # leading "By"
28 |   byline_nodes <- rvest::html_elements(html, "[class*=\"byline__author\"]")
29 |   author <- gsub("^By\\s", "", toString(rvest::html_attr(byline_nodes, "content")))
30 | 
31 |   # text: article body elements, one per line
32 |   body_nodes <- rvest::html_elements(html, "[class*=\"article-body-text\"]")
33 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
34 | 
35 |   # type: "content" of the og:type element
36 |   content_type <- rvest::html_attr(
37 |     rvest::html_element(html, "[property=\"og:type\"]"),
38 |     "content"
39 |   )
40 | 
41 |   s_n_list(datetime, author, headline, text, content_type)
42 | }
43 | 


--------------------------------------------------------------------------------
/R/deliver_businessinsider_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw page kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     ld_json <- html %>%
 8 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 |     # no JSON-LD on the page: return an empty result
11 |     if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
12 |         return(s_n_list())
13 |     }
14 | 
15 |     # only the "@graph" part of the first script is used
16 |     graph <- jsonlite::fromJSON(ld_json[1])$`@graph`
17 | 
18 |     # author: names of all "Person" entries, or "" when there is none
19 |     is_person <- graph$`@type` == "Person"
20 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
21 | 
22 |     # remaining metadata is read from the first "@graph" row
23 |     meta <- graph[1, ]
24 |     datetime <- lubridate::as_datetime(meta$datePublished)
25 |     headline <- meta$headline
26 | 
27 |     # text: paragraphs and h2 headings inside the first .article-main element
28 |     main_node <- rvest::html_element(html, ".article-main")
29 |     text <- paste(
30 |         rvest::html_text2(rvest::html_elements(main_node, "p, h2")),
31 |         collapse = "\n"
32 |     )
33 | 
34 |     s_n_list(datetime, author, headline, text)
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_newsflash24_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw page kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     ld_json <- html %>%
 8 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 |     # no JSON-LD on the page: return an empty result
11 |     if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
12 |         return(s_n_list())
13 |     }
14 | 
15 |     # only the "@graph" part of the first script is used
16 |     graph <- jsonlite::fromJSON(ld_json[1])$`@graph`
17 | 
18 |     # author: names of all "Person" entries, or "" when there is none
19 |     is_person <- graph$`@type` == "Person"
20 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
21 | 
22 |     # date and headline come from the row(s) typed as NewsArticle
23 |     article <- graph[grepl("NewsArticle", graph$`@type`), ]
24 |     datetime <- lubridate::as_datetime(article$datePublished)
25 |     headline <- article$headline
26 | 
27 |     # text: paragraphs and h2 headings below .entry-content, one per line
28 |     body_nodes <- rvest::html_elements(
29 |         html, ".entry-content p, .entry-content h2"
30 |     )
31 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
32 | 
33 |     s_n_list(datetime, author, headline, text)
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_suedkurier_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw page kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     ld_json <- html %>%
 8 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
 9 |         rvest::html_text()
10 |     # no JSON-LD on the page: return an empty result
11 |     if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
12 |         return(s_n_list())
13 |     }
14 | 
15 |     meta <- jsonlite::fromJSON(ld_json[1])
16 |     datetime <- lubridate::as_datetime(meta$datePublished)
17 | 
18 |     # headline is taken from the page header rather than from the JSON
19 |     headline <- rvest::html_text(rvest::html_element(html, "header h1"))
20 | 
21 |     # author names are wrapped in <p> tags and sent through the html parser
22 |     # before being joined (presumably to resolve html entities - confirm)
23 |     author_markup <- paste0("<p>", meta$author$name, "</p>", collapse = ",")
24 |     author <- toString(rvest::html_text(rvest::read_html(author_markup)))
25 | 
26 |     # text: summary blocks and body paragraphs, one per line
27 |     body_nodes <- rvest::html_elements(
28 |         html,
29 |         ".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p"
30 |     )
31 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
32 | 
33 |     s_n_list(datetime, author, headline, text)
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_vox_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) {
 3 |     # Parser for vox.de pages; every field, including the text, is read from
 4 |     # the first JSON-LD script of the page.
 5 |     #   x       - object whose `content_raw` entry holds the raw html
 6 |     #   verbose - handed to pb_tick()
 7 |     #   pb      - handed to pb_tick()
 8 |     # Returns s_n_list() of datetime, author, headline and text, or an empty
 9 |     # s_n_list() when the page has no JSON-LD script.
10 |     pb_tick(x, verbose, pb)
11 |     # raw html is stored in column content_raw
12 |     html <- rvest::read_html(x$content_raw)
13 | 
14 |     json_txt <- html %>%
15 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
16 |         rvest::html_text()
17 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
18 |         return(s_n_list())
19 |     }
20 | 
21 |     json_df <- jsonlite::fromJSON(json_txt[1])
22 |     # with several entries only the one(s) typed "Article" are kept
23 |     if (length(json_df$`@type`) > 1) {
24 |         json_df <- json_df[json_df$`@type` == "Article", ]
25 |     }
26 |     datetime <- lubridate::as_datetime(json_df$datePublished)
27 |     headline <- json_df$headline
28 |     author <- toString(json_df$author$name)
29 |     text <- json_df$articleBody
30 |     if (author == "VOX Online") {
31 |         # The text might have the author abbreviation in parentheses at its
32 |         # very end. sub() returns its input unchanged when the pattern does
33 |         # not match, so the match is tested first; otherwise the complete
34 |         # article text would be stored as author. The length check also
35 |         # covers a missing articleBody.
36 |         abbr_pattern <- ".*\\(([^)]+)\\)$"
37 |         if (length(text) == 1 && isTRUE(grepl(abbr_pattern, text))) {
38 |             author <- sub(abbr_pattern, "\\1", text)
39 |         }
40 |     }
41 |     s_n_list(
42 |         datetime,
43 |         author,
44 |         headline,
45 |         text
46 |     )
47 | }
48 | 


--------------------------------------------------------------------------------
/R/deliver_blesk_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.blesk_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # Parser for blesk.cz pages: metadata comes from the JSON-LD entry typed
 5 |   # "NewsArticle", the text from the #article markup.
 6 |   #   x       - object whose `content_raw` entry holds the raw html
 7 |   #   verbose - handed to pb_tick()
 8 |   #   pb      - handed to pb_tick()
 9 |   # Returns s_n_list() of datetime, author, headline, text and
10 |   # cover_image_url; metadata stays empty when no NewsArticle entry exists.
11 | 
12 |   # raw html is stored in column content_raw
13 |   html <- rvest::read_html(x$content_raw)
14 |   pb_tick(x, verbose, pb)
15 | 
16 |   # data about the article is nicely stored in a json string; every JSON-LD
17 |   # element is parsed, giving a list with one entry per element
18 |   data <- html %>%
19 |     rvest::html_elements("[type=\"application/ld+json\"]") %>%
20 |     rvest::html_text2() %>%
21 |     lapply(jsonlite::fromJSON)
22 | 
23 |   # Select the first entry typed "NewsArticle". Doing this for any number of
24 |   # entries matters: a lone entry must be unwrapped as well, otherwise the
25 |   # `data$...` lookups below would query the wrapper list and find nothing.
26 |   # %in% on the unlisted type also copes with entries that carry several
27 |   # types or none.
28 |   is_article <- vapply(data, function(entry) {
29 |     tp <- purrr::pluck(entry, "@type", .default = NA_character_)
30 |     "NewsArticle" %in% unlist(tp)
31 |   }, logical(1))
32 |   data <- if (any(is_article)) data[[which(is_article)[1]]] else list()
33 | 
34 |   datetime <- data$datePublished %>%
35 |     lubridate::ymd_hm()
36 | 
37 |   headline <- data$headline
38 | 
39 |   author <- data$author$name %>%
40 |     toString()
41 | 
42 |   # text
43 |   text <- html %>%
44 |     rvest::html_elements("#article p,#article h2") %>%
45 |     rvest::html_text2() %>%
46 |     paste(collapse = "\n")
47 | 
48 |   # NA when the metadata has no image url
49 |   cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_)
50 | 
51 |   s_n_list(
52 |     datetime,
53 |     author,
54 |     headline,
55 |     text,
56 |     cover_image_url
57 |   )
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/R/deliver_buzzfeed_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.buzzfeed_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw page kept in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "datetime" attribute of the first <time> element
 9 |   time_node <- rvest::html_element(html, "time")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
11 | 
12 |   # headline: first element whose class starts with "headline_title"
13 |   headline <- rvest::html_text(
14 |     rvest::html_element(html, "[class^=\"headline_title\"]")
15 |   )
16 | 
17 |   # author: first element whose class contains the byline name marker
18 |   byline_node <- rvest::html_element(
19 |     html, "[class*=\"headline-byline_bylineName\"]"
20 |   )
21 |   author <- toString(rvest::html_text2(byline_node))
22 | 
23 |   # text: direct paragraph children of .subbuzz-text, one per line
24 |   paragraphs <- rvest::html_elements(html, ".subbuzz-text>p")
25 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
26 | 
27 |   # in-text links: every anchor inside text blocks or tweet containers
28 |   containers <- rvest::html_elements(html, ".subbuzz-text,.tweet__container")
29 |   anchors <- rvest::html_elements(containers, "a")
30 |   text_links <- as.list(rvest::html_attr(anchors, "href"))
31 | 
32 |   # the helper turns the objects into a named list
33 |   s_n_list(datetime, author, headline, text, text_links)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/R/deliver_seznamzpravy_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.seznamzpravy_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # Parser for seznamzpravy.cz pages: metadata comes from the JSON-LD entry
 5 |   # typed "NewsArticle", the text from the paragraphs inside <article>.
 6 |   #   x       - object whose `content_raw` entry holds the raw html
 7 |   #   verbose - handed to pb_tick()
 8 |   #   pb      - handed to pb_tick()
 9 |   # Returns s_n_list() of datetime, author, headline, text and
10 |   # cover_image_url; metadata stays empty when no NewsArticle entry exists.
11 | 
12 |   pb_tick(x, verbose, pb)
13 |   # raw html is stored in column content_raw
14 |   html <- rvest::read_html(x$content_raw)
15 | 
16 |   # data about the article is nicely stored in a json string; every JSON-LD
17 |   # element is parsed, giving a list with one entry per element
18 |   data <- html %>%
19 |     rvest::html_elements("[type=\"application/ld+json\"]") %>%
20 |     rvest::html_text() %>%
21 |     lapply(jsonlite::fromJSON)
22 | 
23 |   # Select the first entry typed "NewsArticle". Doing this for any number of
24 |   # entries matters: a lone entry must be unwrapped as well, otherwise the
25 |   # `data$...` lookups below would query the wrapper list and find nothing.
26 |   # %in% on the unlisted type also copes with entries that carry several
27 |   # types or none.
28 |   is_article <- vapply(data, function(entry) {
29 |     tp <- purrr::pluck(entry, "@type", .default = NA_character_)
30 |     "NewsArticle" %in% unlist(tp)
31 |   }, logical(1))
32 |   data <- if (any(is_article)) data[[which(is_article)[1]]] else list()
33 | 
34 |   datetime <- data$datePublished %>%
35 |     lubridate::as_datetime()
36 | 
37 |   headline <- data$headline
38 | 
39 |   author <- data$author$name %>%
40 |     toString()
41 | 
42 |   # text
43 |   text <- html %>%
44 |     rvest::html_elements("article p") %>%
45 |     rvest::html_text2() %>%
46 |     paste(collapse = "\n")
47 | 
48 |   # NA when the metadata has no image url
49 |   cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_)
50 | 
51 |   s_n_list(
52 |     datetime,
53 |     author,
54 |     headline,
55 |     text,
56 |     cover_image_url
57 |   )
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/R/deliver_karlsruhe_insider_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # text of every JSON-LD script tag on the page
 8 |     json_txt <- html %>%
 9 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
10 |         rvest::html_text()
11 | 
12 |     # guard clause: return early when there is no JSON-LD metadata
13 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     # only the first JSON-LD block is read; its @graph lists the entities
18 |     graph <- jsonlite::fromJSON(json_txt[1])$`@graph`
19 | 
20 |     # author: names of all graph entries typed "Person", "" when there are none
21 |     is_person <- graph$`@type` == "Person"
22 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
23 | 
24 |     # publication time and headline are taken from the first graph entry
25 |     first_entry <- graph[1, ]
26 |     datetime <- lubridate::as_datetime(first_entry$datePublished)
27 |     headline <- first_entry$headline
28 | 
29 |     # text: paragraphs and h2 headings inside the post content of the article
30 |     post_node <- rvest::html_element(html, "article .td-post-content")
31 |     body_nodes <- rvest::html_elements(post_node, "p, h2")
32 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
33 | 
34 |     s_n_list(
35 |         datetime,
36 |         author,
37 |         headline,
38 |         text
39 |     )
40 | }
41 | 


--------------------------------------------------------------------------------
/R/deliver_wiwo_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2()
 8 |     # without JSON-LD metadata the article is paywalled and not scrapeable;
 9 |     # this guard already covers the empty case, so no further length check
10 |     # is needed below
11 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
12 |         return(s_n_list())
13 |     }
14 | 
15 |     # metadata is read from the first JSON-LD block only
16 |     json_df <- jsonlite::fromJSON(json_txt[1])
17 |     datetime <- lubridate::as_datetime(json_df$datePublished)
18 |     headline <- json_df$headline
19 |     author <- toString(json_df$creator)
20 | 
21 |     # text: lead text plus h3 headings and paragraphs of the rich-text body
22 |     text <- html %>%
23 |         rvest::html_elements(".c-leadtext,.u-richtext h3,.u-richtext p") %>%
24 |         rvest::html_text2() %>%
25 |         .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between
26 |         paste(collapse = "\n")
27 | 
28 |     s_n_list(
29 |         datetime,
30 |         author,
31 |         headline,
32 |         text
33 |     )
34 | }
35 | 


--------------------------------------------------------------------------------
/R/deliver_evolvepolitics_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.evolvepolitics_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "datetime" attribute of the first .entry-date element
 9 |   date_node <- rvest::html_element(html, ".entry-date")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_node, "datetime"))
11 | 
12 |   # headline
13 |   headline <- rvest::html_text(rvest::html_element(html, ".tdb-title-text"))
14 | 
15 |   # author, flattened into a single string
16 |   author_node <- rvest::html_element(html, ".tdb-author-name")
17 |   author <- toString(rvest::html_text2(author_node))
18 | 
19 |   # text: every paragraph inside the .tdb-block-inner containers
20 |   paragraphs <- rvest::html_elements(
21 |     rvest::html_elements(html, ".tdb-block-inner"),
22 |     "p"
23 |   )
24 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
25 | 
26 |   # in-text links: anchors that are direct children of those paragraphs
27 |   link_nodes <- rvest::html_elements(
28 |     rvest::html_elements(html, ".tdb-block-inner"),
29 |     "p>a"
30 |   )
31 |   text_links <- as.list(rvest::html_attr(link_nodes, "href"))
32 | 
33 |   # the helper function safely creates a named list from objects
34 |   s_n_list(
35 |     datetime,
36 |     author,
37 |     headline,
38 |     text,
39 |     text_links
40 |   )
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/R/deliver_geenstijl_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.geenstijl_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: text of the first .datetime element, parsed as day-month-year hour:minute
 9 |   datetime_txt <- rvest::html_text2(rvest::html_element(html, ".datetime"))
10 |   datetime <- lubridate::dmy_hm(datetime_txt)
11 | 
12 |   # headline: the page title
13 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
14 | 
15 |   # author: first element marked rel="author", flattened into a single string
16 |   author_node <- rvest::html_element(html, "[rel=\"author\"]")
17 |   author <- toString(rvest::html_text2(author_node))
18 | 
19 |   # text: paragraphs of the first <article>, joined with newlines
20 |   article_node <- rvest::html_element(html, "article")
21 |   paragraphs <- rvest::html_elements(article_node, "p")
22 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
23 | 
24 |   # cover image: the first image inside an article, kept both as html markup
25 |   # and as the value of its src attribute
26 |   cover_node <- rvest::html_element(html, "article img")
27 |   cover_image_html <- as.character(cover_node)
28 |   cover_image_url <- rvest::html_attr(cover_node, "src")
29 | 
30 |   # the helper function safely creates a named list from objects
31 |   s_n_list(
32 |     datetime,
33 |     author,
34 |     headline,
35 |     text,
36 |     cover_image_url,
37 |     cover_image_html
38 |   )
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/R/deliver_jungefreiheit_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # text of every JSON-LD script tag on the page
 8 |     json_txt <- html %>%
 9 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
10 |         rvest::html_text()
11 | 
12 |     # guard clause: return early when there is no JSON-LD metadata
13 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     # only the first JSON-LD block is read; its @graph lists the entities
18 |     graph <- jsonlite::fromJSON(json_txt[1])$`@graph`
19 | 
20 |     # author: names of all graph entries typed "Person", "" when there are none
21 |     is_person <- graph$`@type` == "Person"
22 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
23 | 
24 |     # publication time and headline come from the rows typed NewsArticle
25 |     article <- graph[grepl("NewsArticle", graph$`@type`), ]
26 |     datetime <- lubridate::as_datetime(article$datePublished)
27 |     headline <- article$headline
28 | 
29 |     # text: paragraphs and h3 headings inside the elementor widget containers
30 |     body_nodes <- rvest::html_elements(html, ".elementor-widget-container p, .elementor-widget-container h3")
31 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
32 | 
33 |     s_n_list(
34 |         datetime,
35 |         author,
36 |         headline,
37 |         text
38 |     )
39 | }
40 | 


--------------------------------------------------------------------------------
/R/deliver_nu_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nu_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta element
 9 |   published_node <- rvest::html_element(html, "[name=\"article:published_time\"]")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
11 | 
12 |   # headline: the page title
13 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
14 | 
15 |   # author: first .author element, flattened into a single string
16 |   author <- toString(rvest::html_text2(rvest::html_element(html, ".author")))
17 | 
18 |   # text: all paragraph text blocks, joined with newlines
19 |   text_blocks <- rvest::html_elements(html, ".textblock.paragraph")
20 |   text <- paste(rvest::html_text2(text_blocks), collapse = "\n")
21 | 
22 |   # cover image: first .app-image inside .article, kept both as html markup
23 |   # and as the value of its src attribute
24 |   cover_node <- rvest::html_element(html, ".article .app-image")
25 |   cover_image_html <- as.character(cover_node)
26 |   cover_image_url <- rvest::html_attr(cover_node, "src")
27 | 
28 |   # the helper function safely creates a named list from objects
29 |   s_n_list(
30 |     datetime,
31 |     author,
32 |     headline,
33 |     text,
34 |     cover_image_url,
35 |     cover_image_html
36 |   )
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/R/deliver_ostsee_zeitung_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 8 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
 9 |         return(s_n_list())
10 |     }
11 | 
12 |     # keep only the JSON-LD blocks that mention a NewsArticle
13 |     json_txt <- json_txt[grepl("NewsArticle", json_txt)]
14 |     if (length(json_txt) == 0) {
15 |         return(s_n_list())
16 |     }
17 |     # parse the first match only: handing several strings to fromJSON() at
18 |     # once would join them into one invalid JSON document
19 |     json_df <- jsonlite::fromJSON(json_txt[1])
20 | 
21 |     datetime <- lubridate::as_datetime(json_df$datePublished)
22 |     headline <- json_df$headline
23 |     author <- toString(json_df$author$name)
24 |     # text: text and headline elements inside the article body wrapper
25 |     text <- html %>%
26 |         rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>%
27 |         rvest::html_text2() %>%
28 |         paste(collapse = "\n")
29 | 
30 |     s_n_list(
31 |         datetime,
32 |         author,
33 |         headline,
34 |         text
35 |     )
36 | }
37 | 


--------------------------------------------------------------------------------
/R/deliver_stuttgarter_zeitung_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # text of every JSON-LD script tag on the page
 8 |     json_txt <- html %>%
 9 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
10 |         rvest::html_text()
11 | 
12 |     # guard clause: return early when there is no JSON-LD metadata
13 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     # metadata is read from the first JSON-LD block only
18 |     json_df <- jsonlite::fromJSON(json_txt[1])
19 |     datetime <- lubridate::as_datetime(json_df$datePublished)
20 |     headline <- json_df$headline
21 |     author <- toString(json_df$author$name)
22 | 
23 |     # text: intro paragraphs plus paragraphs and h2 headings of the brick groups
24 |     body_nodes <- rvest::html_elements(html, ".brick.intro-text p,.brickgroup p,.brickgroup h2")
25 |     text <- rvest::html_text2(body_nodes)
26 | 
27 |     # drop entries that consist solely of one of these boilerplate strings
28 |     rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung")
29 |     text <- paste(text[!text %in% rm_text], collapse = "\n")
30 | 
31 |     s_n_list(
32 |         datetime,
33 |         author,
34 |         headline,
35 |         text
36 |     )
37 | }
38 | # the rss feed includes subpages, which cannot be parsed
39 | # the rss feed also includes podcasts, which cannot be parsed
40 | 


--------------------------------------------------------------------------------
/R/deliver_wdr_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # careful: the page can carry many JSON-LD objects, but the first one
 8 |     # seems to be the article
 9 |     json_txt <- html %>%
10 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
11 |         rvest::html_text()
12 | 
13 |     # guard clause: return early when there is no JSON-LD metadata
14 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
15 |         return(s_n_list())
16 |     }
17 | 
18 |     json_df <- jsonlite::fromJSON(json_txt[1])
19 | 
20 |     # datePublished lacks seconds: insert ":00" between the hh:mm time and
21 |     # the "+hh:mm" offset before parsing
22 |     date_tmp <- sub("(\\d{2}:\\d{2})(\\+\\d{2}:\\d{2})", "\\1:00\\2", json_df$datePublished)
23 |     datetime <- lubridate::as_datetime(date_tmp)
24 | 
25 |     headline <- json_df$headline
26 | 
27 |     # author names are joined into one string; slashes become commas
28 |     author <- gsub("/", ",", toString(json_df$author$name))
29 | 
30 |     # text: introduction, text and subtitle elements, joined with newlines
31 |     body_nodes <- rvest::html_elements(html, ".einleitung,.text,.subtitle")
32 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
33 | 
34 |     s_n_list(
35 |         datetime,
36 |         author,
37 |         headline,
38 |         text
39 |     )
40 | }
41 | # the rss feed also contains overviews of articles, which make the parser fail
42 | 


--------------------------------------------------------------------------------
/R/deliver_bbc_co_uk.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.bbc_co_uk <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "datetime" attribute of the first <time> element
 9 |   time_node <- rvest::html_element(html, "time")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
11 | 
12 |   # headline: the page title
13 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
14 | 
15 |   # author: first contributor name; a missing value is dropped before the
16 |   # result is flattened into a single string
17 |   contributor_node <- rvest::html_element(html, "[class*=\"TextContributorName\"]")
18 |   author <- toString(stats::na.omit(rvest::html_text2(contributor_node)))
19 | 
20 |   # text: paragraphs and list items inside the article's rich-text or
21 |   # story-body containers, joined with newlines
22 |   containers <- rvest::html_elements(html, "article [class*=\"RichText\"],article .story-body")
23 |   body_nodes <- rvest::html_elements(containers, "p,li")
24 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
25 | 
26 |   # cover image: first image inside a <picture>, kept both as html markup
27 |   # and as the value of its src attribute
28 |   cover_node <- rvest::html_element(html, "picture img")
29 |   cover_image_html <- as.character(cover_node)
30 |   cover_image_url <- rvest::html_attr(cover_node, "src")
31 | 
32 |   s_n_list(
33 |     datetime,
34 |     author,
35 |     headline,
36 |     text,
37 |     cover_image_url,
38 |     cover_image_html
39 |   )
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/R/deliver_rollingstone_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # text of every JSON-LD script tag on the page
 8 |     json_txt <- html %>%
 9 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
10 |         rvest::html_text()
11 | 
12 |     # guard clause: return early when there is no JSON-LD metadata
13 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     # only the first JSON-LD block is read; its @graph lists the entities
18 |     graph <- jsonlite::fromJSON(json_txt[1])$`@graph`
19 | 
20 |     # author: names of all graph entries typed "Person", "" when there are none
21 |     is_person <- graph$`@type` == "Person"
22 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
23 | 
24 |     # publication time and headline come from the rows typed NewsArticle
25 |     article <- graph[grepl("NewsArticle", graph$`@type`), ]
26 |     datetime <- lubridate::as_datetime(article$datePublished)
27 |     headline <- article$headline
28 | 
29 |     # text: excerpt plus h2 headings and paragraphs of the content container
30 |     body_nodes <- rvest::html_elements(html, ".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p")
31 |     text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
32 | 
33 |     s_n_list(
34 |         datetime,
35 |         author,
36 |         headline,
37 |         text
38 |     )
39 | }
40 | 


--------------------------------------------------------------------------------
/R/deliver_irishmirror_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.irishmirror_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # update the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta element
 9 |   published_node <- rvest::html_element(html, "[property=\"article:published_time\"]")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
11 | 
12 |   # headline: the page title
13 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
14 | 
15 |   # author: content of the first element named "author", as a single string
16 |   author_node <- rvest::html_element(html, "[name=\"author\"]")
17 |   author <- toString(rvest::html_attr(author_node, "content"))
18 | 
19 |   # text: all paragraphs inside the article wrapper, joined with newlines
20 |   paragraphs <- rvest::html_elements(html, ".article-wrapper p")
21 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
22 | 
23 |   # cover image: first image inside an image container of the article
24 |   # wrapper, kept both as html markup and as the value of its src attribute
25 |   cover_node <- rvest::html_element(html, ".article-wrapper .img-container img")
26 |   cover_image_html <- as.character(cover_node)
27 |   cover_image_url <- rvest::html_attr(cover_node, "src")
28 | 
29 |   s_n_list(
30 |     datetime,
31 |     author,
32 |     headline,
33 |     text,
34 |     cover_image_url,
35 |     cover_image_html
36 |   )
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_skwawkbox_org.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.skwawkbox_org <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # advance the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: "datetime" attribute of the first .entry-date element
 9 |   date_node <- rvest::html_element(html, ".entry-date")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(date_node, "datetime"))
11 | 
12 |   # headline
13 |   headline <- rvest::html_text(rvest::html_element(html, ".entry-title"))
14 | 
15 |   # author: byline text as a single string with every literal "by " removed
16 |   byline <- toString(rvest::html_text2(rvest::html_element(html, ".byline")))
17 |   author <- gsub("by ", "", byline, fixed = TRUE)
18 | 
19 |   # text: direct paragraph children of the entry content, skipping the
20 |   # paragraph that contains the support appeal
21 |   paragraphs <- rvest::html_elements(html, ".entry-content>p:not(:contains('The SKWAWKBOX needs your support'))")
22 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
23 | 
24 |   # in-text links: anchors that are direct children of those paragraphs
25 |   link_nodes <- rvest::html_elements(html, ".entry-content>p:not(:contains('The SKWAWKBOX needs your support'))>a")
26 |   text_links <- as.list(rvest::html_attr(link_nodes, "href"))
27 | 
28 |   # the helper function safely creates a named list from objects
29 |   s_n_list(
30 |     datetime,
31 |     author,
32 |     headline,
33 |     text,
34 |     text_links
35 |   )
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_mediacourant_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.mediacourant_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta element
 9 |   datetime <- html %>%
10 |     rvest::html_element("[property=\"article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   # headline: the page title
15 |   headline <- html %>%
16 |     rvest::html_element("title") %>%
17 |     rvest::html_text2()
18 | 
19 |   # author: content of the first element named "author", as a single string
20 |   author <- html %>%
21 |     rvest::html_element("[name=\"author\"]")  %>%
22 |     rvest::html_attr("content") %>%
23 |     toString()
24 | 
25 |   # text: paragraphs and h2 headings inside .entry__content; both parts of the
26 |   # selector need the leading dot, otherwise "entry__content" is read as a tag
27 |   # name and the headings are never matched
28 |   text <- html %>%
29 |     rvest::html_elements(".entry__content p,.entry__content h2") %>%
30 |     rvest::html_text2() %>%
31 |     paste(collapse = "\n")
32 | 
33 |   # cover image: first image inside an article, as html markup and as src url
34 |   cover_image_html <- html %>%
35 |     rvest::html_element("article img") %>%
36 |     as.character()
37 | 
38 |   cover_image_url <- html %>%
39 |     rvest::html_element("article img") %>%
40 |     rvest::html_attr("src")
41 | 
42 |   # the helper function safely creates a named list from objects
43 |   s_n_list(
44 |     datetime,
45 |     author,
46 |     headline,
47 |     text,
48 |     cover_image_url,
49 |     cover_image_html
50 |   )
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/R/deliver_newstatesman_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.newstatesman_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # update the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta element
 9 |   published_node <- rvest::html_element(html, "[property=\"article:published_time\"]")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
11 | 
12 |   # headline: content of the og:title element
13 |   title_node <- rvest::html_element(html, "[property=\"og:title\"]")
14 |   headline <- rvest::html_attr(title_node, "content")
15 | 
16 |   # author: content of the first element named "author"
17 |   author_node <- rvest::html_element(html, "[name=\"author\"]")
18 |   author <- rvest::html_attr(author_node, "content")
19 | 
20 |   # text: all paragraphs inside the article content container
21 |   paragraphs <- rvest::html_elements(html, ".c-article-content__container p")
22 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
23 | 
24 |   # cover image: first image inside the featured-image container, kept both
25 |   # as html markup and as the value of its src attribute
26 |   cover_node <- rvest::html_element(html, ".c-featured-image__container img")
27 |   cover_image_html <- as.character(cover_node)
28 |   cover_image_url <- rvest::html_attr(cover_node, "src")
29 | 
30 |   s_n_list(
31 |     datetime,
32 |     author,
33 |     headline,
34 |     text,
35 |     cover_image_url,
36 |     cover_image_html
37 |   )
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/R/deliver_ndr_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 8 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
 9 |         return(s_n_list())
10 |     }
11 | 
12 |     # metadata is read from the first JSON-LD block only
13 |     json_df <- jsonlite::fromJSON(json_txt[1])
14 | 
15 |     # any(... %in% ...) always yields a single TRUE/FALSE, also when @type is
16 |     # missing or holds several values, which a chain of `!=` and `&&` cannot
17 |     # handle
18 |     is_media <- any(json_df$`@type` %in% c("VideoObject", "AudioObject"))
19 | 
20 |     if (!is_media) { # NewsArticle
21 |         datetime <- lubridate::as_datetime(json_df$datePublished)
22 |         headline <- json_df$headline
23 |         author <- toString(json_df$author$name)
24 |         # text: paragraphs and h2 headings of the copytext modules
25 |         text <- html %>%
26 |             rvest::html_elements(".modulepadding.copytext p, .modulepadding.copytext h2") %>%
27 |             rvest::html_text2() %>%
28 |             paste(collapse = "\n")
29 |     } else {
30 |         # video and audio pages: upload date, name and description from the
31 |         # JSON-LD object stand in for publication time, headline and text
32 |         datetime <- lubridate::as_datetime(json_df$uploadDate)
33 |         headline <- json_df$name
34 |         author <- ""
35 |         text <- json_df$description
36 |     }
37 | 
38 |     s_n_list(
39 |         datetime,
40 |         author,
41 |         headline,
42 |         text
43 |     )
44 | }
45 | 


--------------------------------------------------------------------------------
/R/deliver_ruhrnachrichten_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html kept in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # text of every JSON-LD script tag on the page
 8 |     json_txt <- html %>%
 9 |         rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
10 |         rvest::html_text()
11 | 
12 |     # guard clause: return early when there is no JSON-LD metadata
13 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
14 |         return(s_n_list())
15 |     }
16 | 
17 |     # only the first JSON-LD block is read; its @graph lists the entities
18 |     graph <- jsonlite::fromJSON(json_txt[1])$`@graph`
19 | 
20 |     # author: names of all graph entries typed "Person", "" when there are none
21 |     is_person <- graph$`@type` == "Person"
22 |     author <- if (any(is_person)) toString(graph$name[is_person]) else ""
23 | 
24 |     # publication time and headline come from the rows whose type matches
25 |     # the pattern below
26 |     article <- graph[grepl("NewsArticle|Article", graph$`@type`), ]
27 |     datetime <- lubridate::as_datetime(article$datePublished)
28 |     headline <- article$headline
29 | 
30 |     # text: teaser plus paragraphs and h2 headings of the article content;
31 |     # a trailing "Zur Startseite" line is stripped from the joined text
32 |     body_nodes <- rvest::html_elements(html, "p.article__teaser-text,.article__content p, .article__content h2")
33 |     joined <- paste(rvest::html_text2(body_nodes), collapse = "\n")
34 |     text <- gsub("\nZur Startseite$", "", joined)
35 | 
36 |     s_n_list(
37 |         datetime,
38 |         author,
39 |         headline,
40 |         text
41 |     )
42 | }
43 | 


--------------------------------------------------------------------------------
/R/deliver_thejournal_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.thejournal_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # update the progress bar, then parse the raw html kept in content_raw
 5 |   pb_tick(x, verbose, pb)
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:post_date meta element
 9 |   post_date_node <- rvest::html_element(html, "[property=\"article:post_date\"]")
10 |   datetime <- lubridate::as_datetime(rvest::html_attr(post_date_node, "content"))
11 | 
12 |   # headline: the page title
13 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
14 | 
15 |   # author: content of every article:author element, comma separated
16 |   author_nodes <- rvest::html_elements(html, "[property=\"article:author\"]")
17 |   author <- toString(rvest::html_attr(author_nodes, "content"))
18 | 
19 |   # text: paragraphs of the article body, except those carrying the
20 |   # article-updated-redesign class
21 |   paragraphs <- rvest::html_elements(html, "[itemprop=\"articleBody\"] p:not(.article-updated-redesign)")
22 |   text <- paste(rvest::html_text2(paragraphs), collapse = "\n")
23 | 
24 |   # cover image: the primary image element, kept both as html markup and as
25 |   # the value of its srcset attribute
26 |   cover_node <- rvest::html_element(html, ".article-primary-img-redesign")
27 |   cover_image_html <- as.character(cover_node)
28 |   cover_image_url <- rvest::html_attr(cover_node, "srcset")
29 | 
30 |   s_n_list(
31 |     datetime,
32 |     author,
33 |     headline,
34 |     text,
35 |     cover_image_url,
36 |     cover_image_html
37 |   )
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/R/inspect.R:
--------------------------------------------------------------------------------
 1 | #' Inspect content collected with pb_collect
 2 | #'
 3 | #' Opens a browser to display the content saved in a row of a data.frame created
 4 | #' with \link{pb_collect}. Content containing "<" or ">" is treated as html and
 5 | #' served from a temporary local web server; any other content is handed to the
 6 | #' browser directly as a URL.
 7 | #'
 8 | #' @param x a data.frame returned by \link{pb_collect}.
 9 | #' @param i which entry to display; a single row index of \code{x}.
10 | #' @param host_ip,port host IP and port to create the temporary web server that
11 | #'   shows the content.
12 | #'
13 | #' @export
14 | pb_inspect <- function(x,
15 |                        i = 1L,
16 |                        host_ip = "127.0.0.1",
17 |                        port = httpuv::randomPort()) {
18 | 
19 |   # must run before `port` is first used: its default value calls into httpuv
20 |   rlang::check_installed("httpuv")
21 | 
22 |   if (!"content_raw" %in% names(x))
23 |     stop("Only works with output from pb_collect()")
24 | 
25 |   # reject unusable indices up front instead of failing later with an opaque
26 |   # "missing value where TRUE/FALSE needed" error
27 |   n_entries <- length(x$content_raw)
28 |   if (!is.numeric(i) || length(i) != 1L || is.na(i) || i < 1 || i > n_entries)
29 |     stop("`i` must be a single index between 1 and ", n_entries)
30 | 
31 |   content <- x$content_raw[i]
32 |   if (is.na(content))
33 |     stop("Entry ", i, " has no content to display")
34 | 
35 |   # stop a server left over from an earlier call before starting a new one
36 |   if (!is.null(paperboy.env$server)) paperboy.env$server$stop()
37 | 
38 |   if (grepl("<|>", content)) {
39 |     # html content: serve it for every request, then open the server's address
40 |     paperboy.env$server <- httpuv::startServer(
41 |       host = host_ip,
42 |       port = port,
43 |       app = list(
44 |         call = function(req) {
45 |           list(
46 |             status = 200L,
47 |             headers = list("Content-Type" = "text/html"),
48 |             body = content
49 |           )
50 |         }
51 |       )
52 |     )
53 |     utils::browseURL(paste0("http://", host_ip, ":", port))
54 |   } else {
55 |     # no markup found: pass the stored value to the browser as it is
56 |     utils::browseURL(content)
57 |   }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/R/deliver_nos_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nos_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the og:article:published_time meta element
 9 |   datetime <- html %>%
10 |     rvest::html_element("[property=\"og:article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   # headline: the page title
15 |   headline <- html %>%
16 |     rvest::html_element("title") %>%
17 |     rvest::html_text2()
18 | 
19 |   # author: a missing value is dropped before flattening into one string
20 |   author <- html %>%
21 |     rvest::html_element(".NYlVB")  %>%
22 |     rvest::html_text2() %>%
23 |     stats::na.omit() %>%
24 |     toString()
25 | 
26 |   # text: paragraphs and h2 headings inside #content
27 |   paragraphs <- html %>%
28 |     rvest::html_elements("#content p,#content h2") %>%
29 |     rvest::html_text2()
30 | 
31 |   # drop the share-button label with a plain filter; setdiff() would also
32 |   # de-duplicate and silently remove repeated paragraphs
33 |   text <- paragraphs[!paragraphs %in% "Deel artikel:"] %>%
34 |     paste(collapse = "\n")
35 | 
36 |   # cover image: the picture inside a button in #content, as html markup,
37 |   # plus the src attribute of its image
38 |   cover_image_html <- html %>%
39 |     rvest::html_element("#content button picture") %>%
40 |     as.character()
41 | 
42 |   cover_image_url <- html %>%
43 |     rvest::html_element("#content button picture img") %>%
44 |     rvest::html_attr("src")
45 | 
46 |   # the helper function safely creates a named list from objects
47 |   s_n_list(
48 |     datetime,
49 |     author,
50 |     headline,
51 |     text,
52 |     cover_image_url,
53 |     cover_image_html
54 |   )
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/R/deliver_metronieuws_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.metronieuws_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw html stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta tag
 9 |   published_node <- rvest::html_element(
10 |     html, "[property=\"article:published_time\"]"
11 |   )
12 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
13 | 
14 |   # headline
15 |   headline <- rvest::html_text2(rvest::html_element(html, ".article__title"))
16 | 
17 |   # author: content of every name="author" tag, comma separated
18 |   author_nodes <- rvest::html_elements(html, "[name=\"author\"]")
19 |   author <- toString(rvest::html_attr(author_nodes, "content"))
20 | 
21 |   # text: direct-child paragraphs and h2 headings of .article__content
22 |   body_nodes <- rvest::html_elements(
23 |     html,
24 |     ".article__content>p,.article__content>h2:not(.coral-talk-heading)"
25 |   )
26 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
27 | 
28 |   # cover image: one lookup serves both the markup and the src attribute
29 |   cover_node <- rvest::html_element(html, ".featured-image img")
30 |   cover_image_html <- as.character(cover_node)
31 |   cover_image_url <- rvest::html_attr(cover_node, "src")
32 | 
33 |   # the helper function safely creates a named list from objects
34 |   s_n_list(
35 |     datetime,
36 |     author,
37 |     headline,
38 |     text,
39 |     cover_image_url,
40 |     cover_image_html
41 |   )
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/R/deliver_irishexaminer_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.irishexaminer_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw html stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: content of the article:published_time meta tag
11 |   published_node <- rvest::html_element(
12 |     html, "[property=\"article:published_time\"]"
13 |   )
14 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
15 | 
16 |   # headline
17 |   headline <- rvest::html_text2(rvest::html_element(html, ".article-title"))
18 | 
19 |   # author: text of the first .author-byline element
20 |   author <- toString(
21 |     rvest::html_text2(rvest::html_element(html, ".author-byline"))
22 |   )
23 | 
24 |   # text: every paragraph inside <article>, one per line
25 |   body_nodes <- rvest::html_elements(html, "article p")
26 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
27 | 
28 |   # cover image: first <img> inside a <picture>
29 |   cover_node <- rvest::html_element(html, "picture img")
30 |   cover_image_html <- as.character(cover_node)
31 |   cover_image_url <- rvest::html_attr(cover_node, "src")
32 | 
33 |   # the site prefix is only added when a src was actually found
34 |   if (!is.na(cover_image_url)) {
35 |     cover_image_url <- paste0("https://www.irishexaminer.com", cover_image_url)
36 |   }
37 | 
38 |   s_n_list(
39 |     datetime,
40 |     author,
41 |     headline,
42 |     text,
43 |     cover_image_url,
44 |     cover_image_html
45 |   )
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/R/deliver_mopo_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # article metadata is read from the first JSON-LD script of the page
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 9 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
10 |         return(s_n_list())
11 |     }
12 | 
13 |     json_df <- jsonlite::fromJSON(json_txt[1])
14 |     json_df <- json_df$`@graph`
15 | 
16 |     # authors are the "Person" entries of the graph; which() ignores NA types,
17 |     # so an entry without @type cannot break the condition below
18 |     person_rows <- which(json_df$`@type` == "Person")
19 |     if (length(person_rows) > 0) {
20 |         author <- toString(json_df$name[person_rows])
21 |     } else {
22 |         author <- ""
23 |     }
24 | 
25 |     json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
26 |     datetime <- lubridate::as_datetime(json_df$datePublished)
27 |     # drop everything after " | " in the headline
28 |     headline <- sub(" \\| .*", "", json_df$headline)
29 |     text <- html %>%
30 |         rvest::html_elements("p, h2") %>%
31 |         rvest::html_text2() %>%
32 |         paste(collapse = "\n")
33 | 
34 |     if (author == "") {
35 |         # fallback: an author abbreviation in parentheses at the very end of the
36 |         # text. sub() returns its input unchanged when nothing matches, which
37 |         # would turn the whole article into the author, so test for a match first
38 |         abbr_pattern <- ".*\\(([^)]+)\\)$"
39 |         if (grepl(abbr_pattern, text)) {
40 |             author <- sub(abbr_pattern, "\\1", text)
41 |         } else {
42 |             author <- NA_character_
43 |         }
44 |     }
45 | 
46 |     s_n_list(
47 |         datetime,
48 |         author,
49 |         headline,
50 |         text
51 |     )
52 | }
53 | 


--------------------------------------------------------------------------------
/R/deliver_aktualne_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.aktualne_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw html stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta tag
 9 |   published_node <- rvest::html_element(
10 |     html, "[property=\"article:published_time\"]"
11 |   )
12 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
13 | 
14 |   # headline
15 |   headline <- rvest::html_text2(rvest::html_element(html, ".article-title"))
16 | 
17 |   # author: all .author__name elements, comma separated
18 |   author_nodes <- rvest::html_elements(html, ".author__name")
19 |   author <- toString(rvest::html_text2(author_nodes))
20 | 
21 |   # text: .article__perex elements and the paragraphs in #article-content
22 |   body_nodes <- rvest::html_elements(
23 |     html, ".article .article__perex,#article-content p"
24 |   )
25 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
26 | 
27 |   # cover image: one lookup serves both the markup and the src attribute
28 |   cover_node <- rvest::html_element(html, ".article__photo--opener img")
29 |   cover_image_html <- as.character(cover_node)
30 |   cover_image_url <- rvest::html_attr(cover_node, "src")
31 | 
32 |   # the scheme is only prepended when a src was found
33 |   if (!is.na(cover_image_url))
34 |     cover_image_url <- paste0("https:", cover_image_url)
35 | 
36 |   s_n_list(
37 |     datetime,
38 |     author,
39 |     headline,
40 |     text,
41 |     cover_image_url,
42 |     cover_image_html
43 |   )
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/R/deliver_idnes_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.idnes_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw; it is converted from
 6 |   # windows-1250 to UTF-8 before parsing
 7 |   x$content_raw <- iconv(x$content_raw, from = "windows-1250", to = "UTF-8")
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime
11 |   datetime <- html %>%
12 |     rvest::html_element("[property=\"article:published_time\"]") %>%
13 |     rvest::html_attr("content") %>%
14 |     lubridate::as_datetime()
15 | 
16 |   # headline
17 |   headline <- html %>%
18 |     rvest::html_element(".content h1") %>%
19 |     rvest::html_text2()
20 | 
21 |   # author
22 |   author <- html %>%
23 |     rvest::html_element("[property=\"article:author\"]")  %>%
24 |     rvest::html_attr("content") %>%
25 |     toString()
26 | 
27 |   # text
28 |   text <- html %>%
29 |     rvest::html_elements(".opener,.text p") %>%
30 |     rvest::html_text2() %>%
31 |     paste(collapse = "\n")
32 | 
33 |   # markup of all opening images/videos, one element per line
34 |   cover_image_html <- html %>%
35 |     rvest::html_elements(".art-full img,video") %>%
36 |     as.character() %>%
37 |     paste(collapse = "\n")
38 | 
39 |   # src of the first opening image/video
40 |   cover_image_url <- html %>%
41 |     rvest::html_element(".art-full img,video") %>%
42 |     rvest::html_attr("src")
43 | 
44 |   # only prepend the scheme when a src was found; paste0("https:", NA) would
45 |   # otherwise produce the literal string "https:NA"
46 |   if (!is.na(cover_image_url)) {
47 |     cover_image_url <- paste0("https:", cover_image_url)
48 |   }
49 | 
50 |   s_n_list(
51 |     datetime,
52 |     author,
53 |     headline,
54 |     text,
55 |     cover_image_url,
56 |     cover_image_html
57 |   )
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/R/deliver_independent_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.independent_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw html stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: content of the article:modified_time meta tag
11 |   modified_node <- rvest::html_element(
12 |     html, "[property=\"article:modified_time\"]"
13 |   )
14 |   datetime <- lubridate::as_datetime(rvest::html_attr(modified_node, "content"))
15 | 
16 |   # headline: content of the og:title meta tag
17 |   headline <- rvest::html_attr(
18 |     rvest::html_element(html, "[property=\"og:title\"]"),
19 |     "content"
20 |   )
21 |
22 |   # author: content of every matching cXense author tag, comma separated
23 |   author_nodes <- rvest::html_elements(
24 |     html, "[name=\"cXenseParse:mhu-article_author\"]"
25 |   )
26 |   author <- toString(rvest::html_attr(author_nodes, "content"))
27 | 
28 |   # text: paragraphs of the articleDetail fragment, one per line
29 |   body_nodes <- rvest::html_elements(
30 |     html, "[data-fragment-name=\"articleDetail\"] p"
31 |   )
32 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
33 | 
34 |   # cover image: one lookup serves both the markup and the src attribute
35 |   cover_node <- rvest::html_element(
36 |     html, "[data-testid=\"article-image-wrapper\"] img"
37 |   )
38 |   cover_image_html <- as.character(cover_node)
39 |   cover_image_url <- rvest::html_attr(cover_node, "src")
40 | 
41 |   s_n_list(
42 |     datetime,
43 |     author,
44 |     headline,
45 |     text,
46 |     cover_image_url,
47 |     cover_image_html
48 |   )
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/R/deliver_rtl_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rtl_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw html stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: content of the article:published_time meta tag ...
11 |   published_node <- rvest::html_element(
12 |     html, "[property=\"article:published_time\"]"
13 |   )
14 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
15 | 
16 |   # ... falling back to the datetime attribute of the first <time> element
17 |   if (is.na(datetime)) {
18 |     time_node <- rvest::html_element(html, "time")
19 |     datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
20 |   }
21 | 
22 |   # headline: content of the og:title meta tag
23 |   headline <- rvest::html_attr(
24 |     rvest::html_element(html, "[property=\"og:title\"]"),
25 |     "content"
26 |   )
27 | 
28 |   # author: text of the data-testid="author" element, cut at the first "."
29 |   byline <- toString(
30 |     rvest::html_text2(rvest::html_element(html, "[data-testid=\"author\"]"))
31 |   )
32 |   # would be cleaner to remove the child, but not sure how
33 |   author <- gsub("\\..*", "", byline)
34 | 
35 |   # text: every paragraph inside <main>, one per line
36 |   body_nodes <- rvest::html_elements(html, "main p")
37 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
38 | 
39 |   # the helper function safely creates a named list from objects
40 |   s_n_list(
41 |     datetime,
42 |     author,
43 |     headline,
44 |     text
45 |   )
46 | 
47 | }
48 | 
49 | # the rtlnieuws_nl method is the same function under a second name
50 | pb_deliver_paper.rtlnieuws_nl <- pb_deliver_paper.rtl_nl
51 | 


--------------------------------------------------------------------------------
/submit2cran.r:
--------------------------------------------------------------------------------
 1 | # Release checklist for the package; the steps are meant to be run in order.
 2 | library(tidyverse)
 3 | ## Update roxygen and check
 4 | roxygen2::roxygenise(clean = TRUE)
 5 | devtools::check()
 6 | 
 7 | ## Check code quality
 8 | lintr::lint_package()
 9 | goodpractice::gp()
10 | 
11 | ## Check spelling
12 | spelling::spell_check_package()
13 | spelling::update_wordlist()
14 | spelling::spell_check_files("README.Rmd", ignore = readLines("./inst/WORDLIST"), lang = "en-GB")
15 | 
16 | ## build manual
17 | devtools::build_manual()
18 | 
19 | # build readme
20 | parser_df <- rio::import("inst/status.csv") %>%
21 |   arrange(domain)
22 | ## check if all parsers are listed
23 | ## (the pattern is anchored and the dot escaped so that only a literal
24 | ## leading "www." is stripped, never "www" + any character mid-domain)
25 | parser_available <- pb_available() %>%
26 |   str_remove("^www\\.") %>%
27 |   tibble(parser = .)
28 | 
29 | ## parsers that have no row in inst/status.csv
30 | parser_available %>%
31 |   anti_join(parser_df, by = c("parser" = "domain"))
32 | 
33 | rio::export(parser_df, "inst/status.csv")
34 | devtools::build_readme()
35 | ## replace the escaped "[\#" sequences in the rendered README with "[#"
36 | lines <- readLines("README.md")
37 | writeLines(gsub("[\\#", "[#", lines, fixed = TRUE), "README.md")
38 | 
39 | 
40 | 
41 | ## build vignette
42 | devtools::build_vignettes()
43 | 
44 | ## test covr
45 | devtools::test_coverage()
46 | 
47 | 
48 | # For release on CRAN
49 | ## test on winbuilder
50 | devtools::check_win_devel()
51 | devtools::check_win_oldrelease()
52 | devtools::check_win_release()
53 | 
54 | ## check r_hub
55 | ch <- rhub::check_for_cran(show_status = FALSE)
56 | ch$livelog() # check status
57 | 
58 | ## release
59 | revdepcheck::revdep_check()
60 | devtools::release()
61 | 


--------------------------------------------------------------------------------
/R/deliver_irozhlas_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.irozhlas_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw html stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # data about the article is stored in JSON-LD scripts; every script is
 9 |   # parsed into nested lists and the first parsed script is kept
10 |   ld_nodes <- rvest::html_elements(html, "[type=\"application/ld+json\"]")
11 |   ld_parsed <- lapply(
12 |     rvest::html_text(ld_nodes),
13 |     jsonlite::fromJSON,
14 |     simplifyVector = FALSE
15 |   )
16 |   data <- purrr::pluck(ld_parsed, 1L)
17 | 
18 |   # usually there is more than one entry; keep the one typed "NewsArticle"
19 |   if (length(data) > 0L) {
20 |     entry_types <- purrr::map_chr(data, function(entry) {
21 |       purrr::pluck(entry, "@type", .default = NA_character_)
22 |     })
23 | 
24 |     data <- purrr::pluck(data, which(entry_types == "NewsArticle"))
25 |   }
26 | 
27 |   # datetime
28 |   datetime <- lubridate::as_datetime(data$datePublished)
29 | 
30 |   # headline
31 |   headline <- data$headline
32 | 
33 |   # author: the "name" field of every author entry, comma separated
34 |   author <- toString(purrr::map_chr(data$author, "name"))
35 | 
36 |   # text: article paragraphs, excluding .meta and audio-player paragraphs
37 |   body_nodes <- rvest::html_elements(
38 |     html, "article p:not(.meta):not([class*=\"b-audio-player\"])"
39 |   )
40 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
41 | 
42 |   # cover image url from the JSON data, NA when it is not present
43 |   cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_)
44 | 
45 |   s_n_list(
46 |     datetime,
47 |     author,
48 |     headline,
49 |     text,
50 |     cover_image_url
51 |   )
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: paperboy
 2 | Title: A Comprehensive Collection of News Media Scrapers
 3 | Version: 0.0.7.9000
 4 | Date: 2024-12-26
 5 | Authors@R:
 6 |     c(person(given = "Johannes B.",
 7 |              family = "Gruber",
 8 |              email = "JohannesB.Gruber@gmail.com", 
 9 |              role = c("aut", "cre"),
10 |              comment = c(ORCID = "0000-0001-9177-1772")),
11 |       person(given = "David",
12 |              family = "Schoch",
13 |              email = "david@schochastics.net",
14 |              role = "ctb",
15 |              comment = c(ORCID = "0000-0003-2952-4812")))
16 | Description: A comprehensive collection of webscraping scripts for news media sites.
17 | Depends:
18 |     R (>= 3.5.0)
19 | License: GPL-3
20 | Imports:
21 |     adaR,
22 |     callr,
23 |     cli,
24 |     cookiemonster,
25 |     curl,
26 |     dplyr,
27 |     jsonlite,
28 |     lubridate,
29 |     magrittr,
30 |     methods,
31 |     praise,
32 |     purrr,
33 |     rlang,
34 |     rvest,
35 |     tibble,
36 |     tidyr,
37 |     tidyselect,
38 |     utils,
39 |     xml2
40 | Suggests:
41 |     covr,
42 |     httpuv,
43 |     knitr,
44 |     rmarkdown,
45 |     rstudioapi,
46 |     spelling,
47 |     testthat,
48 |     withr
49 | URL: https://github.com/JBGruber/paperboy, https://jbgruber.github.io/paperboy/
50 | Encoding: UTF-8
51 | BugReports: https://github.com/JBGruber/paperboy/issues
52 | RoxygenNote: 7.3.2
53 | VignetteBuilder: knitr
54 | Language: en-GB
55 | 


--------------------------------------------------------------------------------
/R/deliver_denikn_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.denikn_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # parse the raw html stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime: content of the article:published_time meta tag
 9 |   published_node <- rvest::html_element(
10 |     html, "[property=\"article:published_time\"]"
11 |   )
12 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
13 | 
14 |   # headline: text of the <title> element
15 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
16 | 
17 |   # author: text of the first .e_author_t element
18 |   author <- toString(
19 |     rvest::html_text2(rvest::html_element(html, ".e_author_t"))
20 |   )
21 | 
22 |   # text: every paragraph inside <article>, one per line
23 |   body_nodes <- rvest::html_elements(html, "article p")
24 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
25 | 
26 |   # a .e_lock__hard element flags the article as paywalled; the text is
27 |   # then marked as truncated
28 |   paywall <- length(rvest::html_element(html, ".e_lock__hard")) > 0
29 |   if (paywall) {
30 |     text <- paste("[Paywall-Truncated]", text)
31 |   }
32 | 
33 |   # cover image: one lookup serves both the markup and the src attribute
34 |   cover_node <- rvest::html_element(html, "header .b_single_i img")
35 |   cover_image_html <- as.character(cover_node)
36 |   cover_image_url <- rvest::html_attr(cover_node, "src")
37 | 
38 |   s_n_list(
39 |     datetime,
40 |     author,
41 |     headline,
42 |     text,
43 |     paywall,
44 |     cover_image_url,
45 |     cover_image_html
46 |   )
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/R/html_search.R:
--------------------------------------------------------------------------------
 1 | #' Search raw html for attributes
 2 | #'
 3 | #' @param html raw html
 4 | #' @param selectors a vector of CSS selectors to include in search.
 5 | #' @param attributes attributes to extract; the special value "text" requests
 6 | #'   the text of the matched elements. If NULL, returns text.
 7 | #' @param all if TRUE, all selectors are collected. Otherwise, only the first
 8 | #'   non-empty result is used.
 9 | #' @param n if multiple are found, how many to return
10 | #'
11 | #' @return a vector of max length n; NA if nothing was found
12 | #' @keywords internal
13 | html_search <- function(html,
14 |                         selectors,
15 |                         attributes = NULL,
16 |                         all = TRUE,
17 |                         n = 1L) {
18 | 
19 |   if (all) {
20 |     res <- rvest::html_elements(html, paste0(selectors, collapse = ","))
21 |   } else {
22 |     # try the selectors in order and stop at the first one that matches;
23 |     # iterating over the vector makes sure the last selector is tried too
24 |     res <- NULL
25 |     for (sel in selectors) {
26 |       res <- rvest::html_elements(html, sel)
27 |       if (length(res) > 0L) break
28 |     }
29 |   }
30 | 
31 |   # nothing matched (or no selector was supplied)
32 |   if (length(res) < 1L) {
33 |     return(NA_character_)
34 |   }
35 | 
36 |   # text is returned when asked for explicitly or when no attributes are given
37 |   want_text <- is.null(attributes) || "text" %in% attributes
38 |   if (want_text) attributes <- setdiff(attributes, "text")
39 | 
40 |   # flatten the attributes of all matches and keep the requested ones
41 |   out <- rvest::html_attrs(res) %>%
42 |     unlist(recursive = FALSE) %>%
43 |     subset(., names(.) %in%
44 |              attributes) %>%
45 |     unname()
46 | 
47 |   if (want_text) out <- c(out, rvest::html_text2(res))
48 | 
49 |   # a zero-length result counts as "not found", the same as NULL
50 |   if (length(out) == 0L) {
51 |     return(NA_character_)
52 |   } else {
53 |     return(utils::head(out, n))
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/R/deliver_novinky_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.novinky_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # article metadata is read from the JSON in the first script inside
 9 |   # .page-detail; try() keeps a parsing failure from stopping the function
10 |   page_data <- try({html %>%
11 |     rvest::html_element(".page-detail script") %>%
12 |     rvest::html_text() %>%
13 |     jsonlite::fromJSON()}, silent = TRUE)
14 | 
15 |   # datetime
16 |   datetime <- purrr::pluck(page_data, "datePublished", .default = NA_character_) %>%
17 |     lubridate::as_datetime()
18 | 
19 |   # headline
20 |   headline <- purrr::pluck(page_data, "headline", .default = NA_character_)
21 | 
22 |   # author
23 |   author <- purrr::pluck(page_data, "author", "name", .default = NA_character_) %>%
24 |     toString()
25 | 
26 |   # text
27 |   text <- html %>%
28 |     rvest::html_elements(".j_if .speakable") %>%
29 |     rvest::html_text2() %>%
30 |     paste(collapse = "\n")
31 | 
32 |   cover_image_html <- html %>%
33 |     rvest::html_element(".ogm-main-media__container img") %>%
34 |     as.character()
35 | 
36 |   cover_image_url <- html %>%
37 |     rvest::html_element(".ogm-main-media__container img") %>%
38 |     rvest::html_attr("src")
39 | 
40 |   # only prepend the scheme when a src was found; paste0("https:", NA) would
41 |   # otherwise produce the literal string "https:NA"
42 |   if (!is.na(cover_image_url)) {
43 |     cover_image_url <- paste0("https:", cover_image_url)
44 |   }
45 | 
46 |   # the helper function safely creates a named list from objects
47 |   s_n_list(
48 |     datetime,
49 |     author,
50 |     headline,
51 |     text,
52 |     cover_image_url,
53 |     cover_image_html
54 |   )
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/R/deliver_der_postillon_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # article metadata is read from the first JSON-LD script of the page
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 9 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
10 |         return(s_n_list())
11 |     } else {
12 |         json_df <- jsonlite::fromJSON(json_txt[1])
13 | 
14 |         datetime <- lubridate::as_datetime(json_df$datePublished)
15 |         headline <- json_df$headline
16 |         author <- toString(json_df$author$name)
17 |         text <- html %>%
18 |             rvest::html_elements(".post-body p") %>%
19 |             rvest::html_text2() %>%
20 |             paste(collapse = "\n")
21 | 
22 |         # author abbr can be found at the end of the article
23 |         if (author == "Der Postillon") {
24 |             author_tmp <- html %>%
25 |                 rvest::html_element("div[id='post-body'] span[style='font-size: x-small;']") %>%
26 |                 rvest::html_text() %>%
27 |                 sub("; Erstver.*$", "", .)
28 |             # the span may be absent, in which case author_tmp is NA; an NA
29 |             # comparison inside if() raises an error, so test for NA first
30 |             if (!is.na(author_tmp) && author_tmp != "") {
31 |                 author <- author_tmp
32 |             }
33 |         }
34 |         s_n_list(
35 |             datetime,
36 |             author,
37 |             headline,
38 |             text
39 |         )
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/R/deliver_ksta_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # article metadata is read from the first JSON-LD script of the page
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 9 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
10 |         return(s_n_list())
11 |     }
12 | 
13 |     json_df <- jsonlite::fromJSON(json_txt[1])
14 |     json_df <- json_df$`@graph`
15 | 
16 |     # authors are the "Person" entries of the graph; which() ignores NA types,
17 |     # so an entry without @type cannot break the condition below
18 |     person_rows <- which(json_df$`@type` == "Person")
19 |     if (length(person_rows) > 0) {
20 |         author <- toString(json_df$name[person_rows])
21 |     } else {
22 |         author <- ""
23 |     }
24 | 
25 |     json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
26 |     datetime <- lubridate::as_datetime(json_df$datePublished)
27 |     # drop everything after " | " in the headline
28 |     headline <- sub(" \\| .*", "", json_df$headline)
29 |     text <- html %>%
30 |         rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>%
31 |         rvest::html_text2() %>%
32 |         paste(collapse = "\n")
33 | 
34 |     if (author == "") {
35 |         # fallback: an author abbreviation in parentheses at the very end of the
36 |         # text. sub() returns its input unchanged when nothing matches, which
37 |         # would turn the whole article into the author, so test for a match first
38 |         abbr_pattern <- ".*\\(([^)]+)\\)$"
39 |         if (grepl(abbr_pattern, text)) {
40 |             author <- sub(abbr_pattern, "\\1", text)
41 |         } else {
42 |             author <- NA_character_
43 |         }
44 |     }
45 | 
46 |     s_n_list(
47 |         datetime,
48 |         author,
49 |         headline,
50 |         text
51 |     )
52 | }
53 | 


--------------------------------------------------------------------------------
/R/deliver_rte_ie.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rte_ie <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # parse the raw html stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # datetime: content of the article:published_time meta tag
11 |   published_node <- rvest::html_element(
12 |     html, "[property=\"article:published_time\"]"
13 |   )
14 |   datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))
15 | 
16 |   # headline: text of the <title> element
17 |   headline <- rvest::html_text2(rvest::html_element(html, "title"))
18 | 
19 |   # author: content of every itemprop="name" child of an itemprop="author"
20 |   # element, comma separated
21 |   author_nodes <- rvest::html_elements(
22 |     html, "[itemprop=\"author\"]>[itemprop=\"name\"]"
23 |   )
24 |   author <- toString(rvest::html_attr(author_nodes, "content"))
25 | 
26 |   # text: paragraphs of .article-body, one per line
27 |   body_nodes <- rvest::html_elements(html, ".article-body p")
28 |   text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
29 | 
30 |   # type: content of the article-type meta tag
31 |   type <- rvest::html_attr(
32 |     rvest::html_element(html, "[name=\"article-type\"]"),
33 |     "content"
34 |   )
35 | 
36 |   # cover image: one lookup serves both the markup and the src attribute
37 |   cover_node <- rvest::html_element(html, "#main-article-image img")
38 |   cover_image_html <- as.character(cover_node)
39 |   cover_image_url <- rvest::html_attr(cover_node, "src")
40 | 
41 |   s_n_list(
42 |     datetime,
43 |     author,
44 |     headline,
45 |     text,
46 |     type,
47 |     cover_image_url,
48 |     cover_image_html
49 |   )
50 | 
51 | }
52 | 


--------------------------------------------------------------------------------
/R/deliver_express_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     # article metadata is read from the first JSON-LD script of the page
 8 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 9 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
10 |         return(s_n_list())
11 |     }
12 | 
13 |     json_df <- jsonlite::fromJSON(json_txt[1])
14 |     json_df <- json_df$`@graph`
15 | 
16 |     # authors are the "Person" entries of the graph; which() ignores NA types,
17 |     # so an entry without @type cannot break the condition below
18 |     person_rows <- which(json_df$`@type` == "Person")
19 |     if (length(person_rows) > 0) {
20 |         author <- toString(json_df$name[person_rows])
21 |     } else {
22 |         author <- ""
23 |     }
24 | 
25 |     json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
26 |     datetime <- lubridate::as_datetime(json_df$datePublished)
27 |     # drop everything after " | " in the headline
28 |     headline <- sub(" \\| .*", "", json_df$headline)
29 |     text <- html %>%
30 |         rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>%
31 |         rvest::html_text2() %>%
32 |         paste(collapse = "\n")
33 | 
34 |     if (author == "") {
35 |         # fallback: an author abbreviation in parentheses at the very end of the
36 |         # text. sub() returns its input unchanged when nothing matches, which
37 |         # would turn the whole article into the author, so test for a match first
38 |         abbr_pattern <- ".*\\(([^)]+)\\)$"
39 |         if (grepl(abbr_pattern, text)) {
40 |             author <- sub(abbr_pattern, "\\1", text)
41 |         } else {
42 |             author <- NA_character_
43 |         }
44 |     }
45 | 
46 |     s_n_list(
47 |         datetime,
48 |         author,
49 |         headline,
50 |         text
51 |     )
52 | }
53 | 


--------------------------------------------------------------------------------
/R/deliver_irishtimes_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.irishtimes_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # raw html is stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   # every JSON-LD script of the page, parsed
11 |   data <- html %>%
12 |     rvest::html_elements("[type=\"application/ld+json\"]") %>%
13 |     rvest::html_text2() %>%
14 |     lapply(jsonlite::fromJSON)
15 | 
16 |   # keep the first entry typed "NewsArticle". This also unwraps the entry
17 |   # when the page holds exactly one script; data becomes NA when there is no
18 |   # script at all or none of them is a NewsArticle.
19 |   if (length(data) > 0L) {
20 |     tp <- purrr::map_chr(data, function(x)
21 |       purrr::pluck(x, "@type", 1L, .default = NA_character_))
22 | 
23 |     hit <- which(tp == "NewsArticle")
24 |     data <- if (length(hit) > 0L) data[[hit[1L]]] else NA
25 |   } else {
26 |     data <- NA
27 |   }
28 | 
29 |   if (!isTRUE(is.na(data))) {
30 | 
31 |     # datetime
32 |     datetime <- data$datePublished %>%
33 |       lubridate::as_datetime()
34 | 
35 |     # headline
36 |     headline <- data$headline
37 | 
38 |     # author
39 |     author <- data$author$name %>%
40 |       toString()
41 | 
42 |     # text
43 |     text <- html %>%
44 |       rvest::html_elements("article p") %>%
45 |       rvest::html_text2() %>%
46 |       paste(collapse = "\n")
47 | 
48 |     # first image listed in the JSON data, NA when there is none
49 |     cover_image_url <- purrr::pluck(data$image, 1, .default = NA)
50 | 
51 |     type <- data$`@type`
52 | 
53 |     s_n_list(
54 |       datetime,
55 |       author,
56 |       headline,
57 |       text,
58 |       type,
59 |       cover_image_url
60 |     )
61 |   } else {
62 |     s_n_list()
63 |   }
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/R/deliver_rnd_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # parse the raw html stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     ld_nodes <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
 8 |     json_txt <- rvest::html_text(ld_nodes)
 9 | 
10 |     # no JSON-LD script at all: nothing to extract
11 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
12 |         return(s_n_list())
13 |     }
14 |     # the metadata is read from the third script, so at least three are needed
15 |     if (length(json_txt) <= 2) {
16 |         return(s_n_list())
17 |     }
18 | 
19 |     json_df <- jsonlite::fromJSON(json_txt[3])
20 |     datetime <- lubridate::as_datetime(json_df$datePublished)
21 |     headline <- json_df$headline
22 |     author <- toString(json_df$author$name)
23 | 
24 |     # text and headline elements of the whole page ...
25 |     part_selector <- ".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0"
26 |     all_parts <- rvest::html_text2(rvest::html_elements(html, part_selector))
27 | 
28 |     # ... minus those that sit in lists of related items
29 |     related_boxes <- rvest::html_elements(html, "div[data-is-element-rendered='true']")
30 |     related_parts <- rvest::html_text2(rvest::html_elements(related_boxes, part_selector))
31 | 
32 |     kept_parts <- all_parts[!all_parts %in% related_parts]
33 |     text <- paste(kept_parts, collapse = "\n")
34 | 
35 |     s_n_list(
36 |         datetime,
37 |         author,
38 |         headline,
39 |         text
40 |     )
41 | }
42 | 


--------------------------------------------------------------------------------
/R/deliver_sky_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.sky_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   # updates progress bar
 5 |   pb_tick(x, verbose, pb)
 6 | 
 7 |   # raw html is stored in column content_raw
 8 |   html <- rvest::read_html(x$content_raw)
 9 | 
10 |   data <- html %>%
11 |     rvest::html_elements("[type=\"application/ld+json\"]") %>%
12 |     rvest::html_text2() %>%
13 |     lapply(jsonlite::fromJSON)
14 | 
15 |   # usually there are more than one,
16 |   if (length(data) > 1L) {
17 |     tp <- purrr::map_chr(data, function(x)
18 |       purrr::pluck(x, "@type", .default = NA_character_))
19 | 
20 |     data <- purrr::pluck(data, which(tp == "NewsArticle"), .default = NA)
21 |   }
22 | 
23 |   if (!isTRUE(is.na(data))) {
24 | 
25 |     # datetime
26 |     datetime <- data$datePublished %>%
27 |       lubridate::as_datetime()
28 | 
29 |     # headline
30 |     headline <- data$headline
31 | 
32 |     # author
33 |     author <- data$author$name %>%
34 |       toString()
35 | 
36 |     # text
37 |     text <- html %>%
38 |       rvest::html_elements(".sdc-article-body p") %>%
39 |       rvest::html_text2() %>%
40 |       paste(collapse = "\n")
41 | 
42 |     cover_image_url <- purrr::pluck(data$image, "url", .default = NA)
43 | 
44 |     type <- data$`@type`
45 | 
46 |     s_n_list(
47 |       datetime,
48 |       author,
49 |       headline,
50 |       text,
51 |       type,
52 |       cover_image_url
53 |     )
54 |   } else {
55 |     s_n_list()
56 |   }
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/R/deliver_hn_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.hn_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(charToRaw(enc2utf8(x$content_raw)))
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_element("[property=\"article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   # headline
15 |   headline <- html %>%
16 |     rvest::html_element("title") %>%
17 |     rvest::html_text2()
18 | 
19 |   # author
20 |   author <- html %>%
21 |     rvest::html_elements("[name=\"author\"]")  %>%
22 |     rvest::html_attr("content") %>%
23 |     toString()
24 | 
25 |   # text
26 |   text <- html %>%
27 |     rvest::html_elements(".article-content p") %>%
28 |     rvest::html_text2() %>%
29 |     paste(collapse = "\n")
30 | 
31 |   paywall <- FALSE
32 |   if (length(rvest::html_element(html, ".paywall"))) {
33 |     text <- paste("[Paywall-Truncated]", text)
34 |     paywall <- TRUE
35 |   }
36 | 
37 |   cover_image_html <- html %>%
38 |     rvest::html_element(".article-image-wrapper img") %>%
39 |     as.character()
40 | 
41 |   cover_image_url <- html %>%
42 |     rvest::html_element(".article-image-wrapper img") %>%
43 |     rvest::html_attr("src")
44 | 
45 |   s_n_list(
46 |     datetime,
47 |     author,
48 |     headline,
49 |     text,
50 |     paywall,
51 |     cover_image_url,
52 |     cover_image_html
53 |   )
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/tests/testthat/test-deliver.R:
--------------------------------------------------------------------------------
 1 | test_that("Test infrascture", {
 2 |   expect_message(
 3 |     pb_deliver("google.com", verbose = TRUE),
 4 |     "No parser for domain"
 5 |   )
 6 |   # only warn first time
 7 |   expect_no_message(
 8 |     pb_deliver("google.com", verbose = FALSE)
 9 |   )
10 |   # still warn with new site
11 |   expect_message(
12 |     pb_deliver("duckduckgo.com/", verbose = TRUE),
13 |     "No parser for domain"
14 |   )
15 |   expect_equal(
16 |     nrow(pb_deliver("duckduckgo.com/", try_default = FALSE)),
17 |     0L
18 |   )
19 |   expect_error(
20 |     pb_deliver(list("google.com"), verbose = FALSE),
21 |     "No method for class list."
22 |   )
23 |   expect_error(
24 |     pb_deliver(data.frame(test = "google.com"), verbose = FALSE),
25 |     "must be a character vector of URLs"
26 |   )
27 |   expect_message(
28 |     pb_deliver(pb_collect("https://httpbin.org/status/404", verbose = FALSE)),
29 |     "1 URL removed due to bad status."
30 |   )
31 | })
32 | 
# Scrapes one shortened URL and checks class and dimensions of the result.
test_that("Test theguardian scraper", {
  skip_if_offline()
  out <- pb_deliver("https://tinyurl.com/386e98k5", verbose = FALSE)
  # class, column count and row count are combined into one character vector
  result_shape <- c(class(out), ncol(out), nrow(out))
  expect_equal(
    result_shape,
    c("tbl_df", "tbl", "data.frame", "9", "1")
  )
})
40 | 
# Scrapes one shortened URL and checks class and dimensions of the result.
test_that("Test huffpost scraper", {
  skip_if_offline()
  out <- pb_deliver("https://tinyurl.com/4shbwkxs", verbose = FALSE)
  # class, column count and row count are combined into one character vector
  result_shape <- c(class(out), ncol(out), nrow(out))
  expect_equal(
    result_shape,
    c("tbl_df", "tbl", "data.frame", "9", "1")
  )
})
48 | 
49 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |   release:
 8 |     types: [published]
 9 |   workflow_dispatch:
10 | 
11 | name: pkgdown.yaml
12 | 
13 | permissions: read-all
14 | 
15 | jobs:
16 |   pkgdown:
17 |     runs-on: ubuntu-latest
18 |     # Only restrict concurrency for non-PR jobs
19 |     concurrency:
20 |       group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 |     env:
22 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 |     permissions:
24 |       contents: write
25 |     steps:
26 |       - uses: actions/checkout@v4
27 | 
28 |       - uses: r-lib/actions/setup-pandoc@v2
29 | 
30 |       - uses: r-lib/actions/setup-r@v2
31 |         with:
32 |           use-public-rspm: true
33 | 
34 |       - uses: r-lib/actions/setup-r-dependencies@v2
35 |         with:
36 |           extra-packages: any::pkgdown, local::.
37 |           needs: website
38 | 
39 |       - name: Build site
40 |         run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 |         shell: Rscript {0}
42 | 
43 |       - name: Deploy to GitHub pages 🚀
44 |         if: github.event_name != 'pull_request'
45 |         uses: JamesIves/github-pages-deploy-action@v4.5.0
46 |         with:
47 |           clean: false
48 |           branch: gh-pages
49 |           folder: docs
50 | 


--------------------------------------------------------------------------------
/R/deliver_rtl_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 8 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
 9 |         return(s_n_list())
10 |     } else {
11 |         json_df <- jsonlite::fromJSON(json_txt[2])
12 |         if (any(json_df$`@type` %in% c("VideoGame"))) {
13 |             return(s_n_list())
14 |         }
15 |         if (json_df$`@type` != "VideoObject") { # NewsArticle
16 |             datetime <- lubridate::as_datetime(json_df$datePublished)
17 |             headline <- json_df$headline
18 |             author <- toString(json_df$author$name)
19 |             text <- html %>%
20 |                 rvest::html_elements(".article-body .LeadText_lead__rfwFU,.article-body .AnnotatedMarkup_paragraph__IUT9l") %>%
21 |                 rvest::html_text2() %>%
22 |                 paste(collapse = "\n")
23 |         } else {
24 |             datetime <- lubridate::as_datetime(json_df$uploadDate)
25 |             headline <- json_df$name
26 |             author <- ""
27 |             text <- json_df$transcript # for video objects, use transcript as text
28 |         }
29 | 
30 |         s_n_list(
31 |             datetime,
32 |             author,
33 |             headline,
34 |             text
35 |         )
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/R/deliver_ceskatelevize_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.ceskatelevize_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_elements("[type=\"application/json\"]") %>%
11 |     rvest::html_text() %>%
12 |     extract("(?<=\"startsAt\":\")\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}") %>%
13 |     lubridate::as_datetime()
14 | 
15 |   # headline
16 |   headline <- html %>%
17 |     rvest::html_element("[property=\"og:title\"]") %>%
18 |     rvest::html_attr("content")
19 | 
20 |   # author
21 |   author <- html %>%
22 |     rvest::html_element("[name=\"author\"]")  %>%
23 |     rvest::html_attr("content") %>%
24 |     toString()
25 | 
26 |   if (author == "NA") {
27 |     author <- html %>%
28 |       rvest::html_element(".article-meta__authors")  %>%
29 |       rvest::html_text() %>%
30 |       trimws()
31 |   }
32 | 
33 |   # text
34 |   text <- html %>%
35 |     rvest::html_elements(".article__content p") %>%
36 |     rvest::html_text2() %>%
37 |     paste(collapse = "\n")
38 | 
39 |   cover_image_html <- html %>%
40 |     rvest::html_element("main img") %>%
41 |     as.character()
42 | 
43 |   cover_image_url <- html %>%
44 |     rvest::html_element("main img") %>%
45 |     rvest::html_attr("src")
46 | 
47 |   s_n_list(
48 |     datetime,
49 |     author,
50 |     headline,
51 |     text,
52 |     cover_image_url,
53 |     cover_image_html
54 |   )
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/R/deliver_prosieben_de.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) {
 3 |     pb_tick(x, verbose, pb)
 4 |     # raw html is stored in column content_raw
 5 |     html <- rvest::read_html(x$content_raw)
 6 | 
 7 |     json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text()
 8 |     if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
 9 |         return(s_n_list())
10 |     } else {
11 |         if (length(json_txt) == 2) {
12 |             json_txt <- json_txt[2]
13 |         }
14 |         json_df <- jsonlite::fromJSON(json_txt)
15 |         if (json_df$`@type` != "VideoObject" && json_df$`@type` != "FAQPage") { # NewsArticle
16 |             datetime <- lubridate::as_datetime(json_df$datePublished)
17 |             headline <- json_df$headline
18 |             author <- toString(json_df$author$name)
19 |             text <- html %>%
20 |                 rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>%
21 |                 rvest::html_text2() %>%
22 |                 paste(collapse = "\n")
23 |         } else if (json_df$`@type` != "FAQPage") {
24 |             return(s_n_list())
25 |         } else {
26 |             datetime <- lubridate::as_datetime(json_df$uploadDate)
27 |             headline <- json_df$name
28 |             author <- ""
29 |             text <- json_df$description # for video objects, use description as text
30 |         }
31 | 
32 |         s_n_list(
33 |             datetime,
34 |             author,
35 |             headline,
36 |             text
37 |         )
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/R/deliver_lidovky_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.lidovky_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(charToRaw(enc2utf8(x$content_raw)))
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_element("[property=\"article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   # headline
15 |   headline <- html %>%
16 |     rvest::html_element("[itemprop=\"name headline\"]") %>%
17 |     rvest::html_text2()
18 | 
19 |   # author
20 |   author <- html %>%
21 |     rvest::html_element("[itemprop=\"author\"] span")  %>%
22 |     rvest::html_text2() %>%
23 |     toString()
24 | 
25 |   # text
26 |   text <- html %>%
27 |     rvest::html_elements("[itemprop=\"articleBody\"] p,.opener") %>%
28 |     rvest::html_text2() %>%
29 |     trimws() %>%
30 |     paste(collapse = "\n")
31 | 
32 |   paywall <- FALSE
33 |   if (length(rvest::html_element(html, "#paywall"))) {
34 |     text <- paste("[Paywall-Truncated]", text)
35 |     paywall <- TRUE
36 |   }
37 | 
38 |   cover_image_html <- html %>%
39 |     rvest::html_element(".opener-foto img,.opener-flv-player img") %>%
40 |     as.character()
41 | 
42 |   cover_image_url <- html %>%
43 |     rvest::html_element(".opener-foto img,.opener-flv-player img") %>%
44 |     rvest::html_attr("src") %>%
45 |     paste0("https:", .)
46 | 
47 |   s_n_list(
48 |     datetime,
49 |     author,
50 |     headline,
51 |     text,
52 |     paywall,
53 |     cover_image_url,
54 |     cover_image_html
55 |   )
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/R/deliver_parlamentnilisty_cz.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.parlamentnilisty_cz <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(charToRaw(enc2utf8(x$content_raw)))
 7 | 
 8 |   # data about the article is nicely stored in a json string
 9 |   data <- html %>%
10 |     rvest::html_elements("[type=\"application/ld+json\"]") %>%
11 |     rvest::html_text() %>%
12 |     gsub("[\r\n]", "", .) %>% # sometimes uses illegal line breaks
13 |     lapply(jsonlite::fromJSON, simplifyVector = FALSE)
14 | 
15 |   # usually there are more than one,
16 |   if (length(data) > 0L) {
17 |     tp <- purrr::map_chr(data, function(x)
18 |       purrr::pluck(x, "@type", .default = NA_character_))
19 | 
20 |     data <- purrr::pluck(data, which(tp == "NewsArticle"))
21 |   }
22 | 
23 |   # datetime
24 |   datetime <- data$datePublished %>%
25 |     lubridate::as_datetime()
26 | 
27 |   # headline
28 |   headline <- data$headline
29 | 
30 |   # author
31 |   author <- purrr::pluck(data$author, "name", .default = NA_character_) %>%
32 |     toString()
33 | 
34 |   # text
35 |   text <- html %>%
36 |     rvest::html_elements("article .article-content>p,article .brief") %>%
37 |     rvest::html_elements(":not(style)") %>%
38 |     rvest::html_text2() %>%
39 |     paste(collapse = "\n")
40 | 
41 |   cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_)
42 |   if (!is.na(cover_image_url)) {
43 |     cover_image_url <- gsub("amp;", "", cover_image_url, fixed = TRUE)
44 |   }
45 | 
46 |   s_n_list(
47 |     datetime,
48 |     author,
49 |     headline,
50 |     text,
51 |     cover_image_url
52 |   )
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/R/deliver_huffpost_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.huffpost_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_elements("[property=\"article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   # headline
15 |   headline <- html %>%
16 |     rvest::html_elements(".headline__title,.headline__subtitle,.js-headline,.headline") %>%
17 |     rvest::html_text() %>%
18 |     paste0(collapse = ". ")
19 | 
20 |   # author
21 |   author <- html %>%
22 |     rvest::html_element(".author-card__name,.wire-byline,.entry-wirepartner__byline") %>%
23 |     rvest::html_text() %>%
24 |     gsub("^By\\b\\s+", "", .)
25 | 
26 |   # text
27 |   text <- html %>%
28 |     rvest::html_elements("p,.entry-video__content__description") %>%
29 |     rvest::html_text2() %>%
30 |     paste(collapse = "\n")
31 | 
32 |   type <- html %>%
33 |     rvest::html_elements("article") %>%
34 |     rvest::html_attrs() %>%
35 |     .[[1]]
36 | 
37 |   content_type <- dplyr::case_when(
38 |     "article" %in% type ~ "article",
39 |     "entry-video" %in% type ~ "video",
40 |     TRUE ~ "unknown"
41 |   )
42 | 
43 | 
44 |   # the helper function safely creates a named list from objects
45 |   s_n_list(
46 |     datetime,
47 |     author,
48 |     headline,
49 |     text,
50 |     content_type
51 |   )
52 | 
53 | }
54 | 
55 | 
# These domains use the same page layout, so they share the huffpost.com
# parser.
pb_deliver_paper.huffingtonpost_co_uk <- pb_deliver_paper.huffpost_com
pb_deliver_paper.huffingtonpost_com <- pb_deliver_paper.huffingtonpost_co_uk
60 | 


--------------------------------------------------------------------------------
/R/deliver_telegraaf_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.telegraaf_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   data <- html %>%
 9 |     rvest::html_elements("[data-name=\"PageTracking\"]") %>%
10 |     rvest::html_text2() %>%
11 |     jsonlite::fromJSON()
12 | 
13 |   type <- purrr::pluck(data, "article", "type")
14 |   paywall <- purrr::pluck(data, "article", "premium")
15 | 
16 |   # datetime
17 |   datetime <- purrr::pluck(data, "article", "publishDate") %>%
18 |     lubridate::as_datetime()
19 | 
20 |   # headline
21 |   headline <- purrr::pluck(data, "article", "title")
22 | 
23 |   # author
24 |   author <- purrr::pluck(data, "article", "author", .default = NA_character_)
25 | 
26 |   # text
27 |   if (type == "normal") {
28 |     text <- html %>%
29 |       rvest::html_elements(".Article__intro,.DetailBodyBlocks p") %>%
30 |       rvest::html_text2() %>%
31 |       paste(collapse = "\n")
32 |   } else {
33 |     text <- paste0("[", type, "]")
34 |   }
35 | 
36 |   cover_image_html <- html %>%
37 |     rvest::html_element(".DetailArticleImage img") %>%
38 |     as.character()
39 | 
40 |   cover_image_url <- html %>%
41 |     rvest::html_element(".DetailArticleImage img") %>%
42 |     rvest::html_attr("src")
43 | 
44 |   if (!is.na(cover_image_url))
45 |     cover_image_url <- paste0("https://www.telegraaf.nl", cover_image_url)
46 | 
47 |   # the helper function safely creates a named list from objects
48 |   s_n_list(
49 |     datetime,
50 |     author,
51 |     headline,
52 |     text,
53 |     type,
54 |     paywall,
55 |     cover_image_url,
56 |     cover_image_html
57 |   )
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/R/deliver_nytimes_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nytimes_com <- function(x, verbose, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     html_search(selectors = c(
11 |       "[property=\"article:published_time\"]"
12 |     ), attributes = "content") %>%
13 |     lubridate::as_datetime()
14 | 
15 |   # author
16 |   author <- html %>%
17 |     rvest::html_elements("[name=\"byl\"]")  %>%
18 |     rvest::html_attr("content") %>%
19 |     toString() %>%
20 |     gsub("By ", "", ., fixed = TRUE) %>%
21 |     unique() %>%
22 |     toString()
23 | 
24 |   if (!isFALSE(is.na(datetime))) {
25 |     datetime <- html %>%
26 |       rvest::html_elements("[slot=\"data\"],script") %>%
27 |       rvest::html_text() %>%
28 |       extract("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z") %>%
29 |       unique() %>%
30 |       lubridate::as_datetime() %>%
31 |       utils::head(1L)
32 |   }
33 | 
34 |   # headline
35 |   headline <- html %>%
36 |     rvest::html_elements("[property=\"og:title\"]") %>%
37 |     rvest::html_attr("content")
38 | 
39 |   # text
40 |   text_temp <- html %>%
41 |     rvest::html_elements("[name=\"articleBody\"]")
42 | 
43 |   if (length(text_temp) > 0) {
44 |     text <- text_temp %>%
45 |       rvest::html_elements("p") %>%
46 |       rvest::html_text2() %>%
47 |       paste(collapse = "\n")
48 |   } else {
49 |     text <- html %>%
50 |       rvest::html_elements("p") %>%
51 |       rvest::html_text2() %>%
52 |       paste(collapse = "\n")
53 |   }
54 | 
55 |   # the helper function safely creates a named list from objects
56 |   s_n_list(
57 |     datetime,
58 |     author,
59 |     headline,
60 |     text
61 |   )
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/R/deliver_nrc_nl.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.nrc_nl <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_element("time") %>%
11 |     rvest::html_attr("datetime") %>%
12 |     lubridate::as_datetime()
13 | 
14 |   type <- NULL
15 |   if (is.na(datetime)) {
16 |     datetime <- html %>%
17 |       rvest::html_element(".artikel") %>%
18 |       rvest::html_attr("data-article-updated-at") %>%
19 |       lubridate::as_datetime()
20 | 
21 |     type <- html %>%
22 |       rvest::html_element(".artikel") %>%
23 |       rvest::html_attr("data-article-type")
24 |   }
25 | 
26 |   # headline
27 |   headline <- html %>%
28 |     rvest::html_element("[property=\"og:title\"]") %>%
29 |     rvest::html_attr("content")
30 | 
31 |   if (!is.null(type)) headline <- paste0("[", type, "] ", headline)
32 | 
33 |   # author
34 |   author <- html %>%
35 |     rvest::html_elements("[rel=\"author\"],.authors")  %>%
36 |     rvest::html_text2() %>%
37 |     toString()
38 | 
39 |   # text
40 |   text <- html %>%
41 |     rvest::html_elements(".article__content>p,.article__content>.bericht>p,.podcast-content,.vorm__article-content>p") %>%
42 |     rvest::html_text2() %>%
43 |     paste(collapse = "\n")
44 | 
45 |   cover_image_html <- html %>%
46 |     rvest::html_element("picture img") %>%
47 |     as.character()
48 | 
49 |   cover_image_url <- html %>%
50 |     rvest::html_element("picture img") %>%
51 |     rvest::html_attr("src")
52 | 
53 |   # the helper function safely creates a named list from objects
54 |   s_n_list(
55 |     datetime,
56 |     author,
57 |     headline,
58 |     text,
59 |     cover_image_url,
60 |     cover_image_html
61 |   )
62 | 
63 | }
64 | 


--------------------------------------------------------------------------------
/tests/testthat/test-misc.R:
--------------------------------------------------------------------------------
 1 | test_that("normalise_df works", {
 2 |   expect_equal(
 3 |     names(normalise_df(data.frame(test = TRUE))),
 4 |     c("url", "expanded_url", "domain", "status", "datetime", "author",
 5 |       "headline", "text", "misc")
 6 |   )
 7 |   expect_equal({
 8 |     out <- normalise_df(list(
 9 |       tibble::tibble(url = "test.com/1", test = TRUE),
10 |       tibble::tibble(url = "test.com/2", test = list(c(TRUE, FALSE)))
11 |     ))
12 |     purrr::map(out$misc, "test")
13 |   }, list(list(TRUE), list(c(TRUE, FALSE))))
14 | })
15 | 
# pb_available() should return a character vector with more than ten entries.
test_that("pb_available works", {
  out <- pb_available()
  # class and the length check are combined into one character vector
  summary_vec <- c(class(out), length(out) > 10)
  expect_equal(summary_vec, c("character", "TRUE"))
})
24 | 
# s_n_list() should return a one-row tibble named after its arguments, turn
# empty inputs into NA and wrap inputs with several values in a list column.
test_that("Test safe named list making", {
  # scalar inputs plus one empty input
  text <- "hello world"
  author <- "Max Mustermann"
  headline <- "lorem ipsum"
  datetime <- character()

  expect_equal(
    paperboy:::s_n_list(
      text,
      author,
      headline,
      datetime
    ),
    tibble::tibble(text = "hello world",
                   author = "Max Mustermann",
                   headline = "lorem ipsum",
                   datetime = NA)
  )

  # same inputs, but now with two authors
  author <- c("Max Mustermann", "Erika Mustermann")

  expect_equal(
    paperboy:::s_n_list(
      text,
      author,
      headline,
      datetime
    ),
    tibble::tibble(text = "hello world",
                   author = list(c("Max Mustermann", "Erika Mustermann")),
                   headline = "lorem ipsum",
                   datetime = NA)
  )
})
63 | 


--------------------------------------------------------------------------------
/R/deliver_cnn_com.R:
--------------------------------------------------------------------------------
 1 | #' @export
 2 | pb_deliver_paper.cnn_com <- function(x, verbose = NULL, pb, ...) {
 3 | 
 4 |   pb_tick(x, verbose, pb)
 5 |   # raw html is stored in column content_raw
 6 |   html <- rvest::read_html(x$content_raw)
 7 | 
 8 |   # datetime
 9 |   datetime <- html %>%
10 |     rvest::html_elements("[name=\"pubdate\"],[name=\"parsely-pub-date\"],[property=\"article:published_time\"]") %>%
11 |     rvest::html_attr("content") %>%
12 |     lubridate::as_datetime() %>%
13 |     utils::head(1L)
14 | 
15 |   # headline
16 |   headline <- html %>%
17 |     rvest::html_elements(".pg-headline,.headline>h1,[id*=\"video-headline\"],.headline__text,.PageHead__title,.Article__title") %>%
18 |     rvest::html_text2()
19 | 
20 |   # author
21 |   author <- html %>%
22 |     html_search(c(".Authors__writer", "[name=\"author\"]", ".byline__names"),
23 |                 c("text", "content")) %>%
24 |     toString() %>%
25 |     gsub("^By\\s", "", .)
26 | 
27 |   # text
28 |   text <- html %>%
29 |     rvest::html_elements(".article__content p:not(.editor-note),.zn-body-text,article,.article__main,BasicArticle__paragraph,[class^=\"Paragraph\"]") %>%
30 |     rvest::html_text2() %>%
31 |     paste(collapse = "\n")
32 | 
33 |   # type
34 |   content_type <- html %>%
35 |     rvest::html_element("[property=\"og:title\"]") %>%
36 |     rvest::html_attr("content") %>%
37 |     toString() %>%
38 |     {
39 |       x <- .
40 |       dplyr::case_when(
41 |         grepl("Live", x, ignore.case = TRUE) ~ "live",
42 |         grepl("Video", x, ignore.case = TRUE) ~ "video",
43 |         TRUE ~ "article"
44 |       )
45 |     }
46 | 
47 |   s_n_list(
48 |     datetime,
49 |     author,
50 |     headline,
51 |     text,
52 |     content_type
53 |   )
54 | 
55 | }
56 | 
# The regional CNN domains are handled by the cnn.com parser.
pb_deliver_paper.edition_cnn_com <- pb_deliver_paper.cnn_com
pb_deliver_paper.us_cnn_com <- pb_deliver_paper.edition_cnn_com
60 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macos-latest,   r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest,   r: 'release'}
25 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v3
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 
51 |       - name: Test coverage
52 |         run: covr::codecov()
53 |         shell: Rscript {0}
54 | 
55 |       - name: Upload check results
56 |         if: failure()
57 |         uses: actions/upload-artifact@main
58 |         with:
59 |           name: ${{ runner.os }}-r${{ matrix.config.r }}-results
60 |           path: check
61 | 


--------------------------------------------------------------------------------