├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── collect.R ├── deliver.R ├── deliver_3sat_de.R ├── deliver_abendblatt_de.R ├── deliver_abendzeitung_muenchen_de.R ├── deliver_ac24_cz.R ├── deliver_ad_nl.R ├── deliver_aktualne_cz.R ├── deliver_anotherangryvoice_blogspot_com.R ├── deliver_augsburger_allgemeine.R ├── deliver_badische_zeitung_de.R ├── deliver_bbc_co_uk.R ├── deliver_berliner_kurier_de.R ├── deliver_berliner_zeitung_de.R ├── deliver_bild_de.R ├── deliver_blesk_cz.R ├── deliver_bnn_de.R ├── deliver_br_de.R ├── deliver_breakingnews_ie.R ├── deliver_breitbart_com.R ├── deliver_businessinsider_de.R ├── deliver_buzzfeed_com.R ├── deliver_cbsnews_com.R ├── deliver_ceskatelevize_cz.R ├── deliver_cnet_com.R ├── deliver_cnn_com.R ├── deliver_dailymail_co_uk.R ├── deliver_default.R ├── deliver_denikn_cz.R ├── deliver_der_postillon_com.R ├── deliver_derstandard_at.R ├── deliver_derwesten_de.R ├── deliver_deutschlandfunk_de.R ├── deliver_deutschlandfunkkultur_de.R ├── deliver_dnn_de.R ├── deliver_echo24_de.R ├── deliver_epochtimes_de.R ├── deliver_evolvepolitics_com.R ├── deliver_express_de.R ├── deliver_faz_net.R ├── deliver_finanzen_net.R ├── deliver_fnp_de.R ├── deliver_focus_de.R ├── deliver_forbes_com.R ├── deliver_foxbusiness_com.R ├── deliver_fr_de.R ├── deliver_frankenpost_de.R ├── deliver_freiepresse_de.R ├── deliver_geenstijl_nl.R ├── deliver_handelsblatt_de.R ├── deliver_haz_de.R ├── deliver_heidelberg24_de.R ├── deliver_heise_de.R ├── deliver_hn_cz.R ├── deliver_hna_de.R ├── deliver_huffpost_com.R ├── deliver_idnes_cz.R ├── deliver_independent_co_uk.R ├── deliver_independent_ie.R ├── deliver_infranken_de.R ├── deliver_irishexaminer_com.R ├── deliver_irishmirror_ie.R ├── deliver_irishtimes_com.R ├── deliver_irozhlas_cz.R ├── deliver_joe_ie.R ├── deliver_jungefreiheit_de.R ├── deliver_kabeleins_de.R ├── deliver_karlsruhe_insider_de.R ├── deliver_kreiszeitung_de.R ├── 
deliver_ksta_de.R ├── deliver_kurier_at.R ├── deliver_latimes_com.R ├── deliver_lidovky_cz.R ├── deliver_lvz_de.R ├── deliver_manager_magazin_de.R ├── deliver_marketwatch_com.R ├── deliver_maz_online_de.R ├── deliver_mdr_de.R ├── deliver_mediacourant_nl.R ├── deliver_merkur_de.R ├── deliver_metronieuws_nl.R ├── deliver_mopo_de.R ├── deliver_morgenpost_de.R ├── deliver_n-tv_de.R ├── deliver_ndr_de.R ├── deliver_news_de.R ├── deliver_news_und_nachrichten_de.R ├── deliver_newsflash24_de.R ├── deliver_newstatesman_com.R ├── deliver_newsweek_com.R ├── deliver_nordkurier_de.R ├── deliver_nos_nl.R ├── deliver_novinky_cz.R ├── deliver_noz_de.R ├── deliver_nrc_nl.R ├── deliver_nu_nl.R ├── deliver_nw_de.R ├── deliver_nypost_com.R ├── deliver_nytimes_com.R ├── deliver_nzz_ch.R ├── deliver_orf_at.R ├── deliver_ostsee_zeitung_de.R ├── deliver_parlamentnilisty_cz.R ├── deliver_presseportal_de.R ├── deliver_prosieben_de.R ├── deliver_rbb24_de.R ├── deliver_rnd_de.R ├── deliver_rollingstone_de.R ├── deliver_rp_online_de.R ├── deliver_rte_ie.R ├── deliver_rtl_de.R ├── deliver_rtl_nl.R ├── deliver_ruhr24_de.R ├── deliver_ruhrnachrichten_de.R ├── deliver_saechsische_de.R ├── deliver_schwaebische_de.R ├── deliver_seznamzpravy_cz.R ├── deliver_sfgate_com.R ├── deliver_shz_de.R ├── deliver_skwawkbox_org.R ├── deliver_sky_com.R ├── deliver_spiegel_de.R ├── deliver_srf_ch.R ├── deliver_stern_de.R ├── deliver_stuttgarter_zeitung_de.R ├── deliver_sueddeutsche_de.R ├── deliver_suedkurier_de.R ├── deliver_swp_de.R ├── deliver_swr3_de.R ├── deliver_swr_de.R ├── deliver_swrfernsehen_de.R ├── deliver_t3n_de.R ├── deliver_t_online_de.R ├── deliver_tag24_de.R ├── deliver_tagesschau_de.R ├── deliver_tagesspiegel_de.R ├── deliver_taz_de.R ├── deliver_techrepublic_com.R ├── deliver_telegraaf_nl.R ├── deliver_telegraph_co_uk.R ├── deliver_thecanary_co.R ├── deliver_theguardian_com.R ├── deliver_thejournal_ie.R ├── deliver_thesun_ie.R ├── deliver_thueringer_allgemeine_de.R ├── deliver_tz_de.R ├── 
deliver_usatoday_com.R ├── deliver_vice_com.R ├── deliver_volkskrant_nl.R ├── deliver_volksstimme_de.R ├── deliver_vox_de.R ├── deliver_wa_de.R ├── deliver_washingtonpost_com.R ├── deliver_watson_ch.R ├── deliver_watson_de.R ├── deliver_waz_de.R ├── deliver_wdr_de.R ├── deliver_welt_de.R ├── deliver_wiwo_de.R ├── deliver_wsj_com.R ├── deliver_wz_de.R ├── deliver_yahoo_com.R ├── deliver_zdf_de.R ├── deliver_zeit_de.R ├── html_search.R ├── inspect.R ├── read_cookies.R ├── rss.r ├── sysdata.rda ├── utils.R ├── utils_dev.R └── zzz.R ├── README.Rmd ├── README.md ├── codecov.yml ├── inst ├── CITATION ├── WORDLIST ├── status.csv └── templates │ └── deliver_.R ├── man ├── figures │ └── logo.svg ├── html_search.Rd ├── pb_available.Rd ├── pb_collect.Rd ├── pb_collect_rss.Rd ├── pb_deliver.Rd ├── pb_deliver_paper.Rd ├── pb_find_rss.Rd ├── pb_inspect.Rd ├── pb_new.Rd ├── pb_read_cookies.Rd ├── reexports.Rd ├── test_parser.Rd └── use_new_parser.Rd ├── paperboy.Rproj ├── submit2cran.r ├── tests ├── benchmarks │ └── performance.r ├── spelling.R ├── testthat.R └── testthat │ ├── test-collect.R │ ├── test-deliver.R │ ├── test-misc.R │ ├── test-parser.R │ └── test-rss.R └── vignettes ├── .gitignore ├── For_Developers.Rmd └── inspect.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | /tests/local-files 5 | ^\.github$ 6 | ^codecov\.yml$ 7 | ^submit2cran\.r$ 8 | ^test_data\.rds$ 9 | ^doc$ 10 | ^Meta$ 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # 
Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | 51 | - name: Test coverage 52 | run: covr::codecov() 53 | shell: Rscript {0} 54 | 55 | - name: Upload check results 56 | if: failure() 57 | uses: actions/upload-artifact@main 58 | with: 59 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 60 | path: check 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .Rdata 4 | .httr-oauth 5 | .DS_Store 6 | tests/spelling.Rout.save 7 | tests/local-files 8 | Update_package.R 9 | test_data.rds 10 | /doc/ 11 | /Meta/ 12 | -------------------------------------------------------------------------------- /DESCRIPTION: 
-------------------------------------------------------------------------------- 1 | Package: paperboy 2 | Title: Comprehensive Collection of News Media Scrapers 3 | Version: 0.0.7.9000 4 | Date: 2024-12-26 5 | Authors@R: 6 | c(person(given = "Johannes B.", 7 | family = "Gruber", 8 | email = "JohannesB.Gruber@gmail.com", 9 | role = c("aut", "cre"), 10 | comment = c(ORCID = "0000-0001-9177-1772")), 11 | person(given = "David", 12 | family = "Schoch", 13 | email = "david@schochastics.net", 14 | role = "ctb", 15 | comment = c(ORCID = "0000-0003-2952-4812"))) 16 | Description: A comprehensive collection of webscraping scripts for news media sites. 17 | Depends: 18 | R (>= 3.5.0) 19 | License: GPL-3 20 | Imports: 21 | adaR, 22 | callr, 23 | cli, 24 | cookiemonster, 25 | curl, 26 | dplyr, 27 | jsonlite, 28 | lubridate, 29 | magrittr, 30 | methods, 31 | praise, 32 | purrr, 33 | rlang, 34 | rvest, 35 | tibble, 36 | tidyr, 37 | tidyselect, 38 | utils, 39 | xml2 40 | Suggests: 41 | covr, 42 | httpuv, 43 | knitr, 44 | rmarkdown, 45 | rstudioapi, 46 | spelling, 47 | testthat, 48 | withr 49 | URL: https://github.com/JBGruber/paperboy 50 | Encoding: UTF-8 51 | BugReports: https://github.com/JBGruber/paperboy/issues 52 | RoxygenNote: 7.3.2 53 | VignetteBuilder: knitr 54 | Language: en-GB 55 |
--------------------------------------------------------------------------------
/R/deliver_3sat_de.R:
--------------------------------------------------------------------------------
# Parses one 3sat_de page.
#
# x        object whose `content_raw` element holds the raw HTML.
# verbose  forwarded to pb_tick().
# pb       forwarded to pb_tick().
# ...      not used in this method.
#
# Returns the result of s_n_list() with datetime, author, headline and text.
# No author information is available in the markup, so author is always "".
#' @export
pb_deliver_paper.3sat_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  datetime <- html %>%
    rvest::html_elements("time") %>%
    rvest::html_attr("datetime") %>%
    lubridate::as_datetime()

  headline <- html %>%
    rvest::html_elements(".main-content-details h2") %>%
    rvest::html_text()

  author <- "" # no author info found

  text <- html %>%
    rvest::html_elements(".o--post-long p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_abendblatt_de.R:
--------------------------------------------------------------------------------
# Parses one abendblatt_de page.
#
# x        object whose `content_raw` element holds the raw HTML.
# verbose  forwarded to pb_tick().
# pb       forwarded to pb_tick().
# ...      not used in this method.
#
# Metadata (datetime, headline, author) is read from the SECOND
# application/ld+json script on the page; the text comes from the article
# body. When that second script is missing, the empty s_n_list() is returned
# instead of passing NA to jsonlite::fromJSON(), which would raise an error.
#' @export
pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- html %>%
    rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # the article metadata lives in the second JSON-LD block; bail out when
  # there is no such block (covers zero and one script elements)
  if (length(json_txt) < 2L || is.na(json_txt[2L])) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2L])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".article-body h3, .article-body p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
# rss feed includes pages that cannot be parsed because they are subpages
# rss feed also includes podcast, which cannot be parsed
--------------------------------------------------------------------------------
/R/deliver_abendzeitung_muenchen_de.R: 
-------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ...) { 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements(".artdetail_short ,.artdetail_text p,.artdetail_text h2") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_ac24_cz.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.ac24_cz <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | pb_tick(x, verbose, pb) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("[property=\"article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element("title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_element(".author") %>% 22 | rvest::html_text2() %>% 23 | toString() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".post-content p") %>% 28 | rvest::html_text2() %>% 29 | paste(collapse = "\n") 30 | 31 | 32 | cover_image_html <- html %>% 33 | rvest::html_element(".featured-image img") %>% 34 | as.character() 35 | 36 | cover_image_url <- html %>% 37 | rvest::html_element(".featured-image img") %>% 38 | rvest::html_attr("src") 39 | 40 | s_n_list( 41 | datetime, 42 | author, 43 | headline, 44 | text, 45 | cover_image_url, 46 | cover_image_html 47 | ) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /R/deliver_aktualne_cz.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.aktualne_cz <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("[property=\"article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element(".article-title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_elements(".author__name") %>% 22 | rvest::html_text2() %>% 23 | toString() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".article .article__perex,#article-content p") %>% 28 | rvest::html_text2() %>% 29 | paste(collapse = "\n") 30 | 31 | cover_image_html <- html %>% 32 | rvest::html_element(".article__photo--opener img") %>% 33 | as.character() 34 | 35 | cover_image_url <- html %>% 36 | rvest::html_element(".article__photo--opener img") %>% 37 | rvest::html_attr("src") 38 | 39 | if (!is.na(cover_image_url)) { 40 | cover_image_url <- paste0("https:", cover_image_url) 41 | } 42 | 43 | s_n_list( 44 | datetime, 45 | author, 46 | headline, 47 | text, 48 | cover_image_url, 49 | cover_image_html 50 | ) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R/deliver_anotherangryvoice_blogspot_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.anotherangryvoice_blogspot_com <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element(".published") %>% 11 | rvest::html_attr("title") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element(".entry-title") %>% 17 | rvest::html_text() %>% 18 | trimws() 19 | 20 | # author 21 | author <- html %>% 22 | rvest::html_element(".fn") %>% 23 | rvest::html_text2() %>% 24 | toString() 25 | 26 | # text 27 | text <- html %>% 28 | rvest::html_element(".entry-content") %>% 29 | rvest::html_text2() %>% 30 | paste(collapse = "\n") 31 | 32 | # in-text links 33 | text_links <- html %>% 34 | rvest::html_elements(".entry-content>span>a") %>% 35 | rvest::html_attr("href") %>% 36 | as.list() 37 | 38 | # the helper function safely creates a named list from objects 39 | s_n_list( 40 | datetime, 41 | author, 42 | headline, 43 | text, 44 | text_links 45 | ) 46 | 47 | } 48 | -------------------------------------------------------------------------------- /R/deliver_augsburger_allgemeine.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.augsburger_allgemeine_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | 8 | datetime <- html %>% 9 | rvest::html_element("time") %>% 10 | rvest::html_attr("datetime") %>% 11 | lubridate::as_datetime() 12 | headline <- html %>% 13 | rvest::html_element("h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife") %>% 14 | rvest::html_text() 15 | author <- html %>% 16 | rvest::html_elements("a.typo-author-link") %>% 17 | rvest::html_text2() %>% 18 | toString() 19 | text <- html %>% 20 | rvest::html_elements(".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs") %>% 21 | rvest::html_text2() %>% 22 | unique() %>% # teaser might be duplicated 23 | paste(collapse = "\n") 24 | 25 | s_n_list( 26 | datetime, 27 | author, 28 | headline, 29 | text 30 | ) 31 | } 32 | -------------------------------------------------------------------------------- /R/deliver_badische_zeitung_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author) 16 | text <- html %>% 17 | rvest::html_elements("section[role = \"article\"], .article-site__topic") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_bbc_co_uk.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.bbc_co_uk <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("time") %>% 11 | rvest::html_attr("datetime") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element("title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_element("[class*=\"TextContributorName\"]") %>% 22 | rvest::html_text2() %>% 23 | stats::na.omit() %>% 24 | toString() 25 | 26 | # text 27 | text <- html %>% 28 | rvest::html_elements("article [class*=\"RichText\"],article .story-body") %>% 29 | rvest::html_elements("p,li") %>% 30 | rvest::html_text2() %>% 31 | paste(collapse = "\n") 32 | 33 | cover_image_html <- html %>% 34 | rvest::html_element("picture img") %>% 35 | as.character() 36 | 37 | cover_image_url <- html %>% 38 | rvest::html_element("picture img") %>% 39 | rvest::html_attr("src") 40 | 41 | s_n_list( 42 | datetime, 43 | author, 44 | headline, 45 | text, 46 | cover_image_url, 47 | cover_image_html 48 | ) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /R/deliver_berliner_kurier_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() 8 | json_df <- jsonlite::fromJSON(json_txt) 9 | 10 | datetime <- lubridate::as_datetime(json_df$datePublished) 11 | headline <- json_df$headline 12 | author <- toString(json_df$author$name) 13 | text <- html %>% 14 | rvest::html_elements(".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu") %>% 15 | rvest::html_text2() %>% 16 | paste(collapse = "\n") 17 | 18 | s_n_list( 19 | datetime, 20 | author, 21 | headline, 22 | text 23 | ) 24 | } 25 | -------------------------------------------------------------------------------- /R/deliver_berliner_zeitung_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements(".article_paragraph__hXYKJ") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_bild_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.bild_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | datetime <- html %>% 8 | rvest::html_element("time") %>% 9 | rvest::html_attr("datetime") %>% 10 | lubridate::as_datetime() 11 | 12 | # headline 13 | headline <- html %>% 14 | rvest::html_elements(".document-title__headline") %>% 15 | rvest::html_text() 16 | 17 | # author 18 | author <- html %>% 19 | rvest::html_elements(".article_author") %>% 20 | rvest::html_text() %>% 21 | toString() 22 | 23 | # text 24 | text <- html %>% 25 | rvest::html_elements(".article-body") %>% 26 | rvest::html_text() %>% 27 | paste(collapse = "\n") 28 | 29 | # the helper function safely creates a named list from objects 30 | s_n_list( 31 | datetime, 32 | author, 33 | headline, 34 | text 35 | ) 36 | } 37 | -------------------------------------------------------------------------------- /R/deliver_blesk_cz.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.blesk_cz <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | html <- rvest::read_html(x$content_raw) 5 | pb_tick(x, verbose, pb) 6 | # raw html is stored in column content_raw 7 | 8 | # data about the article is nicely stored in a json string 9 | data <- html %>% 10 | rvest::html_elements("[type=\"application/ld+json\"]") %>% 11 | rvest::html_text2() %>% 12 | lapply(jsonlite::fromJSON) 13 | 14 | # usually there are more than one, 15 | if (length(data) > 1L) { 16 | tp <- purrr::map_chr(data, function(x) 17 | purrr::pluck(x, "@type", .default = NA_character_)) 18 | 19 | data <- purrr::pluck(data, which(tp == "NewsArticle")) 20 | } 21 | 22 | datetime <- data$datePublished %>% 23 | lubridate::ymd_hm() 24 | 25 | headline <- data$headline 26 | 27 | author <- data$author$name %>% 28 | toString() 29 | 30 | # text 31 | text <- html %>% 32 | rvest::html_elements("#article p,#article h2") %>% 33 | rvest::html_text2() %>% 34 | paste(collapse = "\n") 35 | 36 | cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_) 37 | 38 | s_n_list( 39 | datetime, 40 | author, 41 | headline, 42 | text, 43 | cover_image_url 44 | ) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /R/deliver_bnn_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | article <- grepl("\"NewsArticle\"", json_txt) 12 | if (!any(article)) { 13 | return(s_n_list()) 14 | } 15 | json_df <- jsonlite::fromJSON(json_txt[article]) 16 | 17 | datetime <- lubridate::as_datetime(json_df$datePublished) 18 | headline <- json_df$headline 19 | author <- toString(json_df$author$name) 20 | text <- html %>% 21 | rvest::html_elements(".intro,.article__body p,.article__body h2") %>% 22 | rvest::html_text2() %>% 23 | paste(collapse = "\n") 24 | 25 | s_n_list( 26 | datetime, 27 | author, 28 | headline, 29 | text 30 | ) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /R/deliver_br_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | datetime <- html %>% 8 | rvest::html_element("time") %>% 9 | rvest::html_attr("datetime") %>% 10 | lubridate::as_datetime() 11 | 12 | headline <- html %>% 13 | rvest::html_element(".heading1") %>% 14 | rvest::html_text2() 15 | 16 | author <- html %>% 17 | rvest::html_element(".ArticleModuleTeaser_authorName__Q7ctt") %>% 18 | rvest::html_text2() %>% 19 | toString() 20 | text <- html %>% 21 | rvest::html_element(".RichText_richText__wS9Rz.body3") %>% 22 | rvest::html_elements("p, h2") %>% 23 | rvest::html_text2() %>% 24 | paste(collapse = "\n") 25 | s_n_list( 26 | datetime, 27 | author, 28 | headline, 29 | text 30 | ) 31 | } 32 | -------------------------------------------------------------------------------- /R/deliver_breakingnews_ie.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.breakingnews_ie <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | # updates progress bar 5 | pb_tick(x, verbose, pb) 6 | 7 | # raw html is stored in column content_raw 8 | html <- rvest::read_html(x$content_raw) 9 | 10 | data <- html %>% 11 | rvest::html_element("script") %>% 12 | rvest::html_text2() 13 | 14 | if (!isTRUE(is.na(data))) { 15 | data <- jsonlite::fromJSON(data) 16 | # datetime 17 | datetime <- data$datePublished %>% 18 | lubridate::as_datetime() 19 | 20 | # headline 21 | headline <- data$headline 22 | 23 | # author 24 | author <- data$author$name %>% 25 | toString() 26 | 27 | # text 28 | text <- html %>% 29 | rvest::html_elements("article p") %>% 30 | rvest::html_text2() %>% 31 | paste(collapse = "\n") 32 | 33 | cover_image_url <- utils::head(data$image$url, 1L) 34 | 35 | type <- data$`@type` 36 | 37 | s_n_list( 38 | datetime, 39 | author, 40 | headline, 41 | text, 42 | type, 43 | cover_image_url 44 | ) 45 | } else { 46 | s_n_list() 47 | } 48 | 49 | } 50 | 51 | -------------------------------------------------------------------------------- /R/deliver_breitbart_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.breitbart_com <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("time") %>% 11 | rvest::html_attr("datetime") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element("title") %>% 17 | rvest::html_text() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_element("address") %>% 22 | rvest::html_text2() %>% 23 | toString() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".entry-content>p") %>% 28 | rvest::html_text2() %>% 29 | paste(collapse = "\n") 30 | 31 | # in-text links 32 | text_links <- html %>% 33 | rvest::html_elements(".entry-content>p>a") %>% 34 | rvest::html_attr("href") %>% 35 | as.list() 36 | 37 | # the helper function safely creates a named list from objects 38 | s_n_list( 39 | datetime, 40 | author, 41 | headline, 42 | text, 43 | text_links 44 | ) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /R/deliver_businessinsider_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | json_df <- json_df$`@graph` 13 | if (any(json_df$`@type` == "Person")) { 14 | author <- toString(json_df$name[json_df$`@type` == "Person"]) 15 | } else { 16 | author <- "" 17 | } 18 | json_df <- json_df[1, ] 19 | datetime <- lubridate::as_datetime(json_df$datePublished) 20 | headline <- json_df$headline 21 | text <- html %>% 22 | rvest::html_element(".article-main") %>% 23 | rvest::html_elements("p, h2") %>% 24 | rvest::html_text2() %>% 25 | paste(collapse = "\n") 26 | 27 | s_n_list( 28 | datetime, 29 | author, 30 | headline, 31 | text 32 | ) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /R/deliver_buzzfeed_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.buzzfeed_com <- function(x, verbose = NULL, pb, ...) 
{
  # Body of the buzzfeed.com parser: reads the stored page from x$content_raw
  # and hands datetime, author, headline, text and in-text link targets to
  # s_n_list().

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime: "datetime" attribute of the first <time> element
  datetime <- html %>%
    rvest::html_element("time") %>%
    rvest::html_attr("datetime") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_element("[class^=\"headline_title\"]") %>%
    rvest::html_text()

  # author
  author <- html %>%
    rvest::html_element("[class*=\"headline-byline_bylineName\"]") %>%
    rvest::html_text2() %>%
    toString()

  # text: paragraphs directly below .subbuzz-text, one per line
  text <- html %>%
    rvest::html_elements(".subbuzz-text>p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # in-text links: href of every <a> inside text blocks and embedded tweets
  text_links <- html %>%
    rvest::html_elements(".subbuzz-text,.tweet__container") %>%
    rvest::html_elements("a") %>%
    rvest::html_attr("href") %>%
    as.list()

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    text_links
  )

}
--------------------------------------------------------------------------------
/R/deliver_cbsnews_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.cbsnews_com <- function(x, verbose = NULL, pb, ...) {
  # Parses one cbsnews.com page stored in x$content_raw and returns datetime,
  # author, headline, text and content_type through s_n_list().

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime: a page can carry several <time> elements, only the first is kept
  datetime <- html %>%
    rvest::html_elements("time") %>%
    rvest::html_attr("datetime") %>%
    lubridate::as_datetime() %>%
    utils::head(1L)

  # headline
  headline <- html %>%
    rvest::html_elements("[property=\"og:title\"]") %>%
    rvest::html_attr("content")

  # author: byline text without the leading "By " and without line breaks
  author <- html %>%
    rvest::html_element("[class*=\"content__meta--byline\"]") %>%
    rvest::html_text() %>%
    gsub("By\\b\\s+|\n", "", .) %>%
    trimws()

  # text
  text <- html %>%
    rvest::html_elements(".content__body>p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # first path segment after the domain; the URL is left unchanged when the
  # pattern does not match
  content_type <- x$expanded_url %>%
    gsub(".*cbsnews.com/(.+?)/.*", "\\1", ., perl = TRUE)

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    content_type
  )

}
--------------------------------------------------------------------------------
/R/deliver_ceskatelevize_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.ceskatelevize_cz <- function(x, verbose = NULL, pb, ...) {
  # Parses one ceskatelevize.cz page stored in x$content_raw and returns
  # datetime, author, headline, text and the cover image (url and html).

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime: the "startsAt" timestamp embedded in the JSON script tags
  # NOTE(review): extract() is a helper defined elsewhere; presumably it
  # returns the substring matching the pattern - confirm
  datetime <- html %>%
    rvest::html_elements("[type=\"application/json\"]") %>%
    rvest::html_text() %>%
    extract("(?<=\"startsAt\":\")\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.\\d{3}") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_element("[property=\"og:title\"]") %>%
    rvest::html_attr("content")

  # author
  author <- html %>%
    rvest::html_element("[name=\"author\"]") %>%
    rvest::html_attr("content") %>%
    toString()

  # toString() turns a missing meta tag into the string "NA"; fall back to
  # the visible byline in that case
  if (author == "NA") {
    author <- html %>%
      rvest::html_element(".article-meta__authors") %>%
      rvest::html_text() %>%
      trimws()
  }

  # text
  text <- html %>%
    rvest::html_elements(".article__content p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  cover_image_html <- html %>%
    rvest::html_element("main img") %>%
    as.character()

  cover_image_url <- html %>%
    rvest::html_element("main img") %>%
    rvest::html_attr("src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
--------------------------------------------------------------------------------
/R/deliver_cnn_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.cnn_com <- function(x, verbose = NULL, pb, ...) {
  # Parses one cnn.com page stored in x$content_raw and returns datetime,
  # author, headline, text and content_type ("live", "video" or "article").

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime: several meta tags may carry the date, only the first is kept
  datetime <- html %>%
    rvest::html_elements("[name=\"pubdate\"],[name=\"parsely-pub-date\"],[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime() %>%
    utils::head(1L)

  # headline
  headline <- html %>%
    rvest::html_elements(".pg-headline,.headline>h1,[id*=\"video-headline\"],.headline__text,.PageHead__title,.Article__title") %>%
    rvest::html_text2()

  # author
  # NOTE(review): html_search() is defined elsewhere; it appears to try the
  # selectors with the given attributes - confirm
  author <- html %>%
    html_search(c(".Authors__writer", "[name=\"author\"]", ".byline__names"),
                c("text", "content")) %>%
    toString() %>%
    gsub("^By\\s", "", .)

  # text
  # "BasicArticle__paragraph" is a class like the other entries, so it needs
  # the leading dot; without it the selector only matches a tag of that name
  text <- html %>%
    rvest::html_elements(".article__content p:not(.editor-note),.zn-body-text,article,.article__main,.BasicArticle__paragraph,[class^=\"Paragraph\"]") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # type: derived from keywords in the og:title
  content_type <- html %>%
    rvest::html_element("[property=\"og:title\"]") %>%
    rvest::html_attr("content") %>%
    toString() %>%
    {
      x <- .
      dplyr::case_when(
        grepl("Live", x, ignore.case = TRUE) ~ "live",
        grepl("Video", x, ignore.case = TRUE) ~ "video",
        TRUE ~ "article"
      )
    }

  s_n_list(
    datetime,
    author,
    headline,
    text,
    content_type
  )

}

# the regional cnn domains reuse the cnn.com parser
pb_deliver_paper.us_cnn_com <-
  pb_deliver_paper.edition_cnn_com <-
  pb_deliver_paper.cnn_com
--------------------------------------------------------------------------------
/R/deliver_dailymail_co_uk.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.dailymail_co_uk <- function(x, verbose = NULL, pb, ...) {
  # Parses one dailymail.co.uk page stored in x$content_raw and returns
  # datetime, author, headline and text through s_n_list().

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)


  # datetime
  datetime <- html %>%
    rvest::html_elements("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_elements("[property =\"mol:headline\"]") %>%
    rvest::html_attr("content")

  # author
  author <- html %>%
    rvest::html_elements("[name =\"author\"]") %>%
    rvest::html_attr("content")

  # text: paragraphs of the article body, one per line
  text <- html %>%
    rvest::html_elements("[itemprop=\"articleBody\"]") %>%
    rvest::html_elements("p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )

}
--------------------------------------------------------------------------------
/R/deliver_denikn_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.denikn_cz <- function(x, verbose = NULL, pb, ...)
{
  # Body of the denikn.cz parser: extracts datetime, author, headline, text,
  # a paywall flag and the cover image from the page in x$content_raw.

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime: content of the article:published_time meta tag
  published <- rvest::html_element(page, "[property=\"article:published_time\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(published, "content"))

  # headline: the page <title>
  headline <- rvest::html_text2(rvest::html_element(page, "title"))

  # author
  author <- toString(
    rvest::html_text2(rvest::html_element(page, ".e_author_t"))
  )

  # text: every paragraph inside <article>, one per line
  text <- paste(
    rvest::html_text2(rvest::html_elements(page, "article p")),
    collapse = "\n"
  )

  # when the lock element is present the text is marked as truncated
  paywall <- length(rvest::html_element(page, ".e_lock__hard")) > 0
  if (paywall) {
    text <- paste("[Paywall-Truncated]", text)
  }

  # cover image: the same node supplies both the markup and the src url
  cover_node <- rvest::html_element(page, "header .b_single_i img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    paywall,
    cover_image_url,
    cover_image_html
  )
}
--------------------------------------------------------------------------------
/R/deliver_der_postillon_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...)
{
  # Body of the der-postillon.com parser: datetime, headline and author come
  # from the first JSON-LD script, the text from the post body. Returns an
  # empty s_n_list() when the page has no JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".post-body p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # author abbr can be found at the end of the article
  if (author == "Der Postillon") {
    author_tmp <- html %>%
      rvest::html_element("div[id='post-body'] span[style='font-size: x-small;']") %>%
      rvest::html_text() %>%
      sub("; Erstver.*$", "", .)
    # the span is optional: html_text() yields NA when it is missing, and a
    # bare `NA != ""` would abort the `if`; keep the JSON-LD author then
    if (!is.na(author_tmp) && nzchar(author_tmp)) {
      author <- author_tmp
    }
  }

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_derstandard_at.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.derstandard_at <- function(x, verbose = NULL, pb, ...)
{
  # Body of the derstandard.at parser: reads datetime, headline, author and
  # text from the article markup in x$content_raw.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime: parsed from the text of the .article-meta element(s)
  meta_text <- rvest::html_text(rvest::html_elements(page, ".article-meta"))
  datetime <- lubridate::as_datetime(meta_text)

  headline <- rvest::html_text(rvest::html_elements(page, "h1.article-title"))

  # all origin entries are joined into one comma separated string
  origins <- rvest::html_elements(page, ".article-origins")
  author <- toString(rvest::html_text(origins))

  # There is a note that parts of the website are blocked
  body_nodes <- rvest::html_elements(page, ".article-body p, .article-body h3")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_derwesten_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the derwesten.de parser: datetime and headline come from the
  # first @graph entry of the first JSON-LD script, author and text from the
  # page markup. Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes)
  if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  graph <- jsonlite::fromJSON(ld_json[1])$`@graph`
  article <- graph[1, ]

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline

  author_nodes <- rvest::html_elements(page, ".author.vcard .url.fn.n")
  author <- toString(rvest::html_text(author_nodes))

  text_nodes <- rvest::html_elements(page, ".lead p,.article-body p")
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_deutschlandfunk_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.deutschlandfunk_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the deutschlandfunk.de parser: reads datetime, headline and text
  # from the markup in x$content_raw; the author is a fixed site name.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime: "datetime" attribute of the first <time> element
  time_node <- rvest::html_element(page, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  headline <- rvest::html_text(rvest::html_element(page, ".headline-title"))

  author <- "deutschlandfunk.de" # could not find article with author

  text_nodes <- rvest::html_elements(
    page,
    ".article-header-description,.article-details-text:not(.u-text-italic),.article-details-title"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_deutschlandfunkkultur_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.deutschlandfunkkultur_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the deutschlandfunkkultur.de parser: reads datetime, headline,
  # author and text from the markup in x$content_raw.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime: "datetime" attribute of the first <time> element
  time_node <- rvest::html_element(page, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  headline_node <- rvest::html_element(
    page,
    ".headline-title,.section-article-head-area-title"
  )
  headline <- rvest::html_text(headline_node)

  author <- rvest::html_text(
    rvest::html_element(page, ".article-header-author")
  )

  text_nodes <- rvest::html_elements(
    page,
    ".section-article-head-area-description,.article-header-description,.article-details-text:not(.u-text-italic),.article-details-title"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_dnn_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.dnn_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the dnn.de parser: datetime, headline and author come from the
  # third JSON-LD script of the page, the text from the article wrapper.
  # Returns an empty s_n_list() when fewer than three scripts exist.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text2(ld_nodes)
  # the third script is read below, so at least three must be present
  if (length(ld_json) < 3 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(ld_json[3])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  text_nodes <- rvest::html_elements(
    page,
    ".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 p,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 h2"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_echo24_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the echo24.de parser: datetime, headline and author come from the
  # mainEntity of the first JSON-LD script, the text from the story elements.
  # Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes)
  if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(ld_json[1])$mainEntity

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  story_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_epochtimes_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the epochtimes.de parser: all fields, including the article text,
  # are read from the second JSON-LD script of the page.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # the second script is read below, so a page with fewer than two scripts
  # has nothing to parse (json_txt[2] would be NA and fromJSON() would fail)
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[2])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- json_df$articleBody

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_evolvepolitics_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.evolvepolitics_com <- function(x, verbose = NULL, pb, ...) {
  # Parses one evolvepolitics.com page stored in x$content_raw and returns
  # datetime, author, headline, text and in-text link targets.

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime
  datetime <- html %>%
    rvest::html_element(".entry-date") %>%
    rvest::html_attr("datetime") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_element(".tdb-title-text") %>%
    rvest::html_text()

  # author
  author <- html %>%
    rvest::html_element(".tdb-author-name") %>%
    rvest::html_text2() %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements(".tdb-block-inner") %>%
    rvest::html_elements("p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # in-text links: only anchors that are direct children of a paragraph
  text_links <- html %>%
    rvest::html_elements(".tdb-block-inner") %>%
    rvest::html_elements("p>a") %>%
    rvest::html_attr("href") %>%
    as.list()

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    text_links
  )

}
--------------------------------------------------------------------------------
/R/deliver_express_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) {
  # Parses one express.de page: author, datetime and headline come from the
  # @graph of the first JSON-LD script, the text from the article markup.
  # Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])
    json_df <- json_df$`@graph`
    # author: names of all Person entries in the graph, if any
    if (any(json_df$`@type` == "Person")) {
      author <- toString(json_df$name[json_df$`@type` == "Person"])
    } else {
      author <- ""
    }
    json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
    datetime <- lubridate::as_datetime(json_df$datePublished)
    # drop everything after " | " from the headline
    headline <- sub(" \\| .*", "", json_df$headline)
    text <- html %>%
      rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")
    if (author == "") {
      # the text has the author abbr. at the end
      abbr_pattern <- ".*\\(([^)]+)\\)$"
      # sub() returns its input unchanged when nothing matches, which would
      # store the whole article text as author; only take a real match
      if (grepl(abbr_pattern, text)) {
        author <- sub(abbr_pattern, "\\1", text)
      }
    }
    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_finanzen_net.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...)
{
  # Body of the finanzen.net parser: datetime, headline and author come from
  # the second JSON-LD script of the page, the text from the news container.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # the second script is read below, so a page with fewer than two scripts
  # has nothing to parse (json_txt[2] would be NA and fromJSON() would fail)
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[2])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements("p.h3, .news-container__text p, .news-container__text h2") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_fnp_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the fnp.de parser: datetime, headline and author come from the
  # mainEntity of the first JSON-LD script, the text from the story elements.
  # Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes)
  if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(ld_json[1])$mainEntity

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  story_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_focus_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the focus.de parser: datetime, headline and author come from the
  # NewsArticle entry of the first JSON-LD script (directly or inside its
  # @graph), the text from the article markup. Returns an empty s_n_list()
  # when there is no JSON-LD script or no NewsArticle entry.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])
  if ("@graph" %in% names(json_df)) {
    json_df <- json_df$`@graph`
  }

  # `%in%` never yields NA and copes with a missing or multi-valued @type;
  # a bare `!=` inside `if` fails for both (length-zero / length > 1)
  is_article <- json_df$`@type` %in% "NewsArticle"
  if (!any(is_article)) {
    return(s_n_list())
  }
  if (is.data.frame(json_df)) {
    # a @graph with several entries: keep the first NewsArticle row
    json_df <- json_df[which(is_article)[1], ]
  }

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".leadIn,.textBlock") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_forbes_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.forbes_com <- function(x, verbose = NULL, pb, ...)
{
  # Body of the forbes.com parser: datetime, headline and author come from
  # meta tags, the text from every paragraph of the page in x$content_raw.

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime
  published <- rvest::html_elements(page, "[property=\"article:published\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(published, "content"))

  # headline
  headline <- rvest::html_attr(
    rvest::html_elements(page, "[property=\"og:title\"]"),
    "content"
  )

  # author: several author tags are joined into one comma separated string
  author <- rvest::html_attr(
    rvest::html_elements(page, "[property=\"article:author\"]"),
    "content"
  )
  if (length(author) > 1) {
    author <- toString(author)
  }

  # text: all <p> elements of the page, one per line
  text <- paste(
    rvest::html_text2(rvest::html_elements(page, "p")),
    collapse = "\n"
  )

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_foxbusiness_com.R:
--------------------------------------------------------------------------------
#' @export

pb_deliver_paper.foxbusiness_com <- function(x, verbose = NULL, pb, ...)
{
  # Body of the foxbusiness.com parser: datetime and headline come from meta
  # tags, author and text from the markup of the page in x$content_raw.

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime
  created <- rvest::html_elements(page, "[name=\"dcterms.created\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(created, "content"))

  # headline
  headline <- rvest::html_attr(
    rvest::html_elements(page, "[property=\"og:title\"]"),
    "content"
  )

  # author: byline texts without the literal "By ", trimmed and joined
  bylines <- rvest::html_text2(
    rvest::html_elements(page, ".author,.author-byline")
  )
  author <- toString(trimws(gsub("By ", "", bylines, fixed = TRUE)))

  # text
  text <- paste(
    rvest::html_text2(rvest::html_elements(page, ".article-content")),
    collapse = "\n"
  )

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}


pb_deliver_paper.foxnews_com <- pb_deliver_paper.foxbusiness_com

--------------------------------------------------------------------------------
/R/deliver_fr_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the fr.de parser: datetime, headline and author come from the
  # mainEntity of the first JSON-LD script, the text from the lead text and
  # paragraph story elements. Returns an empty s_n_list() without a script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes)
  if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(ld_json[1])$mainEntity

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  story_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_frankenpost_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.frankenpost_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the frankenpost.de parser: datetime, headline and author come
  # from the JSON-LD script of the page, the text from the article markup.
  # Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text2()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    # fromJSON() pastes a character vector into one string, so a page with
    # several JSON-LD scripts would be unparseable; read the first script
    # NOTE(review): assumes the article metadata sits in the first script,
    # as in the sibling parsers - confirm against live pages
    json_df <- jsonlite::fromJSON(json_txt[1])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".article-text p, .article-text h2") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_freiepresse_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the freiepresse.de parser: datetime, headline and author come
  # from the JSON-LD script mentioning "NewsArticle", the text from the
  # article markup. Returns an empty s_n_list() when no such script exists.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_txt <- json_txt[grepl("NewsArticle", json_txt)]
    if (length(json_txt) == 0) {
      return(s_n_list())
    }
    # more than one script can mention "NewsArticle"; fromJSON() would paste
    # them into one unparseable string, so only the first match is read
    json_df <- jsonlite::fromJSON(json_txt[1])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author)
    text <- html %>%
      rvest::html_elements(".article__text p,.article__text h2") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_geenstijl_nl.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.geenstijl_nl <- function(x, verbose = NULL, pb, ...)
{
  # Body of the geenstijl.nl parser: extracts datetime, author, headline,
  # text and the cover image from the page in x$content_raw.

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  # datetime: visible date text, parsed as day-month-year hour:minute
  date_text <- rvest::html_text2(rvest::html_element(page, ".datetime"))
  datetime <- lubridate::dmy_hm(date_text)

  # headline: the page <title>
  headline <- rvest::html_text2(rvest::html_element(page, "title"))

  # author
  author <- toString(
    rvest::html_text2(rvest::html_element(page, "[rel=\"author\"]"))
  )

  # text: paragraphs of the first <article>, one per line
  article_node <- rvest::html_element(page, "article")
  text <- paste(
    rvest::html_text2(rvest::html_elements(article_node, "p")),
    collapse = "\n"
  )

  # cover image: the same node supplies both the markup and the src url
  cover_node <- rvest::html_element(page, "article img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )
}
--------------------------------------------------------------------------------
/R/deliver_haz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the haz.de parser: datetime, headline and author come from the
  # JSON-LD script mentioning "NewsArticle", the text from the article
  # wrapper. Returns an empty s_n_list() when no such script exists.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_txt <- json_txt[grepl("NewsArticle", json_txt)]
    if (length(json_txt) == 0) {
      return(s_n_list())
    }
    # more than one script can mention "NewsArticle"; fromJSON() would paste
    # them into one unparseable string, so only the first match is read
    json_df <- jsonlite::fromJSON(json_txt[1])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_heidelberg24_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the heidelberg24.de parser: datetime, headline and author come
  # from the mainEntity of the first JSON-LD script, the text from the story
  # elements. Returns an empty s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes)
  if (length(ld_json) == 0 || isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(ld_json[1])$mainEntity

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  story_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_heise_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.heise_de <- function(x, verbose = NULL, pb, ...)
{
  # Body of the heise.de parser: datetime, headline and author come from the
  # first JSON-LD script, the text from the article markup. Returns an empty
  # s_n_list() without a JSON-LD script.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # scalar, short-circuiting `||` belongs in an `if` condition (the sibling
  # parsers use it as well); the elementwise `|` was used here before
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)

    text <- html %>%
      rvest::html_elements("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_hn_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.hn_cz <- function(x, verbose = NULL, pb, ...)
{
  # Parser for hn.cz pages. Metadata comes from meta tags and the title
  # element, the body from the article paragraphs; a paywall marker in the
  # page is reported and also prefixed to the text.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # content_raw holds the page source; it is converted to UTF-8 bytes
  # before being handed to the HTML parser
  page <- rvest::read_html(charToRaw(enc2utf8(x$content_raw)))

  # datetime
  published_node <- rvest::html_element(page, "[property=\"article:published_time\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))

  # headline
  headline <- rvest::html_text2(rvest::html_element(page, "title"))

  # author (all author meta tags, comma separated)
  author_nodes <- rvest::html_elements(page, "[name=\"author\"]")
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # text
  paragraphs <- rvest::html_text2(rvest::html_elements(page, ".article-content p"))
  text <- paste(paragraphs, collapse = "\n")

  # a matched .paywall node has non-zero length, a missing one has length 0
  paywall <- length(rvest::html_element(page, ".paywall")) > 0
  if (paywall) {
    text <- paste("[Paywall-Truncated]", text)
  }

  # cover image: full tag and its src attribute
  cover_node <- rvest::html_element(page, ".article-image-wrapper img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    paywall,
    cover_image_url,
    cover_image_html
  )
}

--------------------------------------------------------------------------------
/R/deliver_hna_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for hna.de pages: metadata is taken from the `mainEntity` member
  # of the first JSON-LD script element, the text from the story elements.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    # no JSON-LD on the page: nothing to extract
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])
    json_df <- json_df$mainEntity
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_huffpost_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.huffpost_com <- function(x, verbose = NULL, pb, ...) {
  # Parser for huffpost.com pages. Besides the usual fields it reports a
  # content_type ("article", "video" or "unknown") derived from the
  # attributes of the first `article` element.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime
  datetime <- html %>%
    rvest::html_elements("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline (title and subtitle joined with ". ")
  headline <- html %>%
    rvest::html_elements(".headline__title,.headline__subtitle,.js-headline,.headline") %>%
    rvest::html_text() %>%
    paste0(collapse = ". ")

  # author, with a leading "By " removed
  author <- html %>%
    rvest::html_element(".author-card__name,.wire-byline,.entry-wirepartner__byline") %>%
    rvest::html_text() %>%
    gsub("^By\\b\\s+", "", .)

  # text
  text <- html %>%
    rvest::html_elements("p,.entry-video__content__description") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # Attributes of the first `article` element. The original indexed `[[1]]`
  # unconditionally, which fails with "subscript out of bounds" on pages
  # that have no `article` element; such pages now yield "unknown".
  article_nodes <- rvest::html_elements(html, "article")
  type <- if (length(article_nodes) > 0L) {
    rvest::html_attrs(article_nodes)[[1]]
  } else {
    character(0)
  }

  content_type <- dplyr::case_when(
    "article" %in% type ~ "article",
    "entry-video" %in% type ~ "video",
    TRUE ~ "unknown"
  )

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    content_type
  )

}


# define aliases for pages using the same layout
pb_deliver_paper.huffingtonpost_com <-
  pb_deliver_paper.huffingtonpost_co_uk <-
  pb_deliver_paper.huffpost_com

--------------------------------------------------------------------------------
/R/deliver_idnes_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.idnes_cz <- function(x, verbose = NULL, pb, ...)
{
  # Parser for idnes.cz pages. The raw content is re-encoded from
  # windows-1250 to UTF-8 before parsing; metadata comes from meta tags,
  # text from the opener and article paragraphs.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  x$content_raw <- iconv(x$content_raw, from = "windows-1250", to = "UTF-8")
  html <- rvest::read_html(x$content_raw)

  # datetime
  datetime <- html %>%
    rvest::html_element("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_element(".content h1") %>%
    rvest::html_text2()

  # author
  author <- html %>%
    rvest::html_element("[property=\"article:author\"]") %>%
    rvest::html_attr("content") %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements(".opener,.text p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # all cover images / videos as HTML, one tag per line
  cover_image_html <- html %>%
    rvest::html_elements(".art-full img,video") %>%
    as.character() %>%
    paste(collapse = "\n")

  cover_image_url <- html %>%
    rvest::html_element(".art-full img,video") %>%
    rvest::html_attr("src")

  # Prefix the scheme only when there is a URL and it does not already carry
  # one. The unconditional paste0() turned a missing src into the literal
  # string "https:NA" and an absolute URL into "https:https://...".
  if (!is.na(cover_image_url) &&
    !grepl("^[A-Za-z][A-Za-z0-9+.-]*:", cover_image_url)) {
    cover_image_url <- paste0("https:", cover_image_url)
  }

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}

--------------------------------------------------------------------------------
/R/deliver_independent_co_uk.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.independent_co_uk <- function(x, verbose = NULL, pb, ...)
{
  # Parser for independent.co.uk pages: datetime and author come from meta
  # properties, headline and text from the page markup.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # datetime
  date_node <- rvest::html_element(page, "[property=\"date\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(date_node, "content"))

  # headline
  headline <- rvest::html_text2(rvest::html_element(page, "#articleHeader h1"))

  # author
  author_node <- rvest::html_element(page, "[property=\"article:author_name\"]")
  author <- toString(rvest::html_attr(author_node, "content"))

  # text: every paragraph below #main, one per line
  paragraphs <- rvest::html_text2(rvest::html_elements(page, "#main p"))
  text <- paste(paragraphs, collapse = "\n")

  # s_n_list() builds the named result list from these objects
  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_independent_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.independent_ie <- function(x, verbose = NULL, pb, ...)
{
  # Parser for independent.ie pages. Note that the timestamp is read from
  # the article:modified_time meta property.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # datetime
  modified_node <- rvest::html_element(page, "[property=\"article:modified_time\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(modified_node, "content"))

  # headline
  headline <- rvest::html_attr(
    rvest::html_element(page, "[property=\"og:title\"]"),
    "content"
  )

  # author (all matching meta tags, comma separated)
  author_nodes <- rvest::html_elements(page, "[name=\"cXenseParse:mhu-article_author\"]")
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # text
  paragraphs <- rvest::html_text2(
    rvest::html_elements(page, "[data-fragment-name=\"articleDetail\"] p")
  )
  text <- paste(paragraphs, collapse = "\n")

  # cover image: full tag and its src attribute
  cover_node <- rvest::html_element(page, "[data-testid=\"article-image-wrapper\"] img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )
}

--------------------------------------------------------------------------------
/R/deliver_infranken_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for infranken.de pages: every field, including the article body,
  # is read from the first JSON-LD script element.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(ld_json[1])

  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)
  text <- meta$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_irishexaminer_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.irishexaminer_com <- function(x, verbose = NULL, pb, ...) {
  # Parser for irishexaminer.com pages. The cover image src is prefixed
  # with the site origin when an image is present.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # datetime
  published_node <- rvest::html_element(page, "[property=\"article:published_time\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))

  # headline
  headline <- rvest::html_text2(rvest::html_element(page, ".article-title"))

  # author
  author <- toString(rvest::html_text2(rvest::html_element(page, ".author-byline")))

  # text
  paragraphs <- rvest::html_text2(rvest::html_elements(page, "article p"))
  text <- paste(paragraphs, collapse = "\n")

  # cover image: full tag and its src attribute
  cover_node <- rvest::html_element(page, "picture img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  # leave a missing src as NA instead of building a bogus URL
  if (!is.na(cover_image_url)) {
    cover_image_url <- paste0("https://www.irishexaminer.com", cover_image_url)
  }

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )
}

--------------------------------------------------------------------------------
/R/deliver_irishmirror_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.irishmirror_ie <- function(x, verbose = NULL, pb, ...) {
  # Parser for irishmirror.ie pages: metadata from meta tags and the title
  # element, text and cover image from the article wrapper.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # datetime
  published_node <- rvest::html_element(page, "[property=\"article:published_time\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))

  # headline
  headline <- rvest::html_text2(rvest::html_element(page, "title"))

  # author
  author_node <- rvest::html_element(page, "[name=\"author\"]")
  author <- toString(rvest::html_attr(author_node, "content"))

  # text
  paragraphs <- rvest::html_text2(rvest::html_elements(page, ".article-wrapper p"))
  text <- paste(paragraphs, collapse = "\n")

  # cover image: full tag and its src attribute
  cover_node <- rvest::html_element(page, ".article-wrapper .img-container img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )
}

--------------------------------------------------------------------------------
/R/deliver_irishtimes_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.irishtimes_com <- function(x, verbose = NULL, pb, ...)
{
  # Parser for irishtimes.com pages. Metadata is taken from the JSON-LD
  # block whose @type is "NewsArticle"; the text comes from the article
  # paragraphs. When no such block exists s_n_list() is called without
  # fields.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # every JSON-LD script element, parsed
  blocks <- html %>%
    rvest::html_elements("[type=\"application/ld+json\"]") %>%
    rvest::html_text2() %>%
    lapply(jsonlite::fromJSON)

  # Pick the first block typed as NewsArticle. The original only unwrapped
  # the list when it held more than one block, so a page with exactly one
  # block produced NULL for every field, and pluck() with zero or several
  # matching indices raised an error. `%in%` also copes with a block whose
  # @type holds several values.
  is_article <- vapply(
    blocks,
    function(block) {
      "NewsArticle" %in% purrr::pluck(block, "@type", .default = NA_character_)
    },
    logical(1)
  )

  if (!any(is_article)) {
    return(s_n_list())
  }

  data <- blocks[[which(is_article)[1L]]]

  # datetime
  datetime <- data$datePublished %>%
    lubridate::as_datetime()

  # headline
  headline <- data$headline

  # author
  author <- data$author$name %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements("article p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # first image entry, NA when the block lists none
  cover_image_url <- purrr::pluck(data$image, 1, .default = NA)

  type <- data$`@type`

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )
}

--------------------------------------------------------------------------------
/R/deliver_irozhlas_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.irozhlas_cz <- function(x, verbose = NULL, pb, ...)
{
  # Parser for irozhlas.cz pages. The first JSON-LD script element is
  # parsed without simplification and searched for the entry whose @type is
  # "NewsArticle"; text comes from the article paragraphs.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # data about the article is nicely stored in a json string
  data <- html %>%
    rvest::html_elements("[type=\"application/ld+json\"]") %>%
    rvest::html_text() %>%
    lapply(jsonlite::fromJSON, simplifyVector = FALSE) %>%
    purrr::pluck(1L)

  # usually there are more than one entry; keep the NewsArticle one
  if (length(data) > 0L) {
    tp <- purrr::map_chr(data, function(entry) {
      purrr::pluck(entry, "@type", .default = NA_character_)
    })

    # pluck() needs exactly one index, so the original call failed whenever
    # zero or several entries were typed NewsArticle. Take the first match,
    # or fall back to NULL so the fields below come out empty.
    article_idx <- which(tp == "NewsArticle")
    data <- if (length(article_idx) > 0L) data[[article_idx[1L]]] else NULL
  }

  # datetime
  datetime <- data$datePublished %>%
    lubridate::as_datetime()

  # headline
  headline <- data$headline

  # author
  author <- purrr::map_chr(data$author, "name") %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements("article p:not(.meta):not([class*=\"b-audio-player\"])") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_)

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url
  )

}

--------------------------------------------------------------------------------
/R/deliver_joe_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.joe_ie <- function(x, verbose = NULL, pb, ...)
{
  # Parser for joe.ie pages: metadata from the first JSON-LD script
  # element, text from the article paragraphs.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text of the first JSON-LD element (NA when the page has none)
  ld_json <- rvest::html_text2(
    rvest::html_element(page, "[type=\"application/ld+json\"]")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json))) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(ld_json)

  # datetime
  datetime <- lubridate::as_datetime(meta$datePublished)

  # headline
  headline <- meta$headline

  # author
  author <- toString(meta$author$name)

  # text
  paragraphs <- rvest::html_text2(rvest::html_elements(page, "article p"))
  text <- paste(paragraphs, collapse = "\n")

  # first listed image URL
  cover_image_url <- utils::head(meta$image$url, 1L)

  type <- meta$`@type`

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )
}

--------------------------------------------------------------------------------
/R/deliver_jungefreiheit_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for jungefreiheit.de pages. The first JSON-LD script element
  # holds an @graph table: rows typed "Person" supply the author, the row
  # matching "NewsArticle" supplies datetime and headline.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  graph <- jsonlite::fromJSON(ld_json[1])$`@graph`

  # author names are the Person entries; empty string when there are none
  if (any(graph$`@type` == "Person")) {
    author <- toString(graph$name[graph$`@type` == "Person"])
  } else {
    author <- ""
  }

  article <- graph[grepl("NewsArticle", graph$`@type`), ]
  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline

  body_nodes <- rvest::html_elements(
    page,
    ".elementor-widget-container p, .elementor-widget-container h3"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_kabeleins_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for kabeleins.de pages: metadata is read from the SECOND JSON-LD
  # script element, text from the styled paragraph / heading classes.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # The article metadata sits in the second block. With fewer than two
  # blocks json_txt[2] is NA and fromJSON() would raise an error, so such
  # pages are treated like pages without any JSON-LD.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[2])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_karlsruhe_insider_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for karlsruhe-insider.de pages. The first JSON-LD script element
  # holds an @graph table: rows typed "Person" supply the author, the first
  # row supplies datetime and headline.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  graph <- jsonlite::fromJSON(ld_json[1])$`@graph`

  # author names are the Person entries; empty string when there are none
  if (any(graph$`@type` == "Person")) {
    author <- toString(graph$name[graph$`@type` == "Person"])
  } else {
    author <- ""
  }

  first_entry <- graph[1, ]
  datetime <- lubridate::as_datetime(first_entry$datePublished)
  headline <- first_entry$headline

  # paragraphs and sub-headings inside the post content container
  post_content <- rvest::html_element(page, "article .td-post-content")
  body_nodes <- rvest::html_elements(post_content, "p, h2")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_kreiszeitung_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for kreiszeitung.de pages: metadata is taken from the
  # `mainEntity` member of the first JSON-LD script element, the text from
  # the story elements.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(ld_json[1])$mainEntity
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  # lead text, paragraphs and crossheads, one per line
  story_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_ksta_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for ksta.de pages. The first JSON-LD script element holds an
  # @graph table: rows typed "Person" supply the author, the row matching
  # "NewsArticle" supplies datetime and headline. When no Person entry
  # exists, the author abbreviation in parentheses at the end of the text
  # is used instead.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])
    json_df <- json_df$`@graph`
    if (any(json_df$`@type` == "Person")) {
      author <- toString(json_df$name[json_df$`@type` == "Person"])
    } else {
      author <- ""
    }
    json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
    datetime <- lubridate::as_datetime(json_df$datePublished)
    # drop everything from the first " | " onwards
    headline <- sub(" \\| .*", "", json_df$headline)
    text <- html %>%
      rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")
    if (author == "") {
      # the text has the author abbr. at the end
      # sub() returns its input unchanged when the pattern does not match,
      # so without this check the whole article text became the author
      # whenever the text did not end in "(...)".
      abbr_pattern <- ".*\\(([^)]+)\\)$"
      if (grepl(abbr_pattern, text)) {
        author <- sub(abbr_pattern, "\\1", text)
      }
    }
    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_kurier_at.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...)
{
  # Parser for kurier.at pages: metadata from the first JSON-LD script
  # element, text from the intro and paragraph blocks.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    # no JSON-LD on the page: nothing to extract
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".headerComp-intro,.paragraph.copy") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_latimes_com.R:
--------------------------------------------------------------------------------
#' @export

pb_deliver_paper.latimes_com <- function(x, verbose = NULL, pb, ...) {
  # Parser for latimes.com pages: datetime and headline from meta tags,
  # author from the .authors byline, text from the article paragraphs.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime
  datetime <- html %>%
    rvest::html_elements("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_elements("[property=\"og:title\"]") %>%
    rvest::html_attr("content")

  # author
  # Only a leading "By" label is stripped, and only when it is followed by
  # whitespace or an upper-case letter. The original removed the substring
  # "By" everywhere, which mangled names such as "Byron" or "Byrne".
  author <- html %>%
    rvest::html_elements(".authors") %>%
    rvest::html_text() %>%
    gsub("\n", "", .) %>%
    trimws() %>%
    sub("^By(?:\\s+|(?=[[:upper:]]))", "", ., perl = TRUE) %>%
    trimws() %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements(".page-article-container>p,.rich-text-body>p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )

}

--------------------------------------------------------------------------------
/R/deliver_lidovky_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.lidovky_cz <- function(x, verbose = NULL, pb, ...) {
  # Parser for lidovky.cz pages. Metadata comes from meta tags and itemprop
  # markup; a #paywall element in the page is reported and also prefixed to
  # the text.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(charToRaw(enc2utf8(x$content_raw)))

  # datetime
  datetime <- html %>%
    rvest::html_element("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline
  headline <- html %>%
    rvest::html_element("[itemprop=\"name headline\"]") %>%
    rvest::html_text2()

  # author
  author <- html %>%
    rvest::html_element("[itemprop=\"author\"] span") %>%
    rvest::html_text2() %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements("[itemprop=\"articleBody\"] p,.opener") %>%
    rvest::html_text2() %>%
    trimws() %>%
    paste(collapse = "\n")

  # a matched #paywall node has non-zero length, a missing one has length 0
  paywall <- FALSE
  if (length(rvest::html_element(html, "#paywall")) > 0) {
    text <- paste("[Paywall-Truncated]", text)
    paywall <- TRUE
  }

  cover_image_html <- html %>%
    rvest::html_element(".opener-foto img,.opener-flv-player img") %>%
    as.character()

  cover_image_url <- html %>%
    rvest::html_element(".opener-foto img,.opener-flv-player img") %>%
    rvest::html_attr("src")

  # Prefix the scheme only when there is a URL and it does not already carry
  # one. The unconditional paste0() turned a missing src into the literal
  # string "https:NA" and an absolute URL into "https:https://...".
  if (!is.na(cover_image_url) &&
    !grepl("^[A-Za-z][A-Za-z0-9+.-]*:", cover_image_url)) {
    cover_image_url <- paste0("https:", cover_image_url)
  }

  s_n_list(
    datetime,
    author,
    headline,
    text,
    paywall,
    cover_image_url,
    cover_image_html
  )

}

--------------------------------------------------------------------------------
/R/deliver_lvz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for lvz.de pages: metadata is read from the THIRD JSON-LD script
  # element, text from the styled headline / text components of the body.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # With fewer than three blocks json_txt[3] is NA and fromJSON() would
  # raise an error, so such pages are treated like pages without JSON-LD.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 3) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[3])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_manager_magazin_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for manager-magazin.de pages: the first JSON-LD script element
  # is filtered to its "NewsArticle" row for the metadata; text comes from
  # the lead and rich-text blocks.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  entries <- jsonlite::fromJSON(ld_json[1])
  article <- entries[entries$`@type` == "NewsArticle", ]

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  body_nodes <- rvest::html_elements(page, ".leading-loose, .RichText p, .RichText h3")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_marketwatch_com.R:
--------------------------------------------------------------------------------
#' @export

pb_deliver_paper.marketwatch_com <- function(x, verbose = NULL, pb, ...)
{
  # Parser for marketwatch.com pages: datetime, headline and author come
  # from meta tags; text is every paragraph that is not a direct child of
  # an author-bio description.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # datetime
  date_nodes <- rvest::html_elements(page, "[name=\"parsely-pub-date\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(date_nodes, "content"))

  # headline
  title_nodes <- rvest::html_elements(page, "[property =\"og:title\"]")
  headline <- rvest::html_attr(title_nodes, "content")

  # author (all matching meta tags, comma separated)
  author_nodes <- rvest::html_elements(page, "[name=\"parsely-author\"]")
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # text
  paragraphs <- rvest::html_text2(
    rvest::html_elements(page, ":not(.bio__description)>p")
  )
  text <- paste(paragraphs, collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_maz_online_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for maz-online.de pages: metadata is read from the THIRD JSON-LD
  # script element, text from the styled text components and sub-headings.

  # updates progress bar
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # With fewer than three blocks json_txt[3] is NA and fromJSON() would
  # raise an error, so such pages are treated like pages without JSON-LD.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 3) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[3])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}

--------------------------------------------------------------------------------
/R/deliver_mdr_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for mdr.de pages: metadata from the first JSON-LD script
  # element, text from the intro and paragraph blocks.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # parse the raw HTML held in the content_raw column
  page <- rvest::read_html(x$content_raw)

  # text content of every JSON-LD script element
  ld_json <- rvest::html_text(
    rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  )

  # nothing to extract without JSON-LD metadata
  if (isTRUE(is.na(ld_json)) || length(ld_json) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(ld_json[1])

  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  body_nodes <- rvest::html_elements(page, ".einleitung,.paragraph")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}

--------------------------------------------------------------------------------
/R/deliver_mediacourant_nl.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.mediacourant_nl <- function(x, verbose = NULL, pb, ...)
{
  # Parses one mediacourant.nl article: reads the raw HTML held in
  # x$content_raw and returns, via s_n_list(), the publication datetime,
  # author, headline, body text and cover image (URL and <img> markup).

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime: content of the article:published_time meta property
  datetime <- html %>%
    rvest::html_element("[property=\"article:published_time\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline: text of the page <title>
  headline <- html %>%
    rvest::html_element("title") %>%
    rvest::html_text2()

  # author: content of the author meta tag
  author <- html %>%
    rvest::html_element("[name=\"author\"]") %>%
    rvest::html_attr("content") %>%
    toString()

  # text: paragraphs and h2 sub-headings inside .entry__content. Every
  # alternative of the selector list needs its own class dot; without it the
  # second alternative looks for an <entry__content> element and the
  # sub-headings are silently dropped.
  text <- html %>%
    rvest::html_elements(".entry__content p,.entry__content h2") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  # cover image: first <img> inside <article>, kept as markup and as URL
  cover_image_html <- html %>%
    rvest::html_element("article img") %>%
    as.character()

  cover_image_url <- html %>%
    rvest::html_element("article img") %>%
    rvest::html_attr("src")

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
50 | -------------------------------------------------------------------------------- /R/deliver_merkur_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...)
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | json_df <- json_df$mainEntity 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | 17 | text <- html %>% 18 | rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% 19 | rvest::html_text2() %>% 20 | paste(collapse = "\n") 21 | 22 | s_n_list( 23 | datetime, 24 | author, 25 | headline, 26 | text 27 | ) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /R/deliver_metronieuws_nl.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.metronieuws_nl <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("[property=\"article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element(".article__title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_elements("[name=\"author\"]") %>% 22 | rvest::html_attr("content") %>% 23 | toString() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".article__content>p,.article__content>h2:not(.coral-talk-heading)") %>% 28 | rvest::html_text2() %>% 29 | paste(collapse = "\n") 30 | 31 | 32 | 33 | cover_image_html <- html %>% 34 | rvest::html_element(".featured-image img") %>% 35 | as.character() 36 | 37 | cover_image_url <- html %>% 38 | rvest::html_element(".featured-image img") %>% 39 | rvest::html_attr("src") 40 | 41 | # the helper function safely creates a named list from objects 42 | s_n_list( 43 | datetime, 44 | author, 45 | headline, 46 | text, 47 | cover_image_url, 48 | cover_image_html 49 | ) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /R/deliver_mopo_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) 
{
  # Parses one mopo.de article: reads the raw HTML held in x$content_raw and
  # returns datetime, author, headline and text via s_n_list(). An empty
  # s_n_list() is returned when the page carries no ld+json metadata.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- html %>%
    rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  # the metadata entities live under @graph of the first ld+json script
  json_df <- jsonlite::fromJSON(json_txt[1])
  json_df <- json_df$`@graph`

  # author: names of all Person entries, "" when there is none
  if (any(json_df$`@type` == "Person")) {
    author <- toString(json_df$name[json_df$`@type` == "Person"])
  } else {
    author <- ""
  }

  # keep only the NewsArticle entry for date and headline
  json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
  datetime <- lubridate::as_datetime(json_df$datePublished)
  # drop everything from the first " | " onwards
  headline <- sub(" \\| .*", "", json_df$headline)
  text <- html %>%
    rvest::html_elements("p, h2") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  if (author == "") {
    # the text has the author abbr. at the end, in parentheses. sub() returns
    # its input unchanged when the pattern does not match, so substitute only
    # on a match; otherwise the whole article text would be reported as the
    # author.
    abbr_pattern <- ".*\\(([^)]+)\\)$"
    if (grepl(abbr_pattern, text)) {
      author <- sub(abbr_pattern, "\\1", text)
    }
  }

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
37 | -------------------------------------------------------------------------------- /R/deliver_morgenpost_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...)
{
  # Parses one morgenpost.de article: reads the raw HTML held in
  # x$content_raw and returns datetime, author, headline and text via
  # s_n_list(). An empty s_n_list() is returned when the expected ld+json
  # metadata is not present.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- html %>%
    rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # The article metadata is read from the second ld+json script, so at least
  # two scripts are required. With fewer, json_txt[2] is NA and
  # jsonlite::fromJSON() fails instead of yielding an empty result.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2L) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  # text: all paragraphs inside .article-body, one per line
  text <- html %>%
    rvest::html_elements(".article-body p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
29 | -------------------------------------------------------------------------------- /R/deliver_n-tv_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...)
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements(".article__text") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | 22 | s_n_list( 23 | datetime, 24 | author, 25 | headline, 26 | text 27 | ) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /R/deliver_ndr_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements(".modulepadding.copytext p, .modulepadding.copytext h2") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | } else { 21 | datetime <- lubridate::as_datetime(json_df$uploadDate) 22 | headline <- json_df$name 23 | author <- "" 24 | text <- json_df$description 25 | } 26 | s_n_list( 27 | datetime, 28 | author, 29 | headline, 30 | text 31 | ) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /R/deliver_news_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- trimws(gsub("\\+\\+\\+.*?\\+\\+\\+", "", json_df$articleBody)) 17 | text <- gsub("\r\n", "\n", text) 18 | text <- gsub("Folgen Sie.*", "", text) 19 | s_n_list( 20 | datetime, 21 | author, 22 | headline, 23 | text 24 | ) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /R/deliver_news_und_nachrichten_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(gsub("[\r\n]*", "", json_txt[1])) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author) 16 | text <- json_df$articleBody 17 | 18 | s_n_list( 19 | datetime, 20 | author, 21 | headline, 22 | text 23 | ) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /R/deliver_newsflash24_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) { 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | json_df <- json_df$`@graph` 13 | if (any(json_df$`@type` == "Person")) { 14 | author <- toString(json_df$name[json_df$`@type` == "Person"]) 15 | } else { 16 | author <- "" 17 | } 18 | json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] 19 | datetime <- lubridate::as_datetime(json_df$datePublished) 20 | headline <- json_df$headline 21 | text <- html %>% 22 | rvest::html_elements(".entry-content p, .entry-content h2") %>% 23 | rvest::html_text2() %>% 24 | paste(collapse = "\n") 25 | 26 | s_n_list( 27 | datetime, 28 | author, 29 | headline, 30 | text 31 | ) 32 | } 33 | } 34 | 
-------------------------------------------------------------------------------- /R/deliver_newstatesman_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.newstatesman_com <- function(x, verbose = NULL, pb, ...) { 3 | 4 | # updates progress bar 5 | pb_tick(x, verbose, pb) 6 | 7 | # raw html is stored in column content_raw 8 | html <- rvest::read_html(x$content_raw) 9 | 10 | # datetime 11 | datetime <- html %>% 12 | rvest::html_element("[property=\"article:published_time\"]") %>% 13 | rvest::html_attr("content") %>% 14 | lubridate::as_datetime() 15 | 16 | # headline 17 | headline <- html %>% 18 | rvest::html_element("[property=\"og:title\"]") %>% 19 | rvest::html_attr("content") 20 | 21 | # author 22 | author <- html %>% 23 | rvest::html_element("[name=\"author\"]") %>% 24 | rvest::html_attr("content") 25 | 26 | # text 27 | text <- html %>% 28 | rvest::html_elements(".c-article-content__container p") %>% 29 | rvest::html_text2() %>% 30 | paste(collapse = "\n") 31 | 32 | cover_image_html <- html %>% 33 | rvest::html_element(".c-featured-image__container img") %>% 34 | as.character() 35 | 36 | cover_image_url <- html %>% 37 | rvest::html_element(".c-featured-image__container img") %>% 38 | rvest::html_attr("src") 39 | 40 | s_n_list( 41 | datetime, 42 | author, 43 | headline, 44 | text, 45 | cover_image_url, 46 | cover_image_html 47 | ) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /R/deliver_newsweek_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | 3 | pb_deliver_paper.newsweek_com <- function(x, verbose = NULL, pb, ...) 
{ 4 | 5 | pb_tick(x, verbose, pb) 6 | # raw html is stored in column content_raw 7 | html <- rvest::read_html(x$content_raw) 8 | 9 | # datetime 10 | datetime <- html %>% 11 | rvest::html_elements("[property=\"article:published_time\"]") %>% 12 | rvest::html_attr("content") %>% 13 | lubridate::as_datetime() 14 | 15 | # headline 16 | headline <- html %>% 17 | rvest::html_elements("[property =\"og:title\"]") %>% 18 | rvest::html_attr("content") 19 | 20 | # author 21 | author <- html %>% 22 | rvest::html_elements("[class=\"author\"]") %>% 23 | rvest::html_text2() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".article-body") %>% 28 | rvest::html_elements("p") %>% 29 | rvest::html_text2() %>% 30 | paste(collapse = "\n") 31 | 32 | s_n_list( 33 | datetime, 34 | author, 35 | headline, 36 | text 37 | ) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /R/deliver_nordkurier_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) 
{
  # Parses one nordkurier.de article: reads the raw HTML held in
  # x$content_raw and returns datetime, author, headline and text via
  # s_n_list(). An empty s_n_list() is returned when the expected ld+json
  # metadata is not present.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- html %>%
    rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # The article metadata is read from the second ld+json script, so at least
  # two scripts are required. With fewer, json_txt[2] is NA and
  # jsonlite::fromJSON() fails instead of yielding an empty result.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2L) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  # text: elements matched by the three class selectors, one per line
  text <- html %>%
    rvest::html_elements(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
29 | -------------------------------------------------------------------------------- /R/deliver_nos_nl.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nos_nl <- function(x, verbose = NULL, pb, ...)
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("[property=\"og:article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element("title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_element(".NYlVB") %>% 22 | rvest::html_text2() %>% 23 | stats::na.omit() %>% 24 | toString() 25 | 26 | # text 27 | text <- html %>% 28 | rvest::html_elements("#content p,#content h2") %>% 29 | rvest::html_text2() %>% 30 | setdiff("Deel artikel:") %>% 31 | paste(collapse = "\n") 32 | 33 | cover_image_html <- html %>% 34 | rvest::html_element("#content button picture") %>% 35 | as.character() 36 | 37 | cover_image_url <- html %>% 38 | rvest::html_element("#content button picture img") %>% 39 | rvest::html_attr("src") 40 | 41 | # the helper function safely creates a named list from objects 42 | s_n_list( 43 | datetime, 44 | author, 45 | headline, 46 | text, 47 | cover_image_url, 48 | cover_image_html 49 | ) 50 | 51 | } 52 | -------------------------------------------------------------------------------- /R/deliver_novinky_cz.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.novinky_cz <- function(x, verbose = NULL, pb, ...) 
{
  # Parses one novinky.cz article: reads the raw HTML held in x$content_raw
  # and returns, via s_n_list(), the publication datetime, author, headline,
  # body text and cover image (URL and <img> markup).

  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # article metadata is parsed from the JSON held in the first script under
  # .page-detail; try() keeps a missing or malformed script from aborting
  # the parse
  page_data <- try({html %>%
      rvest::html_element(".page-detail script") %>%
      rvest::html_text() %>%
      jsonlite::fromJSON()}, silent = TRUE)

  # datetime
  datetime <- purrr::pluck(page_data, "datePublished", .default = NA_character_) %>%
    lubridate::as_datetime()

  # headline
  headline <- purrr::pluck(page_data, "headline", .default = NA_character_)

  # author
  author <- purrr::pluck(page_data, "author", "name", .default = NA_character_) %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements(".j_if .speakable") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  cover_image_html <- html %>%
    rvest::html_element(".ogm-main-media__container img") %>%
    as.character()

  # The src attribute gets an "https:" scheme prepended. Only do so when an
  # image was actually found: pasting onto a missing value would produce the
  # literal string "https:NA" instead of NA.
  cover_image_url <- html %>%
    rvest::html_element(".ogm-main-media__container img") %>%
    rvest::html_attr("src")
  if (!is.na(cover_image_url)) {
    cover_image_url <- paste0("https:", cover_image_url)
  }

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
50 | -------------------------------------------------------------------------------- /R/deliver_noz_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...)
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements("p.w-600,section.content--group p, section.content--group h2") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_nrc_nl.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nrc_nl <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("time") %>% 11 | rvest::html_attr("datetime") %>% 12 | lubridate::as_datetime() 13 | 14 | type <- NULL 15 | if (is.na(datetime)) { 16 | datetime <- html %>% 17 | rvest::html_element(".artikel") %>% 18 | rvest::html_attr("data-article-updated-at") %>% 19 | lubridate::as_datetime() 20 | 21 | type <- html %>% 22 | rvest::html_element(".artikel") %>% 23 | rvest::html_attr("data-article-type") 24 | } 25 | 26 | # headline 27 | headline <- html %>% 28 | rvest::html_element("[property=\"og:title\"]") %>% 29 | rvest::html_attr("content") 30 | 31 | if (!is.null(type)) headline <- paste0("[", type, "] ", headline) 32 | 33 | # author 34 | author <- html %>% 35 | rvest::html_elements("[rel=\"author\"],.authors") %>% 36 | rvest::html_text2() %>% 37 | toString() 38 | 39 | # text 40 | text <- html %>% 41 | rvest::html_elements(".article__content>p,.article__content>.bericht>p,.podcast-content,.vorm__article-content>p") %>% 42 | rvest::html_text2() %>% 43 | paste(collapse = "\n") 44 | 45 | cover_image_html <- html %>% 46 | rvest::html_element("picture img") %>% 47 | as.character() 48 | 49 | cover_image_url <- html %>% 50 | rvest::html_element("picture img") %>% 51 | rvest::html_attr("src") 52 | 53 | # the helper function safely creates a named list from objects 54 | s_n_list( 55 | datetime, 56 | author, 57 | headline, 58 | text, 59 | cover_image_url, 60 | cover_image_html 61 | ) 62 | 63 | } 64 | -------------------------------------------------------------------------------- /R/deliver_nu_nl.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nu_nl <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_element("[name=\"article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_element("title") %>% 17 | rvest::html_text2() 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_element(".author") %>% 22 | rvest::html_text2() %>% 23 | toString() 24 | 25 | # text 26 | text <- html %>% 27 | rvest::html_elements(".textblock.paragraph") %>% 28 | rvest::html_text2() %>% 29 | paste(collapse = "\n") 30 | 31 | cover_image_html <- html %>% 32 | rvest::html_element(".article .app-image") %>% 33 | as.character() 34 | 35 | cover_image_url <- html %>% 36 | rvest::html_element(".article .app-image") %>% 37 | rvest::html_attr("src") 38 | 39 | # the helper function safely creates a named list from objects 40 | s_n_list( 41 | datetime, 42 | author, 43 | headline, 44 | text, 45 | cover_image_url, 46 | cover_image_html 47 | ) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /R/deliver_nw_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[2]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements("p.em_text,h2.Zwischenzeile") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_nypost_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nypost_com <- function(x, verbose = NULL, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | rvest::html_elements("[property=\"article:published_time\"]") %>% 11 | rvest::html_attr("content") %>% 12 | lubridate::as_datetime() 13 | 14 | # headline 15 | headline <- html %>% 16 | rvest::html_elements("[property=\"og:title\"]") %>% 17 | rvest::html_attr("content") 18 | 19 | # author 20 | author <- html %>% 21 | rvest::html_elements(".byline__author") %>% 22 | rvest::html_text2() %>% 23 | toString() %>% 24 | gsub("By ", "", ., fixed = TRUE) 25 | 26 | # text 27 | text <- html %>% 28 | rvest::html_elements("[class*=\"content\"]>p,[class*=\"entry-content\"]>p") %>% 29 | rvest::html_text2() %>% 30 | paste(collapse = "\n") 31 | 32 | # the helper function safely creates a named list from objects 33 | s_n_list( 34 | datetime, 35 | author, 36 | headline, 37 | text 38 | ) 39 | 40 | } 41 | 42 | pb_deliver_paper.decider_com <- 43 | pb_deliver_paper.pagesix_com <- 44 | pb_deliver_paper.nypost_com 45 | -------------------------------------------------------------------------------- /R/deliver_nytimes_com.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nytimes_com <- function(x, verbose, pb, ...) 
{ 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(x$content_raw) 7 | 8 | # datetime 9 | datetime <- html %>% 10 | html_search(selectors = c( 11 | "[property=\"article:published_time\"]" 12 | ), attributes = "content") %>% 13 | lubridate::as_datetime() 14 | 15 | # author 16 | author <- html %>% 17 | rvest::html_elements("[name=\"byl\"]") %>% 18 | rvest::html_attr("content") %>% 19 | toString() %>% 20 | gsub("By ", "", ., fixed = TRUE) %>% 21 | unique() %>% 22 | toString() 23 | 24 | if (!isFALSE(is.na(datetime))) { 25 | datetime <- html %>% 26 | rvest::html_elements("[slot=\"data\"],script") %>% 27 | rvest::html_text() %>% 28 | extract("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z") %>% 29 | unique() %>% 30 | lubridate::as_datetime() %>% 31 | utils::head(1L) 32 | } 33 | 34 | # headline 35 | headline <- html %>% 36 | rvest::html_elements("[property=\"og:title\"]") %>% 37 | rvest::html_attr("content") 38 | 39 | # text 40 | text_temp <- html %>% 41 | rvest::html_elements("[name=\"articleBody\"]") 42 | 43 | if (length(text_temp) > 0) { 44 | text <- text_temp %>% 45 | rvest::html_elements("p") %>% 46 | rvest::html_text2() %>% 47 | paste(collapse = "\n") 48 | } else { 49 | text <- html %>% 50 | rvest::html_elements("p") %>% 51 | rvest::html_text2() %>% 52 | paste(collapse = "\n") 53 | } 54 | 55 | # the helper function safely creates a named list from objects 56 | s_n_list( 57 | datetime, 58 | author, 59 | headline, 60 | text 61 | ) 62 | 63 | } 64 | -------------------------------------------------------------------------------- /R/deliver_nzz_ch.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) 
{
  # Parses one nzz.ch article: reads the raw HTML held in x$content_raw and
  # returns datetime, author, headline and text via s_n_list(). An empty
  # s_n_list() is returned when the expected ld+json metadata is not present.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- html %>%
    rvest::html_elements("script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  # The article metadata is read from the second ld+json script, so at least
  # two scripts are required. With fewer, json_txt[2] is NA and
  # jsonlite::fromJSON() fails instead of yielding an empty result.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2L) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  # text: lead, article components and subtitles, one per line
  text <- html %>%
    rvest::html_elements(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
29 | -------------------------------------------------------------------------------- /R/deliver_orf_at.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...)
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_df <- jsonlite::fromJSON(json_txt[1]) 12 | 13 | datetime <- lubridate::as_datetime(json_df$datePublished) 14 | headline <- json_df$headline 15 | author <- toString(json_df$author$name) 16 | text <- html %>% 17 | rvest::html_elements(".story-lead-text,.story-story p,.story-story h2") %>% 18 | rvest::html_text2() %>% 19 | paste(collapse = "\n") 20 | 21 | s_n_list( 22 | datetime, 23 | author, 24 | headline, 25 | text 26 | ) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /R/deliver_ostsee_zeitung_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ 3 | pb_tick(x, verbose, pb) 4 | # raw html is stored in column content_raw 5 | html <- rvest::read_html(x$content_raw) 6 | 7 | json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() 8 | if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { 9 | return(s_n_list()) 10 | } else { 11 | json_txt <- json_txt[grepl("NewsArticle", json_txt)] 12 | if (length(json_txt) == 0) { 13 | return(s_n_list()) 14 | } 15 | json_df <- jsonlite::fromJSON(json_txt) 16 | 17 | datetime <- lubridate::as_datetime(json_df$datePublished) 18 | headline <- json_df$headline 19 | author <- toString(json_df$author$name) 20 | text <- html %>% 21 | rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% 22 | rvest::html_text2() %>% 23 | paste(collapse = "\n") 24 | 25 | s_n_list( 26 | datetime, 27 | author, 28 | headline, 29 | text 30 | ) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /R/deliver_parlamentnilisty_cz.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.parlamentnilisty_cz <- function(x, verbose = NULL, pb, ...) { 3 | 4 | pb_tick(x, verbose, pb) 5 | # raw html is stored in column content_raw 6 | html <- rvest::read_html(charToRaw(enc2utf8(x$content_raw))) 7 | 8 | # data about the article is nicely stored in a json string 9 | data <- html %>% 10 | rvest::html_elements("[type=\"application/ld+json\"]") %>% 11 | rvest::html_text() %>% 12 | gsub("[\r\n]", "", .) 
%>% # sometimes uses illegal line breaks 13 | lapply(jsonlite::fromJSON, simplifyVector = FALSE) 14 | 15 | # usually there are more than one, 16 | if (length(data) > 0L) { 17 | tp <- purrr::map_chr(data, function(x) 18 | purrr::pluck(x, "@type", .default = NA_character_)) 19 | 20 | data <- purrr::pluck(data, which(tp == "NewsArticle")) 21 | } 22 | 23 | # datetime 24 | datetime <- data$datePublished %>% 25 | lubridate::as_datetime() 26 | 27 | # headline 28 | headline <- data$headline 29 | 30 | # author 31 | author <- purrr::pluck(data$author, "name", .default = NA_character_) %>% 32 | toString() 33 | 34 | # text 35 | text <- html %>% 36 | rvest::html_elements("article .article-content>p,article .brief") %>% 37 | rvest::html_elements(":not(style)") %>% 38 | rvest::html_text2() %>% 39 | paste(collapse = "\n") 40 | 41 | cover_image_url <- purrr::pluck(data, "image", "url", .default = NA_character_) 42 | if (!is.na(cover_image_url)) { 43 | cover_image_url <- gsub("amp;", "", cover_image_url, fixed = TRUE) 44 | } 45 | 46 | s_n_list( 47 | datetime, 48 | author, 49 | headline, 50 | text, 51 | cover_image_url 52 | ) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /R/deliver_presseportal_de.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) 
{
  # Parser for presseportal.de: metadata comes from the second ld+json
  # script tag, the text from unclassed paragraphs inside article.story.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(
    html,
    "script[type = \"application/ld+json\"] "
  )
  json_txt <- rvest::html_text2(ld_nodes)

  # the second entry is needed below, so fewer than two means "nothing found"
  if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(json_txt[2])
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  paragraphs <- rvest::html_elements(html, "article.story p:not([class])")
  text <- paste(rvest::html_text2(paragraphs), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_prosieben_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for prosieben.de. The ld+json entry decides how the page is read:
  #   * FAQPage      -> nothing to extract, an empty result is returned
  #   * VideoObject  -> upload date, name and description are used
  #   * anything else is treated as a NewsArticle
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  # with exactly two entries the second one is used
  if (length(json_txt) == 2) {
    json_txt <- json_txt[2]
  }
  json_df <- jsonlite::fromJSON(json_txt)
  ld_type <- json_df$`@type`

  # FAQ pages carry no article; previously this test was inverted, so video
  # pages were dropped and FAQ pages were parsed as if they were videos
  if (ld_type == "FAQPage") {
    return(s_n_list())
  }

  if (ld_type == "VideoObject") {
    datetime <- lubridate::as_datetime(json_df$uploadDate)
    headline <- json_df$name
    author <- ""
    text <- json_df$description # for video objects, use description as text
  } else { # NewsArticle
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")
  }

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_rbb24_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rbb24_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for rbb24.de: everything is read from the HTML itself.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # the info line holds "dd.mm.yy | HH:MM"; reduce it to "dd.mm.yy HH:MM"
  info_line <- rvest::html_text2(
    rvest::html_elements(html, ".technicalline .lineinfo")
  )
  stamp <- gsub(
    ".*(\\d{2}\\.\\d{2}\\.\\d{2}) \\| (\\d{2}:\\d{2}).*",
    "\\1 \\2",
    info_line
  )
  # This will not be the correct timezone
  datetime <- lubridate::as_datetime(
    stamp,
    format = "%d.%m.%y %H:%M",
    tz = "UTC"
  )

  headline <- rvest::html_text2(rvest::html_elements(html, ".titletext"))

  author <- "" # no article with author info found

  text_nodes <- rvest::html_elements(
    html,
    ".shorttext p, .textblock p, h4.texttitle"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_rnd_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for rnd.de: metadata comes from the third ld+json script tag;
  # text blocks that also appear in related-item lists are removed.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }
  # the third entry is read below
  if (length(json_txt) <= 2) {
    return(s_n_list())
  }
  meta <- jsonlite::fromJSON(json_txt[3])

  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  block_css <- ".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0"
  text <- rvest::html_text2(rvest::html_elements(html, block_css))

  # delete content in lists of related items
  related <- rvest::html_elements(html, "div[data-is-element-rendered='true']")
  more_items <- rvest::html_text2(rvest::html_elements(related, block_css))
  text <- paste(text[!text %in% more_items], collapse = "\n")

  s_n_list(datetime, author, headline, text)
}
--------------------------------------------------------------------------------
/R/deliver_rollingstone_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for rollingstone.de: the first ld+json tag holds an @graph table;
  # Person rows give the author, the NewsArticle row gives date and headline.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  graph <- jsonlite::fromJSON(json_txt[1])$`@graph`

  is_person <- graph$`@type` == "Person"
  author <- if (any(is_person)) toString(graph$name[is_person]) else ""

  article <- graph[grepl("NewsArticle", graph$`@type`), ]
  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline

  text_nodes <- rvest::html_elements(
    html,
    ".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_rp_online_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for rp-online.de: metadata from the first ld+json script tag,
  # text from the intro and the article content paragraphs.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(json_txt[1])
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  text_nodes <- rvest::html_elements(
    html,
    "strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_rte_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rte_ie <- function(x, verbose = NULL, pb, ...)
{
  # Parser for rte.ie: all fields are read from meta tags and page elements.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  published <- rvest::html_element(
    html,
    "[property=\"article:published_time\"]"
  )
  datetime <- lubridate::as_datetime(rvest::html_attr(published, "content"))

  headline <- rvest::html_text2(rvest::html_element(html, "title"))

  author_nodes <- rvest::html_elements(
    html,
    "[itemprop=\"author\"]>[itemprop=\"name\"]"
  )
  author <- toString(rvest::html_attr(author_nodes, "content"))

  text <- paste(
    rvest::html_text2(rvest::html_elements(html, ".article-body p")),
    collapse = "\n"
  )

  type <- rvest::html_attr(
    rvest::html_element(html, "[name=\"article-type\"]"),
    "content"
  )

  # the same image node provides both its markup and its source URL
  cover_node <- rvest::html_element(html, "#main-article-image img")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(cover_node, "src")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url,
    cover_image_html
  )
}

--------------------------------------------------------------------------------
/R/deliver_rtl_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for rtl.de. The second ld+json script tag describes the page:
  #   * VideoGame    -> nothing to extract, an empty result is returned
  #   * VideoObject  -> upload date, name and transcript are used
  #   * anything else is treated as a NewsArticle
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )

  # the second entry is read below; with fewer than two entries json_txt[2]
  # would be NA and jsonlite::fromJSON() would fail, so return empty instead
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])
  if (any(json_df$`@type` %in% c("VideoGame"))) {
    return(s_n_list())
  }

  if (json_df$`@type` != "VideoObject") { # NewsArticle
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- html %>%
      rvest::html_elements(".article-body .LeadText_lead__rfwFU,.article-body .AnnotatedMarkup_paragraph__IUT9l") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")
  } else {
    datetime <- lubridate::as_datetime(json_df$uploadDate)
    headline <- json_df$name
    author <- ""
    text <- json_df$transcript # for video objects, use transcript as text
  }

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_rtl_nl.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.rtl_nl <- function(x, verbose = NULL, pb, ...)
{
  # Parser for rtl.nl: fields are read from meta tags and page elements.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # publication time from the meta tag, falling back to the first <time>
  published <- rvest::html_element(
    html,
    "[property=\"article:published_time\"]"
  )
  datetime <- lubridate::as_datetime(rvest::html_attr(published, "content"))
  if (is.na(datetime)) {
    time_node <- rvest::html_element(html, "time")
    datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))
  }

  headline <- rvest::html_attr(
    rvest::html_element(html, "[property=\"og:title\"]"),
    "content"
  )

  # everything from the first full stop onwards is cut off the author text
  # (would be cleaner to remove the child, but not sure how)
  author_txt <- rvest::html_text2(
    rvest::html_element(html, "[data-testid=\"author\"]")
  )
  author <- gsub("\\..*", "", toString(author_txt))

  text <- paste(
    rvest::html_text2(rvest::html_elements(html, "main p")),
    collapse = "\n"
  )

  # the helper function safely creates a named list from objects
  s_n_list(datetime, author, headline, text)
}

# rtlnieuws.nl pages are handled by the same parser
pb_deliver_paper.rtlnieuws_nl <- pb_deliver_paper.rtl_nl

--------------------------------------------------------------------------------
/R/deliver_ruhr24_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for ruhr24.de: metadata sits under `mainEntity` of the first
  # ld+json script tag; text comes from the story elements.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  entity <- jsonlite::fromJSON(json_txt[1])$mainEntity
  datetime <- lubridate::as_datetime(entity$datePublished)
  headline <- entity$headline
  author <- toString(entity$author$name)

  text_nodes <- rvest::html_elements(
    html,
    ".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_ruhrnachrichten_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for ruhrnachrichten.de: the first ld+json tag holds an @graph
  # table; Person rows give the author, the (News)Article row the metadata.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  graph <- jsonlite::fromJSON(json_txt[1])$`@graph`

  is_person <- graph$`@type` == "Person"
  author <- if (any(is_person)) toString(graph$name[is_person]) else ""

  article <- graph[grepl("NewsArticle|Article", graph$`@type`), ]
  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline

  text_nodes <- rvest::html_elements(
    html,
    "p.article__teaser-text,.article__content p, .article__content h2"
  )
  joined <- paste(rvest::html_text2(text_nodes), collapse = "\n")
  # strip the trailing "Zur Startseite" line
  text <- gsub("\nZur Startseite$", "", joined)

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_saechsische_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for saechsische.de: metadata comes from the third ld+json script
  # tag, text from the styled text and headline blocks.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  # the third entry is read below, so at least three are required; the
  # comparison used to be `>= 2`, which returned early for every page that
  # could have had a third entry and otherwise went on to read a missing one
  if (length(json_txt) <= 2) {
    return(s_n_list())
  }
  json_df <- jsonlite::fromJSON(json_txt[3])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_schwaebische_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.schwaebische_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for schwaebische.de: metadata from the second ld+json script tag,
  # text from the title, paragraph and sub-heading elements.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(
    html,
    "script[type = \"application/ld+json\"] "
  )
  json_txt <- rvest::html_text2(ld_nodes)

  # the second entry is needed below
  if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(json_txt[2])
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  text_nodes <- rvest::html_elements(
    html,
    ".tw-text-title-md, p.paragraph, h2.tw-mb-4"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_seznamzpravy_cz.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.seznamzpravy_cz <- function(x, verbose = NULL, pb, ...)
{
  # Parser for seznamzpravy.cz: metadata from the ld+json data, text from
  # the paragraphs inside <article>.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # every ld+json script tag is parsed
  ld_nodes <- rvest::html_elements(html, "[type=\"application/ld+json\"]")
  data <- lapply(rvest::html_text(ld_nodes), jsonlite::fromJSON)

  # with several entries, keep the one typed as NewsArticle
  if (length(data) > 1L) {
    tp <- purrr::map_chr(data, function(entry) {
      purrr::pluck(entry, "@type", .default = NA_character_)
    })
    data <- purrr::pluck(data, which(tp == "NewsArticle"))
  }

  datetime <- lubridate::as_datetime(data$datePublished)
  headline <- data$headline
  author <- toString(data$author$name)

  text <- paste(
    rvest::html_text2(rvest::html_elements(html, "article p")),
    collapse = "\n"
  )

  cover_image_url <- purrr::pluck(
    data, "image", "url",
    .default = NA_character_
  )

  s_n_list(datetime, author, headline, text, cover_image_url)
}

--------------------------------------------------------------------------------
/R/deliver_sfgate_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.sfgate_com <- function(x, verbose = NULL, pb, ...)
{
  # Parser for sfgate.com: date, headline and author are read from the
  # Sailthru meta tags, the text from every <p> on the page.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # datetime
  datetime <- html %>%
    rvest::html_elements("[name=\"sailthru.date\"]") %>%
    rvest::html_attr("content") %>%
    lubridate::as_datetime()

  # headline: the date and author tags are matched through `name`, so the
  # title tag is matched the same way; the previous `property` selector is
  # kept as an alternative and duplicates are dropped
  headline <- html %>%
    rvest::html_elements(
      "[name=\"sailthru.title\"],[property=\"sailthru.title\"]"
    ) %>%
    rvest::html_attr("content") %>%
    unique()

  # author
  author <- html %>%
    rvest::html_elements("[name=\"sailthru.author\"]") %>%
    rvest::html_attr("content") %>%
    toString()

  # text
  text <- html %>%
    rvest::html_elements("p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_shz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for shz.de: metadata from the first ld+json script tag, text from
  # paragraphs and h4-styled sub-headings.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(json_txt[1])
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  text_nodes <- rvest::html_elements(html, "p.w-600, p,h2.h4")
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_skwawkbox_org.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.skwawkbox_org <- function(x, verbose = NULL, pb, ...)
{
  # Parser for skwawkbox.org: all fields come from the entry markup; the
  # paragraph containing the support appeal is excluded by the selector.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  datetime <- lubridate::as_datetime(
    rvest::html_attr(rvest::html_element(html, ".entry-date"), "datetime")
  )

  headline <- rvest::html_text(rvest::html_element(html, ".entry-title"))

  # the byline text has its literal "by " removed
  byline <- rvest::html_text2(rvest::html_element(html, ".byline"))
  author <- gsub("by ", "", toString(byline), fixed = TRUE)

  paragraphs <- rvest::html_elements(
    html,
    ".entry-content>p:not(:contains('The SKWAWKBOX needs your support'))"
  )
  text <- paste(rvest::html_text2(paragraphs), collapse = "\n")

  # in-text links
  link_nodes <- rvest::html_elements(
    html,
    ".entry-content>p:not(:contains('The SKWAWKBOX needs your support'))>a"
  )
  text_links <- as.list(rvest::html_attr(link_nodes, "href"))

  # the helper function safely creates a named list from objects
  s_n_list(datetime, author, headline, text, text_links)
}

--------------------------------------------------------------------------------
/R/deliver_sky_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.sky_com <- function(x, verbose = NULL, pb, ...)
{
  # Parser for sky.com: metadata from the ld+json data, text from the
  # article body paragraphs.

  # updates progress bar
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_elements(html, "[type=\"application/ld+json\"]")
  data <- lapply(rvest::html_text2(ld_nodes), jsonlite::fromJSON)

  # with several entries, keep the NewsArticle one (NA when there is none)
  if (length(data) > 1L) {
    tp <- purrr::map_chr(data, function(entry) {
      purrr::pluck(entry, "@type", .default = NA_character_)
    })
    data <- purrr::pluck(data, which(tp == "NewsArticle"), .default = NA)
  }

  # no usable entry: empty result
  if (isTRUE(is.na(data))) {
    return(s_n_list())
  }

  datetime <- lubridate::as_datetime(data$datePublished)
  headline <- data$headline
  author <- toString(data$author$name)

  text <- paste(
    rvest::html_text2(rvest::html_elements(html, ".sdc-article-body p")),
    collapse = "\n"
  )

  cover_image_url <- purrr::pluck(data$image, "url", .default = NA)
  type <- data$`@type`

  s_n_list(datetime, author, headline, text, type, cover_image_url)
}

--------------------------------------------------------------------------------
/R/deliver_spiegel_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.spiegel_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for spiegel.de: all fields are read from the page markup.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # html_search() is a package helper; here it is given the "time" selector
  # and the "datetime" attribute
  datetime <- lubridate::as_datetime(
    html_search(html, c("time"), c("datetime"))
  )

  # the headline is stored in the aria-label of the <article> element
  headline <- rvest::html_attr(
    rvest::html_element(html, "article"),
    "aria-label"
  )

  author_meta <- rvest::html_element(html, "meta[name=\"author\"]")
  author <- toString(rvest::html_attr(author_meta, "content"))

  text_nodes <- rvest::html_elements(html, "div[data-area = \"text\"]")
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  # the helper function safely creates a named list from objects
  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_srf_ch.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.srf_ch <- function(x, verbose = NULL, pb, ...)
{
  # Parser for srf.ch: the publication time is taken from a JSON attribute
  # on the config span, headline and text from the article markup.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  config_json <- rvest::html_attr(
    rvest::html_element(html, "span#config__js"),
    "data-analytics-webtrekk-survey-gizmo-value-object"
  )
  config <- jsonlite::fromJSON(config_json)
  datetime <- lubridate::as_datetime(
    config$params$content_publication_datetime
  )

  headline <- rvest::html_text(
    rvest::html_elements(html, "h1 .article-title__text")
  )

  author <- "" # no article with author info found

  text_nodes <- rvest::html_elements(
    html,
    ".article-content p, .article-content h2"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_stern_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for stern.de: metadata from the first row of the parsed ld+json
  # data, text from the intro and text elements.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  parsed <- jsonlite::fromJSON(json_txt)
  meta <- parsed[1, ]

  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  text_nodes <- rvest::html_elements(html, ".intro,.text-element")
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_stuttgarter_zeitung_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for stuttgarter-zeitung.de: metadata from the first ld+json
  # script tag; two fixed subscription/advertising lines are removed from
  # the text.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  meta <- jsonlite::fromJSON(json_txt[1])
  datetime <- lubridate::as_datetime(meta$datePublished)
  headline <- meta$headline
  author <- toString(meta$author$name)

  text_nodes <- rvest::html_elements(
    html,
    ".brick.intro-text p,.brickgroup p,.brickgroup h2"
  )
  blocks <- rvest::html_text2(text_nodes)

  # blocks consisting of exactly these strings are dropped
  rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung")
  text <- paste(blocks[!blocks %in% rm_text], collapse = "\n")

  s_n_list(datetime, author, headline, text)
}
# rss feed includes pages that cannot be parsed because they are subpages
# rss feed also includes podcast, which cannot be parsed

--------------------------------------------------------------------------------
/R/deliver_sueddeutsche_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for sueddeutsche.de: every field, including the article text,
  # is taken from the first ld+json script tag.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])
  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- json_df$articleBody

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_suedkurier_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for suedkurier.de: date and author from the first ld+json script
  # tag, headline and text from the page markup.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])
  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- html %>%
    rvest::html_element("header h1") %>%
    rvest::html_text()

  # The author names are passed through the HTML parser and read back as
  # text. read_html() only treats a string as literal markup when it
  # contains "<" or ">"; otherwise it is interpreted as a path, so the names
  # must be wrapped in a tag. In this copy of the file the wrapper strings
  # had been reduced to blank lines.
  # NOTE(review): <p> is assumed to be the tag that was lost - confirm
  # against the upstream file.
  author <- paste0("<p>", json_df$author$name, "</p>", collapse = ",") %>%
    rvest::read_html() %>%
    rvest::html_text() %>%
    toString()

  text <- html %>%
    rvest::html_elements(".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_swp_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) {
  # Parser for swp.de: metadata from the second ld+json script tag, text
  # from the header teaser, paragraphs and sub-headlines.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_text(
    rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  )

  # the second entry is read below; with fewer than two entries json_txt[2]
  # would be NA and jsonlite::fromJSON() would fail, so return empty instead
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])
  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_swr3_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.swr3_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for swr3.de: all fields are read from the page markup.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  time_nodes <- rvest::html_elements(html, ".meta-top time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))

  headline <- rvest::html_text(rvest::html_elements(html, "h1.headline"))

  author_nodes <- rvest::html_elements(html, ".meta-top .meta-author-name a")
  author <- toString(rvest::html_text2(author_nodes))

  text_nodes <- rvest::html_elements(html, "p.lead, .bodytext p, .bodytext h2")
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_swr_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.swr_de <- function(x, verbose = NULL, pb, ...)
{
  # Parser for swr.de: all fields are read from the page markup; date and
  # headline use the first matching element only.
  pb_tick(x, verbose, pb)

  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  time_node <- rvest::html_element(html, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  headline <- rvest::html_text(rvest::html_element(html, "h1.headline"))

  author_nodes <- rvest::html_elements(
    html,
    ".meta-top .meta-authors .meta-author-name a"
  )
  author <- toString(rvest::html_text2(author_nodes))

  text_nodes <- rvest::html_elements(
    html,
    ".detail-body .lead, .bodytext p, .bodytext h2"
  )
  text <- paste(rvest::html_text2(text_nodes), collapse = "\n")

  s_n_list(datetime, author, headline, text)
}

--------------------------------------------------------------------------------
/R/deliver_swrfernsehen_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.swrfernsehen_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time(s): "datetime" attribute of the <time> elements in the
  # article meta description.
  time_nodes <- rvest::html_elements(page, ".meta-top .meta-description time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))

  # Headline: text of all h1.headline elements.
  headline <- rvest::html_text(rvest::html_elements(page, "h1.headline"))

  # Authors: every linked author name, joined with ", ".
  author_nodes <- rvest::html_elements(page, ".meta-top .meta-author-name a")
  author <- toString(rvest::html_text2(author_nodes))

  # Article text: lead, paragraphs and subheadings, one per line.
  body_nodes <- rvest::html_elements(
    page,
    ".detail-body .lead,.bodytext p,.bodytext h2"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_t3n_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # All fields come from the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # Normalise Windows line breaks, then drop everything in square brackets.
  text <- gsub("\\[.*?\\]", "", gsub("\r\n", "\n", article$articleBody))

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_t_online_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # The metadata is taken from the first row of the "@graph" entry of the
  # first JSON-LD block.
  graph <- jsonlite::fromJSON(json_txt[1])$`@graph`
  article <- graph[1, ]

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author[[1]]$name)

  # Article text: paragraphs of the article body container, one per line.
  body_nodes <- rvest::html_elements(
    page,
    "div[data-testid=\"ArticleBody.StreamLayout\"] p"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_tag24_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # All fields, including the article text, come from the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)
  text <- article$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_tagesschau_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # All fields come from the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # The article body may contain markup: strip tags, then trim whitespace.
  text <- trimws(gsub("<[^>]+>", "", article$articleBody))

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_tagesspiegel_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # Metadata comes from the first JSON-LD block ...
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # ... while the text is collected from the page itself, one paragraph per
  # line.
  body_nodes <- rvest::html_elements(page, "#story-elements p")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_taz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.taz_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text2(ld_nodes)

  # The article is read from the third JSON-LD block, so at least three blocks
  # are required.
  if (length(json_txt) <= 2 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(json_txt[3])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)
  text <- article$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_techrepublic_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.techrepublic_com <- function(x, verbose = NULL, pb, ...) {

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time: "content" attribute of the article:published_time tag.
  published_node <- rvest::html_element(
    page,
    "[property=\"article:published_time\"]"
  )
  datetime <- lubridate::as_datetime(rvest::html_attr(published_node, "content"))

  # Headline: "content" attribute of the og:title tag.
  title_node <- rvest::html_element(page, "[property=\"og:title\"]")
  headline <- rvest::html_attr(title_node, "content")

  # Author: "content" attribute of the author tag.
  author_node <- rvest::html_element(page, "[name=\"author\"]")
  author <- toString(rvest::html_attr(author_node, "content"))

  # Article text: summary and section elements, one per line.
  body_nodes <- rvest::html_elements(page, ".article-summary,section")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text
  )

}
--------------------------------------------------------------------------------
/R/deliver_telegraaf_nl.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.telegraaf_nl <- function(x, verbose = NULL, pb, ...) {

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Article metadata is embedded as JSON in the PageTracking node. Only the
  # first match is used; when the node is missing, return the empty result
  # (as the other JSON based parsers do) instead of handing empty input to
  # the JSON parser.
  tracking_json <- html %>%
    rvest::html_element("[data-name=\"PageTracking\"]") %>%
    rvest::html_text2()

  if (isTRUE(is.na(tracking_json))) {
    return(s_n_list())
  }

  data <- jsonlite::fromJSON(tracking_json)

  type <- purrr::pluck(data, "article", "type")
  paywall <- purrr::pluck(data, "article", "premium")

  # datetime
  datetime <- purrr::pluck(data, "article", "publishDate") %>%
    lubridate::as_datetime()

  # headline
  headline <- purrr::pluck(data, "article", "title")

  # author
  author <- purrr::pluck(data, "article", "author", .default = NA_character_)

  # text: only "normal" articles are scraped; for every other type the type
  # itself is stored in square brackets. identical() keeps the condition a
  # single TRUE/FALSE even when the type is missing from the metadata.
  if (identical(type, "normal")) {
    text <- html %>%
      rvest::html_elements(".Article__intro,.DetailBodyBlocks p") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")
  } else {
    text <- paste0("[", type, "]")
  }

  # cover image: query the node once and derive both the markup and the URL
  cover_image_node <- rvest::html_element(html, ".DetailArticleImage img")
  cover_image_html <- as.character(cover_image_node)
  cover_image_url <- rvest::html_attr(cover_image_node, "src")

  # the src attribute is prefixed with the site's host to get a full URL
  if (!is.na(cover_image_url)) {
    cover_image_url <- paste0("https://www.telegraaf.nl", cover_image_url)
  }

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    paywall,
    cover_image_url,
    cover_image_html
  )

}
--------------------------------------------------------------------------------
/R/deliver_telegraph_co_uk.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.telegraph_co_uk <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time: "content" or "datetime" attribute of the datePublished
  # element, parsed with a fixed format; only the first value is kept.
  published <- html_search(
    page,
    "[itemprop=\"datePublished\"]",
    c("content", "datetime")
  )
  datetime <- utils::head(
    as.POSIXct(published, format = "%Y-%m-%dT%H:%M%z"),
    1L
  )

  # Headline: "content" attribute of the og:title tags.
  title_nodes <- rvest::html_elements(page, "[property=\"og:title\"]")
  headline <- rvest::html_attr(title_nodes, "content")

  # Author: "content" attribute of the byline elements, joined with ", " and
  # with a leading "By " removed.
  byline_nodes <- rvest::html_elements(page, "[class*=\"byline__author\"]")
  author <- gsub("^By\\s", "", toString(rvest::html_attr(byline_nodes, "content")))

  # Article text: body text elements, one per line.
  body_nodes <- rvest::html_elements(page, "[class*=\"article-body-text\"]")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  # Content type: "content" attribute of the og:type tag.
  type_node <- rvest::html_element(page, "[property=\"og:type\"]")
  content_type <- rvest::html_attr(type_node, "content")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    content_type
  )
}
--------------------------------------------------------------------------------
/R/deliver_thecanary_co.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.thecanary_co <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time: "datetime" attribute of the first <time> element.
  time_node <- rvest::html_element(page, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  # Headline: text of the first .entry-title element.
  headline <- rvest::html_text(rvest::html_element(page, ".entry-title"))

  # Author: text of the first .author element.
  author <- toString(rvest::html_text2(rvest::html_element(page, ".author")))

  # Article text: direct child paragraphs of the entry content, one per line.
  paragraph_nodes <- rvest::html_elements(page, ".entry-content>p")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  # In-text links: href of every link directly inside those paragraphs.
  link_nodes <- rvest::html_elements(page, ".entry-content>p>a")
  text_links <- as.list(rvest::html_attr(link_nodes, "href"))

  # the helper function safely creates a named list from objects
  s_n_list(
    datetime,
    author,
    headline,
    text,
    text_links
  )

}
--------------------------------------------------------------------------------
/R/deliver_thejournal_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.thejournal_ie <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time: "content" attribute of the article:post_date tag.
  date_node <- rvest::html_element(page, "[property=\"article:post_date\"]")
  datetime <- lubridate::as_datetime(rvest::html_attr(date_node, "content"))

  # Headline: text of the page <title>.
  headline <- rvest::html_text2(rvest::html_element(page, "title"))

  # Authors: "content" attribute of every article:author tag, joined with ", ".
  author_nodes <- rvest::html_elements(page, "[property=\"article:author\"]")
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # Article text: body paragraphs except those carrying the
  # article-updated-redesign class, one per line.
  body_nodes <- rvest::html_elements(
    page,
    "[itemprop=\"articleBody\"] p:not(.article-updated-redesign)"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  # Cover image: markup of the primary image and its srcset attribute.
  cover_node <- rvest::html_element(page, ".article-primary-img-redesign")
  cover_image_html <- as.character(cover_node)
  cover_image_url <- rvest::html_attr(
    rvest::html_element(page, ".article-primary-img-redesign"),
    "srcset"
  )

  s_n_list(
    datetime,
    author,
    headline,
    text,
    cover_image_url,
    cover_image_html
  )

}
--------------------------------------------------------------------------------
/R/deliver_thesun_ie.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.thesun_ie <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of the first JSON-LD element on the page (NA when there is none).
  ld_node <- rvest::html_element(page, "[type=\"application/ld+json\"]")
  raw_json <- rvest::html_text2(ld_node)

  # Without JSON-LD there is nothing to parse.
  if (isTRUE(is.na(raw_json))) {
    return(s_n_list())
  }

  data <- jsonlite::fromJSON(raw_json)

  # Metadata from the JSON-LD block.
  datetime <- lubridate::as_datetime(data$datePublished)
  headline <- data$headline
  author <- toString(data$author$name)

  # Article text: paragraphs inside <article>, one per line.
  paragraph_nodes <- rvest::html_elements(page, "article p")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  # Only the first image URL is kept as cover image.
  cover_image_url <- utils::head(data$image$url, 1L)

  type <- data$`@type`

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )

}
--------------------------------------------------------------------------------
/R/deliver_thueringer_allgemeine_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # The article metadata is read from the SECOND JSON-LD block, so at least
  # two blocks are required: with a single block json_txt[2] is NA and cannot
  # be parsed. Return the empty result in that case.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  # Article text: paragraphs and subheadings of the article body, one per
  # line.
  text <- html %>%
    rvest::html_elements(".article-body p, .article-body h3") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_tz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # The metadata sits in the mainEntity entry of the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])$mainEntity

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # Article text: lead text, crossheads and paragraphs, one per line.
  body_nodes <- rvest::html_elements(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_vice_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # The article metadata is read from the SECOND JSON-LD block, so at least
  # two blocks are required: with a single block json_txt[2] is NA and cannot
  # be parsed. Return the empty result in that case.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  # Article text: paragraphs and subheadings of the entry content, one per
  # line. The h2 part previously read ".entry-content entry-content h2"
  # (missing dot), which looked for an <entry-content> element and therefore
  # never matched any subheading.
  text <- html %>%
    rvest::html_elements(
      ".entry-content.entry-content p,.entry-content.entry-content h2"
    ) %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_volksstimme_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    # Metadata comes from the first JSON-LD block.
    json_df <- jsonlite::fromJSON(json_txt[1])

    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    # Article text: excerpt, paragraphs and subheadings, one per line.
    text <- html %>%
      rvest::html_elements(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_vox_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    json_df <- jsonlite::fromJSON(json_txt[1])
    # When the block holds several typed entries, keep the "Article" one.
    if (length(json_df$`@type`) > 1) {
      json_df <- json_df[json_df$`@type` == "Article", ]
    }
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    text <- json_df$articleBody

    # For the generic "VOX Online" byline the text might have the author
    # abbreviation in parentheses at its very end. Only take it over when the
    # text really ends that way: sub() returns its input unchanged when the
    # pattern does not match, which would otherwise copy the whole article
    # text into the author field. The length/NA checks keep the condition
    # valid when the article body is missing.
    abbr_pattern <- ".*\\(([^)]+)\\)$"
    has_single_text <- length(text) == 1L && !is.na(text)
    if (identical(author, "VOX Online") && has_single_text &&
        grepl(abbr_pattern, text)) {
      author <- sub(abbr_pattern, "\\1", text)
    }
    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_wa_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  } else {
    # The metadata sits in the mainEntity entry of the first JSON-LD block.
    json_df <- jsonlite::fromJSON(json_txt[1])
    json_df <- json_df$mainEntity
    datetime <- lubridate::as_datetime(json_df$datePublished)
    headline <- json_df$headline
    author <- toString(json_df$author$name)
    # Article text: lead text, paragraphs and crossheads, one per line.
    text <- html %>%
      rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    s_n_list(
      datetime,
      author,
      headline,
      text
    )
  }
}
--------------------------------------------------------------------------------
/R/deliver_watson_ch.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # The article is read from the SECOND JSON-LD block, so at least two blocks
  # are required: with a single block json_txt[2] is NA and cannot be parsed.
  # Return the empty result in that case.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- json_df$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_watson_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # The article is read from the SECOND JSON-LD block, so at least two blocks
  # are required: with a single block json_txt[2] is NA and cannot be parsed.
  # Return the empty result in that case.
  if (isTRUE(is.na(json_txt)) || length(json_txt) < 2) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- json_df$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_waz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # The article is read from the second JSON-LD block; pages with fewer than
  # two blocks yield the empty result.
  if (length(json_txt) < 2) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # Article text: paragraphs and subheadings of the article body, one per
  # line.
  body_nodes <- rvest::html_elements(page, ".article-body p,.article-body h3")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_wdr_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # careful: json can have many objects but the first seems to be the article
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  article <- jsonlite::fromJSON(json_txt[1])

  # The published date is missing its seconds: insert ":00" between the
  # hour:minute part and the UTC offset before parsing.
  published <- sub(
    "(\\d{2}:\\d{2})(\\+\\d{2}:\\d{2})",
    "\\1:00\\2",
    article$datePublished
  )
  datetime <- lubridate::as_datetime(published)
  headline <- article$headline

  # Slashes in the author string are turned into commas.
  author <- gsub("/", ",", toString(article$author$name))

  # Article text: introduction, text and subtitle elements, one per line.
  body_nodes <- rvest::html_elements(page, ".einleitung,.text,.subtitle")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
# rss feed contains also overviews of articles which make the parser fail
--------------------------------------------------------------------------------
/R/deliver_welt_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) {
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # All fields come from the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # The article body may contain markup: strip tags, then trim whitespace.
  text <- trimws(gsub("<[^>]+>", "", article$articleBody))

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_wiwo_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text2()

  # Without a JSON-LD block the article is paywalled and not scrapeable.
  # (This single guard replaces a second, nested length check whose else
  # branch could never be reached.)
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # The author is read from the "creator" field.
  author <- toString(json_df$creator)

  # Article text: lead text, subheadings and paragraphs, one per line.
  text <- html %>%
    rvest::html_elements(".c-leadtext,.u-richtext h3,.u-richtext p") %>%
    rvest::html_text2() %>%
    .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_wsj_com.R:
--------------------------------------------------------------------------------
#' @export

pb_deliver_paper.wsj_com <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Publication time: "content" attribute of the article.published tags; only
  # the first value is kept.
  published_nodes <- rvest::html_elements(page, "[name=\"article.published\"]")
  published <- lubridate::as_datetime(rvest::html_attr(published_nodes, "content"))
  datetime <- utils::head(published, 1L)

  # Headline: text of all <title> elements, one per line.
  title_nodes <- rvest::html_elements(page, "title")
  headline <- paste(rvest::html_text(title_nodes), collapse = "\n")

  # Authors: "content" attribute of the author tags, joined with ", ".
  author_nodes <- rvest::html_elements(page, "[name=\"author\"]")
  author <- toString(rvest::html_attr(author_nodes, "content"))

  # Article text: every paragraph not excluded by the footer id selector, one
  # per line.
  paragraph_nodes <- rvest::html_elements(page, "p:not([id|=\"footer\"])")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )

}
--------------------------------------------------------------------------------
/R/deliver_wz_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.wz_de <- function(x, verbose = NULL, pb, ...)
{
  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of every JSON-LD <script> block on the page.
  ld_nodes <- rvest::html_elements(page, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text2(ld_nodes)

  # Without usable JSON-LD there is nothing to parse.
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  # Metadata comes from the first JSON-LD block.
  article <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(article$datePublished)
  headline <- article$headline
  author <- toString(article$author$name)

  # Article text: rich-text paragraphs and subheadings, one per line.
  body_nodes <- rvest::html_elements(
    page,
    "article p.richtext,article h2.font-sans"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
--------------------------------------------------------------------------------
/R/deliver_yahoo_com.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.yahoo_com <- function(x, verbose = NULL, pb, ...)
{

  # Update the progress bar for this article.
  pb_tick(x, verbose, pb)

  # The raw page source is stored in column content_raw.
  page <- rvest::read_html(x$content_raw)

  # Text of the first JSON-LD element inside <article> (NA when there is
  # none).
  ld_node <- rvest::html_element(page, "article [type=\"application/ld+json\"]")
  raw_json <- rvest::html_text2(ld_node)

  # Without JSON-LD there is nothing to parse.
  if (isTRUE(is.na(raw_json))) {
    return(s_n_list())
  }

  data <- jsonlite::fromJSON(raw_json)

  # Metadata from the JSON-LD block.
  datetime <- lubridate::as_datetime(data$datePublished)
  headline <- data$headline
  author <- toString(data$author$name)

  # Article text: paragraphs inside <article>, one per line.
  paragraph_nodes <- rvest::html_elements(page, "article p")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  # Only the first image URL is kept as cover image.
  cover_image_url <- utils::head(data$image$url, 1L)

  type <- purrr::pluck(data, "@type")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    type,
    cover_image_url
  )

}
--------------------------------------------------------------------------------
/R/deliver_zeit_de.R:
--------------------------------------------------------------------------------
#' @export
pb_deliver_paper.zeit_de <- function(x, verbose = NULL, pb, ...)
{ 3 | 4 | # updates progress bar 5 | pb_tick(x, verbose, pb) 6 | 7 | # raw html is stored in column content_raw 8 | html <- rvest::read_html(x$content_raw) 9 | 10 | # datetime 11 | datetime <- html %>% 12 | html_search(selectors = c( 13 | ".metadata__date>time", 14 | "meta[name=\"date\"]" 15 | ), attributes = c( 16 | "datetime", "content" 17 | )) %>% 18 | lubridate::as_datetime() 19 | 20 | # headline 21 | headline <- html %>% 22 | rvest::html_element("[property=\"og:title\"]") %>% 23 | rvest::html_attr("content") 24 | 25 | # author 26 | author <- html %>% 27 | rvest::html_element("[rel=\"author\"],.metadata__source") %>% 28 | rvest::html_text2() %>% 29 | toString() 30 | 31 | # text 32 | text <- html %>% 33 | rvest::html_elements(".article-body p") %>% 34 | rvest::html_text2() %>% 35 | paste(collapse = "\n") 36 | 37 | # the helper function safely creates a named list from objects 38 | s_n_list( 39 | datetime, 40 | author, 41 | headline, 42 | text 43 | ) 44 | 45 | } 46 | -------------------------------------------------------------------------------- /R/html_search.R: -------------------------------------------------------------------------------- 1 | #' Search raw html for attributes 2 | #' 3 | #' @param html raw html 4 | #' @param selectors a vector of CSS selectors to include in search. 5 | #' @param attributes attributes to extract. If NULL, returns text. 6 | #' @param all if TRUE, all selectors are collected. Otherwise, only the first 7 | #' non-empty result is used. 
#' @param n if multiple are found, how many to return
#'
#' @return a vector of max length n, or \code{NA_character_} if nothing was
#'   found
#' @keywords internal
html_search <- function(html,
                        selectors,
                        attributes = NULL,
                        all = TRUE,
                        n = 1L) {

  if (all) {
    # a single combined query: all selectors are searched at once
    res <- rvest::html_elements(html, paste0(selectors, collapse = ","))
  } else {
    # try the selectors in the order given and keep the first one that
    # matches anything; every selector (including the last one) is tried
    res <- NULL
    for (selector in selectors) {
      res <- rvest::html_elements(html, selector)
      if (length(res) > 0L) break
    }
  }

  # nothing matched (or no selectors supplied): there is nothing to extract
  if (length(res) < 1L) {
    return(NA_character_)
  }

  # "text" is a pseudo-attribute requesting the element text; as documented,
  # text is also what is returned when no attributes are requested at all
  want_text <- is.null(attributes) || "text" %in% attributes
  attributes <- setdiff(attributes, "text")

  # flatten the attributes of all matched elements into one named vector and
  # keep only the requested ones
  attrs <- unlist(rvest::html_attrs(res), recursive = FALSE)
  out <- unname(attrs[names(attrs) %in% attributes])

  if (want_text) out <- c(out, rvest::html_text2(res))

  # elements were found but none carried a requested attribute
  if (length(out) < 1L) {
    return(NA_character_)
  }

  utils::head(out, n)
}
--------------------------------------------------------------------------------
/R/inspect.R:
--------------------------------------------------------------------------------
#' Inspect content collected with pb_collect
#'
#' Opens a browser to display the content saved in a row of a data.frame created
#' with \link{pb_collect}.
#'
#' @param x a data.frame returned by \link{pb_collect}.
#' @param i which entry to display.
#' @param host_ip,port host IP and port to create the temporary web server that
#'   shows the content.
#'
#' @export
pb_inspect <- function(x,
                       i = 1L,
                       host_ip = "127.0.0.1",
                       port = httpuv::randomPort()) {

  # httpuv is only needed here, so it is checked at call time
  rlang::check_installed("httpuv")

  if (!"content_raw" %in% names(x)) {
    stop("Only works with output from pb_collect()")
  }

  # shut down the server left over from a previous call, if any
  previous <- paperboy.env$server
  if (!is.null(previous)) previous$stop()

  entry <- x$content_raw[i]

  if (grepl("<|>", entry)) {
    # the entry contains markup: serve it from a temporary local web server
    serve_entry <- function(req) {
      list(
        status = 200L,
        headers = list("Content-Type" = "text/html"),
        body = entry
      )
    }
    paperboy.env$server <- httpuv::startServer(
      host = host_ip,
      port = port,
      app = list(call = serve_entry)
    )
    target <- paste0("http://", host_ip, ":", port)
  } else {
    # no markup: hand the stored value to the browser as is
    target <- entry
  }

  utils::browseURL(target)
}
--------------------------------------------------------------------------------
/R/read_cookies.R:
--------------------------------------------------------------------------------
#' Read in cookie file
#'
#' Deprecated in favour of \link[cookiemonster]{add_cookies}.
#'
#' @param ... not used.
#' @export
pb_read_cookies <- function(...) {
  # kept only to point users to the replacement
  .Deprecated(msg = "this functionality has been moved to the cookiemonster package. See `?cookiemonster::add_cookies`")
}
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JBGruber/paperboy/1f9dc4e64faae5f656a2bebac12c28465310f406/R/sysdata.rda
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
# set the default verbosity on load unless the user already chose one
.onLoad <- function(libname, pkgname) {
  if (is.null(getOption("paperboy_verbose"))) {
    options(paperboy_verbose = TRUE)
  }
}
# package-level environment; pb_inspect() stores its running server here
paperboy.env <- new.env()
.
<- NULL 7 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Manual", 2 | title = "paperboy. A comprehensive collection of news media scrapers", 3 | author = as.person("Johannes Gruber"), 4 | year = format(Sys.Date(), "%Y"), 5 | url = "https://github.com/JBGruber/paperboy", 6 | note = "R package version 0.0.5.9000" 7 | ) 8 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | CMD 2 | Codecov 3 | Datenschutzerklärung 4 | Guide’s 5 | Lifecycle 6 | Nutzungsbedingungen 7 | POSIXct 8 | ac 9 | aktualne 10 | anotherangryvoice 11 | bbc 12 | blesk 13 | blogspot 14 | boston 15 | bostonglobe 16 | breitbart 17 | buzzfeed 18 | cbsileads 19 | cbslnk 20 | cbsnews 21 | ceskatelevize 22 | cnet 23 | cnn 24 | com 25 | csv 26 | cz 27 | dailymail 28 | datetime 29 | denikn 30 | doctype 31 | eu 32 | evolvepolitics 33 | faz 34 | feedly 35 | forbes 36 | foxbusiness 37 | foxnews 38 | ftw 39 | geenstijl 40 | hn 41 | huffingtonpost 42 | huffpost 43 | idnes 44 | irozhlas 45 | itemscope 46 | latimes 47 | lidovky 48 | lnk 49 | marketwatch 50 | mediacloud 51 | mediacourant 52 | metronieuws 53 | msnbc 54 | newsweek 55 | nl 56 | nos 57 | novinky 58 | nrc 59 | nypost 60 | nytimes 61 | org 62 | pagesix 63 | parlamentnilisty 64 | pb 65 | seznamzpravy 66 | sfgate 67 | skwawkbox 68 | stri 69 | stringi 70 | techrepublic 71 | 
telegraaf 72 | thecanary 73 | theguardian 74 | thelily 75 | thismorningwithgordondeal 76 | tibble 77 | tribpub 78 | uk 79 | un 80 | urls 81 | usatoday 82 | volkskrant 83 | washingtonpost 84 | webscraper 85 | webscraping 86 | wsj 87 | ’A 88 | -------------------------------------------------------------------------------- /inst/templates/deliver_.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pb_deliver_paper.{{newspaper}} <- function(x, verbose = NULL, pb, ...) { 3 | 4 | # updates progress bar 5 | pb_tick(x, verbose, pb) 6 | 7 | # raw html is stored in column content_raw 8 | html <- rvest::read_html(x$content_raw) 9 | 10 | # datetime 11 | datetime <- html %>% 12 | rvest::html_element("") %>% 13 | rvest::html_attr("") %>% 14 | lubridate::as_datetime() 15 | 16 | # headline 17 | headline <- html %>% 18 | rvest::html_element("") %>% 19 | rvest::html_attr("") 20 | 21 | # author 22 | author <- html %>% 23 | rvest::html_element("") %>% 24 | rvest::html_text2() %>% 25 | toString() 26 | 27 | # text 28 | text <- html %>% 29 | rvest::html_elements("") %>% 30 | rvest::html_text2() %>% 31 | paste(collapse = "\n") 32 | 33 | # the helper function safely creates a named list from objects 34 | s_n_list( 35 | datetime, 36 | author, 37 | headline, 38 | text 39 | ) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/html_search.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/html_search.R 3 | \name{html_search} 4 | \alias{html_search} 5 | \title{Search raw html for attributes} 6 | \usage{ 7 | html_search(html, selectors, attributes = NULL, all = TRUE, n = 1L) 8 | } 9 | \arguments{ 10 | \item{html}{raw html} 11 | 12 | \item{selectors}{a vector of CSS selectors to include in search.} 13 | 14 | \item{attributes}{attributes to extract. 
If NULL, returns text.} 15 | 16 | \item{all}{if TRUE, all selectors are collected. Otherwise, only the first 17 | non-empty result is used.} 18 | 19 | \item{n}{if multiple are found, how many to return} 20 | } 21 | \value{ 22 | a vector of max length n 23 | } 24 | \description{ 25 | Search raw html for attributes 26 | } 27 | \keyword{internal} 28 | -------------------------------------------------------------------------------- /man/pb_available.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{pb_available} 4 | \alias{pb_available} 5 | \title{Show available parsers} 6 | \usage{ 7 | pb_available(...) 8 | } 9 | \arguments{ 10 | \item{...}{optionally pass URLs to check if respective parser(s) is/are available.} 11 | } 12 | \value{ 13 | A character vector of supported domains. 14 | } 15 | \description{ 16 | Show available parsers 17 | } 18 | \examples{ 19 | pb_available() 20 | pb_available("https://edition.cnn.com/", 21 | "https://www.nytimes.com/", 22 | "https://www.google.com/") 23 | } 24 | -------------------------------------------------------------------------------- /man/pb_collect_rss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rss.r 3 | \name{pb_collect_rss} 4 | \alias{pb_collect_rss} 5 | \title{Collect RSS feed} 6 | \usage{ 7 | pb_collect_rss(x, parse = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{x}{URL(s) to RSS or Atom feed(s).} 11 | 12 | \item{parse}{Whether the results should be parsed into a data.frame. 
Turn off for debugging.} 13 | 14 | \item{...}{passed to pb_collect.} 15 | } 16 | \value{ 17 | a data.frame or list 18 | } 19 | \description{ 20 | Collect articles from RSS or Atom feed(s) 21 | } 22 | \examples{ 23 | \dontrun{ 24 | pb_collect_rss("https://www.washingtonpost.com/arcio/rss/") 25 | # works with atom feeds too 26 | pb_collect_rss("https://www.nu.nl/rss") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/pb_deliver.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deliver.R 3 | \name{pb_deliver} 4 | \alias{pb_deliver} 5 | \title{Deliver online news articles} 6 | \usage{ 7 | pb_deliver(x, try_default = TRUE, ignore_fails = FALSE, verbose = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Either a vector of URLs or a data.frame returned by 11 | \link{pb_collect}.} 12 | 13 | \item{try_default}{if no parser is available, should a generic parser be used 14 | \code{TRUE} or should the URL be skipped \code{FALSE}?} 15 | 16 | \item{ignore_fails}{normally the function errors if raw content for a URL can't 17 | be parsed. Setting to \code{TRUE} ignores all parsing errors (use with 18 | caution).} 19 | 20 | \item{verbose}{\code{FALSE} turns deliver silent. \code{TRUE} prints status 21 | messages and a progress bar on the screen. \code{2L} turns on debug mode. 22 | If \code{NULL} will be determined from 23 | \code{getOption("paperboy_verbose")}.} 24 | 25 | \item{...}{Passed on to \link{pb_collect}.} 26 | } 27 | \value{ 28 | A data.frame (tibble) with media data and full text. 29 | } 30 | \description{ 31 | This function will determine the website of the urls given to it 32 | and call the appropriate webscraper.
33 | } 34 | -------------------------------------------------------------------------------- /man/pb_deliver_paper.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deliver.R 3 | \name{pb_deliver_paper} 4 | \alias{pb_deliver_paper} 5 | \title{internal function to deliver specific newspapers} 6 | \usage{ 7 | pb_deliver_paper(x, verbose, pb, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Either a vector of URLs or a data.frame returned by 11 | \link{pb_collect}.} 12 | 13 | \item{verbose}{\code{FALSE} turns deliver silent. \code{TRUE} prints status 14 | messages and a progress bar on the screen. \code{2L} turns on debug mode. 15 | If \code{NULL} will be determined from 16 | \code{getOption("paperboy_verbose")}.} 17 | 18 | \item{pb}{a progress bar object.} 19 | 20 | \item{...}{Passed on to \link{pb_collect}.} 21 | } 22 | \description{ 23 | internal function to deliver specific newspapers 24 | } 25 | \keyword{internal} 26 | -------------------------------------------------------------------------------- /man/pb_find_rss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rss.r 3 | \name{pb_find_rss} 4 | \alias{pb_find_rss} 5 | \title{Find RSS feed on a newspapers website} 6 | \usage{ 7 | pb_find_rss(x, use = c("main", "suffixes", "feedly")) 8 | } 9 | \arguments{ 10 | \item{x}{main domain of the newspaper site to check for RSS feeds.} 11 | 12 | \item{use}{which steps to include in the search (see Details). 
Default is to 13 | include all.} 14 | } 15 | \value{ 16 | A URL to the RSS feed(s) or NULL if nothing is found 17 | } 18 | \description{ 19 | Find RSS feed on a newspapers website 20 | } 21 | \details{ 22 | Uses a three step heuristic to find RSS feeds: 23 | \enumerate{ 24 | \item Scrapes the main page (without any paths) to see if the RSS feed is 25 | advertised 26 | \item Checks a number of common paths where sites put their RSS feeds 27 | \item Queries the \href{https://feedly.com/}{feedly.com} API for feeds associated 28 | with a page 29 | } 30 | } 31 | \examples{ 32 | pb_find_rss("https://www.buzzfeed.com/") 33 | } 34 | \references{ 35 | Approach inspired by \url{https://github.com/mediacloud/feed_seeker} 36 | } 37 | -------------------------------------------------------------------------------- /man/pb_inspect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inspect.R 3 | \name{pb_inspect} 4 | \alias{pb_inspect} 5 | \title{Inspect content collected with pb_collect} 6 | \usage{ 7 | pb_inspect(x, i = 1L, host_ip = "127.0.0.1", port = httpuv::randomPort()) 8 | } 9 | \arguments{ 10 | \item{x}{a data.frame returned by \link{pb_collect}.} 11 | 12 | \item{i}{which entry to display.} 13 | 14 | \item{host_ip, port}{host IP and port to create the temporary web server that 15 | shows the content.} 16 | } 17 | \description{ 18 | Opens a browser to display the content saved in a row of a data.frame created 19 | with \link{pb_collect}.
20 | } 21 | -------------------------------------------------------------------------------- /man/pb_new.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils_dev.R 3 | \name{pb_new} 4 | \alias{pb_new} 5 | \title{Create new scraper} 6 | \usage{ 7 | pb_new(np, author = "", issue = "") 8 | } 9 | \arguments{ 10 | \item{np}{domain or a URL of the newspaper this scraper is for.} 11 | 12 | \item{author}{who wrote it.} 13 | 14 | \item{issue}{is there a GitHub issue?} 15 | } 16 | \description{ 17 | Create new scraper 18 | } 19 | \examples{ 20 | \dontrun{ 21 | paperboy:::pb_new(np = "https://www.buzzfeed.com/", 22 | author = "[@JBGruber](https://github.com/JBGruber/)") 23 | 24 | paperboy:::pb_new_done() 25 | } 26 | } 27 | \keyword{internal} 28 | -------------------------------------------------------------------------------- /man/pb_read_cookies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_cookies.R 3 | \name{pb_read_cookies} 4 | \alias{pb_read_cookies} 5 | \title{Read in cookie file} 6 | \usage{ 7 | pb_read_cookies(...) 8 | } 9 | \arguments{ 10 | \item{...}{not used.} 11 | } 12 | \description{ 13 | Deprecated in favour of \link[cookiemonster]{add_cookies}. 14 | } 15 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{\%>\%} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 
12 | 13 | \describe{ 14 | \item{magrittr}{\code{\link[magrittr]{\%>\%}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/test_parser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils_dev.R 3 | \name{test_parser} 4 | \alias{test_parser} 5 | \title{Test a Parser} 6 | \usage{ 7 | test_parser(test_data) 8 | } 9 | \arguments{ 10 | \item{test_data}{A data frame of raw content.} 11 | } 12 | \value{ 13 | A success or failure message. 14 | } 15 | \description{ 16 | Test a parser using a data frame from \link{pb_collect}. 17 | } 18 | -------------------------------------------------------------------------------- /paperboy.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | 22 | UseNativePipeOperator: No 23 | -------------------------------------------------------------------------------- /submit2cran.r: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | ## Update roxygen and check 3 | roxygen2::roxygenise(clean = TRUE) 4 | devtools::check() 5 | 6 | ## Check code quality 7 | lintr::lint_package() 8 | goodpractice::gp() 9 | 10 | ## Check spelling 11 | spelling::spell_check_package() 12 | spelling::update_wordlist() 13 | spelling::spell_check_files("README.Rmd", ignore = readLines("./inst/WORDLIST"), lang = "en-GB") 14 | 15 | ## build manual 16 | devtools::build_manual() 17 | 
18 | # build readme 19 | parser_df <- rio::import("inst/status.csv") %>% 20 | arrange(domain) 21 | ## check if all parsers are listed 22 | parser_available <- pb_available() %>% 23 | str_remove("www.") %>% 24 | tibble(parser = .) 25 | 26 | parser_available %>% 27 | anti_join(parser_df, by = c("parser" = "domain")) 28 | 29 | rio::export(parser_df, "inst/status.csv") 30 | devtools::build_readme() 31 | lines <- readLines("README.md") 32 | writeLines(gsub("[\\#", "[#", lines, fixed = TRUE), "README.md") 33 | 34 | 35 | 36 | ## build vignette 37 | devtools::build_vignettes() 38 | 39 | ## test covr 40 | devtools::test_coverage() 41 | 42 | 43 | # For release on CRAN 44 | ## test on winbuilder 45 | devtools::check_win_devel() 46 | devtools::check_win_oldrelease() 47 | devtools::check_win_release() 48 | 49 | ## check r_hub 50 | ch <- rhub::check_for_cran(show_status = FALSE) 51 | ch$livelog() # check status 52 | 53 | ## release 54 | revdepcheck::revdep_check() 55 | devtools::release() 56 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if (requireNamespace("spelling", quietly = TRUE)) { 2 | spelling::spell_check_test( 3 | vignettes = TRUE, 4 | error = FALSE, 5 | skip_on_cran = TRUE 6 | ) 7 | } 8 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(paperboy) 3 | 4 | test_check("paperboy") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-collect.R: -------------------------------------------------------------------------------- 1 | test_that("status", { 2 | expect_message( 3 | pb_collect("https://httpbin.org/status/404"), 4 | "1 link had issues." 5 | ) 6 | expect_error( 7 | pb_collect("test"), 8 | "Connection error. 
Set" 9 | ) 10 | expect_equal( 11 | dim(pb_collect("test", ignore_fails = TRUE)), 12 | c(1, 5) 13 | ) 14 | }) 15 | 16 | test_that("expandurls", { 17 | expect_equal( 18 | dim(pb_collect(urls = "https://httpbin.org/")), 19 | c(1, 5) 20 | ) 21 | expect_warning( 22 | pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE), 23 | "download.did.not.finish.before.timeout." 24 | ) 25 | }) 26 | 27 | test_that("send cookies", { 28 | jar <- options(cookie_dir = tempdir()) 29 | withr::defer(options(jar)) 30 | withr::defer(unlink(file.path(tempdir(), paste0("cookies.rds")))) 31 | expect_equivalent({ 32 | cookiemonster::add_cookies(cookiestring = "test=true; success=yes", domain = "https://hb.cran.dev", confirm = TRUE) 33 | unclass(pb_collect("https://hb.cran.dev/cookies", use_cookies = TRUE, verbose = FALSE)$content_raw) 34 | }, "{\n \"cookies\": {\n \"success\": \"yes\", \n \"test\": \"true\"\n }\n}\n") 35 | }) 36 | 37 | test_that("store local", { 38 | tmp <- tempdir() 39 | expect_true({ 40 | pb_collect(urls = "https://httpbin.org/status/200", 41 | save_dir = tmp) 42 | file.exists(file.path(tmp, "d84c33c485e54845b489f53feada52f0.html")) 43 | }) 44 | }) 45 | 46 | test_that("verbosity", { 47 | expect_no_condition(pb_collect(urls = "https://httpbin.org/status/200", verbose = FALSE)) 48 | expect_message(pb_collect(urls = "https://httpbin.org/status/200", verbose = TRUE), 49 | "unique URLs provided") 50 | expect_message(pb_collect(urls = "https://httpbin.org/status/200", verbose = TRUE), 51 | "Fetching pages...") 52 | }) 53 | -------------------------------------------------------------------------------- /tests/testthat/test-deliver.R: -------------------------------------------------------------------------------- 1 | test_that("Test infrascture", { 2 | expect_message( 3 | pb_deliver("google.com", verbose = TRUE), 4 | "No parser for domain" 5 | ) 6 | # only warn first time 7 | expect_no_message( 8 | pb_deliver("google.com", verbose = FALSE) 9 | ) 10 | # 
still warn with new site 11 | expect_message( 12 | pb_deliver("duckduckgo.com/", verbose = TRUE), 13 | "No parser for domain" 14 | ) 15 | expect_equal( 16 | nrow(pb_deliver("duckduckgo.com/", try_default = FALSE)), 17 | 0L 18 | ) 19 | expect_error( 20 | pb_deliver(list("google.com"), verbose = FALSE), 21 | "No method for class list." 22 | ) 23 | expect_error( 24 | pb_deliver(data.frame(test = "google.com"), verbose = FALSE), 25 | "must be a character vector of URLs" 26 | ) 27 | expect_message( 28 | pb_deliver(pb_collect("https://httpbin.org/status/404", verbose = FALSE)), 29 | "1 URL removed due to bad status." 30 | ) 31 | }) 32 | 33 | test_that("Test theguardian scraper", { 34 | skip_if_offline() 35 | expect_equal({ 36 | out <- pb_deliver("https://tinyurl.com/386e98k5", verbose = FALSE) 37 | c(class(out), ncol(out), nrow(out)) 38 | }, c("tbl_df", "tbl", "data.frame", "9", "1")) 39 | }) 40 | 41 | test_that("Test huffpost scraper", { 42 | skip_if_offline() 43 | expect_equal({ 44 | out <- pb_deliver("https://tinyurl.com/4shbwkxs", verbose = FALSE) 45 | c(class(out), ncol(out), nrow(out)) 46 | }, c("tbl_df", "tbl", "data.frame", "9", "1")) 47 | }) 48 | 49 | -------------------------------------------------------------------------------- /tests/testthat/test-misc.R: -------------------------------------------------------------------------------- 1 | test_that("normalise_df works", { 2 | expect_equal( 3 | names(normalise_df(data.frame(test = TRUE))), 4 | c("url", "expanded_url", "domain", "status", "datetime", "author", 5 | "headline", "text", "misc") 6 | ) 7 | expect_equal({ 8 | out <- normalise_df(list( 9 | tibble::tibble(url = "test.com/1", test = TRUE), 10 | tibble::tibble(url = "test.com/2", test = list(c(TRUE, FALSE))) 11 | )) 12 | purrr::map(out$misc, "test") 13 | }, list(list(TRUE), list(c(TRUE, FALSE)))) 14 | }) 15 | 16 | test_that("pb_available works", { 17 | expect_equal({ 18 | out <- pb_available() 19 | c(class(out), length(out) > 10) 20 | }, 21 | 
c("character", "TRUE") 22 | ) 23 | }) 24 | 25 | test_that("Test safe named list making", { 26 | expect_equal({ 27 | text <- "hello world" 28 | author <- "Max Mustermann" 29 | headline <- "lorem ipsum" 30 | datetime <- character() 31 | 32 | paperboy:::s_n_list( 33 | text, 34 | author, 35 | headline, 36 | datetime 37 | ) 38 | }, 39 | tibble::tibble(text = "hello world", 40 | author = "Max Mustermann", 41 | headline = "lorem ipsum", 42 | datetime = NA) 43 | ) 44 | expect_equal({ 45 | text <- "hello world" 46 | author <- c("Max Mustermann", "Erika Mustermann") 47 | headline <- "lorem ipsum" 48 | datetime <- character() 49 | 50 | paperboy:::s_n_list( 51 | text, 52 | author, 53 | headline, 54 | datetime 55 | ) 56 | }, 57 | tibble::tibble(text = "hello world", 58 | author = list(c("Max Mustermann", "Erika Mustermann")), 59 | headline = "lorem ipsum", 60 | datetime = NA) 61 | ) 62 | }) 63 | -------------------------------------------------------------------------------- /tests/testthat/test-parser.R: -------------------------------------------------------------------------------- 1 | test_parse_rss <- function(rss) { 2 | 3 | test_that(desc = paste("test:", rss), { 4 | expect_no_error({ 5 | test_df <- pb_collect(rss, collect_rss = TRUE, verbose = FALSE, timeout = 90) 6 | if (all(!test_df$status < 400L)) { 7 | stop("No data could be retrieved from the RSS feed") 8 | } 9 | test_parser(test_df) 10 | }) 11 | }) 12 | 13 | } 14 | 15 | if (as.logical(Sys.getenv("PB_TEST_PARSER", unset = "FALSE"))) { 16 | status <- utils::read.csv(system.file("status.csv", package = "paperboy")) 17 | rss_feeds <- setdiff( 18 | na.omit(status$rss), 19 | c( 20 | "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" 21 | ) 22 | ) 23 | lapply(rss_feeds, test_parse_rss) 24 | } 25 | -------------------------------------------------------------------------------- /tests/testthat/test-rss.R: -------------------------------------------------------------------------------- 1 | test_that("rss is 
collected", { 2 | nyt <- pb_collect_rss("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml") 3 | expect_s3_class( 4 | nyt, 5 | "data.frame" 6 | ) 7 | expect_more_than( 8 | nrow(nyt), 9 | 0 10 | ) 11 | expect_equal({ 12 | c(nrow(nyt) > 1, c("title", "link", "published") %in% colnames(nyt)) 13 | }, rep(TRUE, 4)) 14 | }) 15 | 16 | test_that("rss is expanded", { 17 | expect_equal({ 18 | res <- pb_collect(urls = "https://rss.nytimes.com/services/xml/rss/nyt/World.xml") 19 | c(nrow(res) > 1, ncol(res)) 20 | }, c(1, 5)) 21 | }) 22 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JBGruber/paperboy/1f9dc4e64faae5f656a2bebac12c28465310f406/vignettes/inspect.png --------------------------------------------------------------------------------