├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── CRAN-RELEASE ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── extract_meta_data.R ├── get_media_source.R ├── get_story.R ├── get_story_list.R └── meta_data_html.R ├── README.Rmd ├── README.md ├── codecov.yml ├── cran-comments.md ├── data └── meta_data_html.rda ├── man ├── extract_meta_data.Rd ├── get_media_source.Rd ├── get_story.Rd ├── get_story_list.Rd └── meta_data_html.Rd ├── tests ├── testthat.R └── testthat │ ├── test_extract_meta_data.R │ ├── test_get_media_source.R │ ├── test_get_story.R │ └── test_get_story_list.R └── vignettes ├── .gitignore └── extract_meta_data.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^.travis.yml$ 5 | ^codecov\.yml$ 6 | ^doc$ 7 | ^Meta$ 8 | ^cran-comments.md$ 9 | ^CRAN-RELEASE$ 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | *.Rproj 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | /*_cache/ 33 | /cache/ 34 | 35 | # Temporary files created by R markdown 36 | *.utf8.md 37 | *.knit.md 38 | inst/doc 39 | doc 40 | Meta 41 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: r 2 | r: 3 | - oldrel 4 | - release 5 | - devel 6 | after_success: 7 | - Rscript -e 'covr::codecov()' 8 | sudo: false 9 | cache: packages 10 | -------------------------------------------------------------------------------- /CRAN-RELEASE: -------------------------------------------------------------------------------- 1 | This package was submitted to CRAN on 2019-07-21. 2 | Once it is accepted, delete this file and tag the release (commit e62672467f). 3 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: mediacloudr 2 | Type: Package 3 | Title: Wrapper for the 'mediacloud.org' API 4 | Version: 0.1.1.9000 5 | Depends: R (>= 3.2.0) 6 | Authors@R: c(person("Dix", "Jan", email = "jan.dix@uni-konstanz.de", role = c("cre", "aut"))) 7 | Description: API wrapper to gather news stories, media information and tags from the 'mediacloud.org' API, based on a multilevel query. A personal API key is required.
8 | License: MIT + file LICENSE 9 | Encoding: UTF-8 10 | LazyData: true 11 | Imports: 12 | httr, 13 | jsonlite, 14 | rvest, 15 | xml2 16 | Suggests: 17 | testthat, 18 | covr, 19 | knitr, 20 | rmarkdown 21 | RoxygenNote: 6.1.1 22 | VignetteBuilder: knitr 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2019 2 | COPYRIGHT HOLDER: Jan Dix 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(extract_meta_data) 4 | export(get_media_source) 5 | export(get_story) 6 | export(get_story_list) 7 | importFrom(httr,GET) 8 | importFrom(httr,build_url) 9 | importFrom(httr,content) 10 | importFrom(httr,http_error) 11 | importFrom(httr,http_status) 12 | importFrom(httr,parse_url) 13 | importFrom(jsonlite,fromJSON) 14 | importFrom(rvest,html_attr) 15 | importFrom(rvest,html_node) 16 | importFrom(rvest,html_text) 17 | importFrom(xml2,read_html) 18 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # version 0.1.0 (2019-07-10) 2 | 3 | - initial publication on GitHub 4 | - prepare CRAN release 5 | -------------------------------------------------------------------------------- /R/extract_meta_data.R: -------------------------------------------------------------------------------- 1 | #' Extract meta data 2 | #' 3 | #' \code{extract_meta_data} extracts native, open graph and twitter meta data 4 | #' from html documents. The meta data include url, title, description and image. 5 | #' The html document is parsed within the function. 6 | #' 7 | #' @param html_doc Character string including the html document.
8 | #' 9 | #' @examples 10 | #' \dontrun{ 11 | #' library(httr) 12 | #' url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science" 13 | #' response <- GET(url) 14 | #' html_document <- content(response, type = "text", encoding = "UTF-8") 15 | #' meta_data <- extract_meta_data(html_doc = html_document) 16 | #' } 17 | #' 18 | #' @return List with three sublists for native, open graph and twitter. 19 | #' 20 | #' @importFrom xml2 read_html 21 | #' @importFrom rvest html_attr html_text html_node 22 | #' 23 | #' @export 24 | 25 | extract_meta_data <- function (html_doc) { 26 | 27 | # errors and warnings -------------------------------------------------------- 28 | # check if html document is passed 29 | if (missing(html_doc)) stop("Please define a html document.") 30 | 31 | # parse document and prepare empty result set -------------------------------- 32 | # parse html 33 | parsed_html <- xml2::read_html(html_doc) 34 | # define empty return object 35 | meta_data <- list( 36 | open_graph = list( 37 | url = NA, 38 | type = NA, 39 | title = NA, 40 | image = NA, 41 | description = NA 42 | ), 43 | twitter = list( 44 | url = NA, 45 | title = NA, 46 | description = NA, 47 | image = NA, 48 | image_alt = NA, 49 | card = NA 50 | ), 51 | native = list ( 52 | title = NA, 53 | description = NA, 54 | image = NA, 55 | thumbnail = NA 56 | ) 57 | ) 58 | 59 | # extract meta data ---------------------------------------------------------- 60 | # og url 61 | meta_data$open_graph$url <- rvest::html_attr( 62 | rvest::html_node(parsed_html, "meta[property='og:url']"), 63 | "content" 64 | ) 65 | # og type 66 | meta_data$open_graph$type <- rvest::html_attr( 67 | rvest::html_node(parsed_html, "meta[property='og:type']"), 68 | "content" 69 | ) 70 | # og title 71 | meta_data$open_graph$title <- rvest::html_attr( 72 | rvest::html_node(parsed_html, "meta[property='og:title']"), 73 | "content" 74 | ) 75 | # og image 76 | meta_data$open_graph$image <- 
rvest::html_attr( 77 | rvest::html_node(parsed_html, "meta[property='og:image']"), 78 | "content" 79 | ) 80 | # og description 81 | meta_data$open_graph$description <- rvest::html_attr( 82 | rvest::html_node(parsed_html, "meta[property='og:description']"), 83 | "content" 84 | ) 85 | # twitter url 86 | meta_data$twitter$url <- rvest::html_attr( 87 | rvest::html_node(parsed_html, "meta[property='twitter:url']"), 88 | "content" 89 | ) 90 | # twitter title 91 | meta_data$twitter$title <- rvest::html_attr( 92 | rvest::html_node(parsed_html, "meta[property='twitter:title']"), 93 | "content" 94 | ) 95 | # twitter description 96 | meta_data$twitter$description <- rvest::html_attr( 97 | rvest::html_node(parsed_html, "meta[property='twitter:description']"), 98 | "content" 99 | ) 100 | # twitter image 101 | meta_data$twitter$image <- rvest::html_attr( 102 | rvest::html_node(parsed_html, "meta[property='twitter:image']"), 103 | "content" 104 | ) 105 | # twitter image_alt 106 | meta_data$twitter$image_alt <- rvest::html_attr( 107 | rvest::html_node(parsed_html, "meta[property='twitter:image:alt']"), 108 | "content" 109 | ) 110 | # twitter card 111 | meta_data$twitter$card <- rvest::html_attr( 112 | rvest::html_node(parsed_html, "meta[property='twitter:card']"), 113 | "content" 114 | ) 115 | # native title 116 | meta_data$native$title <- rvest::html_text( 117 | rvest::html_node(parsed_html, "title") 118 | ) 119 | # native description 120 | meta_data$native$description <- rvest::html_attr( 121 | rvest::html_node(parsed_html, "meta[name='description']"), 122 | "content" 123 | ) 124 | # native image 125 | meta_data$native$image <- rvest::html_attr( 126 | rvest::html_node(parsed_html, "meta[name='image']"), 127 | "content" 128 | ) 129 | # native thumbnail 130 | meta_data$native$thumbnail <- rvest::html_attr( 131 | rvest::html_node(parsed_html, "meta[name='thumbnail']"), 132 | "content" 133 | ) 134 | 135 | # process and return result set ---------------------------------------------- 
136 | # replace empty characters 137 | meta_data <- lapply(meta_data, lapply, 138 | function (x) ifelse(length(x) <= 0, NA, x)) 139 | # return results 140 | return(meta_data) 141 | } 142 | -------------------------------------------------------------------------------- /R/get_media_source.R: -------------------------------------------------------------------------------- 1 | #' Get media by id 2 | #' 3 | #' \code{get_media_source} returns a media source by its id. A media source 4 | #' is one publisher. Every story that can be collected via \code{get_story} 5 | #' or \code{get_story_list} belongs to one media source. 6 | #' 7 | #' @param media_id Positive integer that contains a valid media id. 8 | #' @param api_key Character string with the API key you get from mediacloud.org. 9 | #' Passing it is compulsory. Alternatively, the key can be 10 | #' read from the MEDIACLOUD_API_KEY environment variable. 11 | #' 12 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#media} for field descriptions.
13 | #' 14 | #' @examples 15 | #' \dontrun{ 16 | #' media_source <- get_media_source(media_id = 604L) 17 | #' } 18 | #' 19 | #' @importFrom httr parse_url build_url GET http_error http_status content 20 | #' @importFrom jsonlite fromJSON 21 | #' 22 | #' @export 23 | 24 | get_media_source <- function(media_id, 25 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) { 26 | 27 | # errors and warnings -------------------------------------------------------- 28 | # check if media_id is passed 29 | if (missing(media_id)) stop("Please define a media id.") 30 | 31 | # check if media_id integer and positive 32 | if (!is.integer(media_id) | media_id < 0L) 33 | stop("Please provide a positive integer for media id.") 34 | 35 | # check if api key is passed 36 | if (nchar(api_key) == 0) { 37 | stop("Please define an API key.") 38 | } 39 | 40 | # define and build url ------------------------------------------------------ 41 | # define base url 42 | url <- "https://api.mediacloud.org/api/v2/media/single" 43 | # parse url 44 | url <- httr::parse_url(url = url) 45 | # add api key query parameter 46 | url$query <- list( 47 | key = api_key 48 | ) 49 | # add media id to path 50 | url$path <- paste(url$path, media_id, sep = "/") 51 | # build url 52 | url <- httr::build_url(url) 53 | 54 | # query and parse api -------------------------------------------------------- 55 | # query api 56 | response <- httr::GET(url) 57 | # parse response 58 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8") 59 | # parse json 60 | parsed_json <- jsonlite::fromJSON(parsed_response) 61 | 62 | # check possible errors ------------------------------------------------------ 63 | # check if any error 64 | if (httr::http_error(response)) { 65 | stop(parsed_json$error) 66 | } 67 | 68 | # define and return result object -------------------------------------------- 69 | # return result set 70 | return(parsed_json) 71 | } 72 |
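The roxygen comments above note that every story collected via `get_story` or `get_story_list` belongs to exactly one media source, so the two lookup functions compose naturally. A minimal sketch of that pattern follows; the query term is hypothetical, and it assumes network access plus a MEDIACLOUD_API_KEY entry in .Renviron, so it is not meant to run unattended:

```r
# Sketch: fetch one page of stories, then look up the media source of each.
# Assumes MEDIACLOUD_API_KEY is set in .Renviron; the media_id field is
# documented in the API spec linked from the @return sections above.
library(mediacloudr)

stories <- get_story_list(q = "data science", rows = 50)
# one lookup per distinct publisher; get_media_source() requires an integer id
sources <- lapply(
  unique(stories$media_id),
  function(id) get_media_source(media_id = as.integer(id))
)
```

Because both helpers `stop()` on HTTP errors, a longer-running loop might wrap each call in `tryCatch()`; the sketch keeps the happy path only.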
-------------------------------------------------------------------------------- /R/get_story.R: -------------------------------------------------------------------------------- 1 | #' Get story by id 2 | #' 3 | #' \code{get_story} returns a news story by its id. One story represents 4 | #' one online publication. Each story refers to a single URL from any feed 5 | #' within a single media source. 6 | #' 7 | #' @param story_id Positive integer that contains a valid story id. 8 | #' @param api_key Character string with the API key you get from mediacloud.org. 9 | #' Passing it is compulsory. Alternatively, the key can be 10 | #' read from the MEDIACLOUD_API_KEY environment variable. 11 | #' 12 | #' @examples 13 | #' \dontrun{ 14 | #' story <- get_story(story_id = 604L) 15 | #' } 16 | #' 17 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions. 18 | #' 19 | #' @importFrom httr parse_url build_url GET http_error http_status content 20 | #' @importFrom jsonlite fromJSON 21 | #' 22 | #' @export 23 | 24 | get_story <- function(story_id, 25 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) { 26 | 27 | # errors and warnings -------------------------------------------------------- 28 | # check if story_id is passed 29 | if (missing(story_id)) stop("Please define a story id.") 30 | 31 | # check if story_id integer and positive 32 | if (!is.integer(story_id) | story_id < 0L) 33 | stop("Please provide a positive integer for story id.") 34 | 35 | # check if api key is passed 36 | if (nchar(api_key) == 0) { 37 | stop("Please define an API key.") 38 | } 39 | 40 | # define and build url ------------------------------------------------------ 41 | # define base url 42 | url <- "https://api.mediacloud.org/api/v2/stories_public/single" 43 | # parse url 44 | url <- httr::parse_url(url = url) 45 | # add api key query parameter 46 | url$query <- list( 47 | key = api_key 48 | ) 49 | # add story
id to path 50 | url$path <- paste(url$path, story_id, sep = "/") 51 | # build url 52 | url <- httr::build_url(url) 53 | 54 | # query and parse api -------------------------------------------------------- 55 | # query api 56 | response <- httr::GET(url) 57 | # parse response 58 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8") 59 | # parse json 60 | parsed_json <- jsonlite::fromJSON(parsed_response) 61 | 62 | # check possible errors ------------------------------------------------------ 63 | # check if any error 64 | if (httr::http_error(response)) { 65 | stop(parsed_json$error) 66 | } 67 | 68 | # define and return result object -------------------------------------------- 69 | # return result set 70 | return(parsed_json) 71 | } 72 | -------------------------------------------------------------------------------- /R/get_story_list.R: -------------------------------------------------------------------------------- 1 | #' Get story list 2 | #' 3 | #' \code{get_story_list} returns a list of stories based on a multifaceted query. One 4 | #' story represents one online publication. Each story refers to a single URL 5 | #' from any feed within a single media source. 6 | #' 7 | #' @param last_process_stories_id Return stories in which the 8 | #' processed_stories_id is greater than this 9 | #' value. 10 | #' @param rows Number of stories to return, max 1000. 11 | #' @param feeds_id Return only stories that match the given feeds_id, sorted 12 | #' by descending publish date. 13 | #' @param q If specified, return only results that match the given Solr query. 14 | #' Only one q parameter may be included. 15 | #' @param fq If specified, filter results by the given Solr query. More than one 16 | #' fq parameter may be included. 17 | #' @param sort Returned results sort order.
Supported values: 18 | #' processed_stories_id, random 19 | #' @param wc If set to TRUE, include a 'word_count' field with each story that 20 | #' includes a count of the most common words in the story 21 | #' @param show_feeds If set to TRUE, include a 'feeds' field with a list of the 22 | #' feeds associated with this story 23 | #' @param api_key Character string with the API key you get from mediacloud.org. 24 | #' Passing it is compulsory. Alternatively, the key can be 25 | #' read from the MEDIACLOUD_API_KEY environment variable. 26 | #' @examples 27 | #' \dontrun{ 28 | #' stories <- get_story_list() 29 | #' stories <- get_story_list(q = "Trump") 30 | #' } 31 | #' 32 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions. 33 | #' @export 34 | 35 | get_story_list <- function(last_process_stories_id = 0L, 36 | rows = 100, 37 | feeds_id = NULL, 38 | q = NULL, 39 | fq = NULL, 40 | sort = "processed_stories_id", 41 | wc = FALSE, 42 | show_feeds = FALSE, 43 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) { 44 | 45 | # errors and warnings -------------------------------------------------------- 46 | # check if last_process_stories_id is passed and valid 47 | # check if last_process_stories_id integer and positive 48 | if (!is.integer(last_process_stories_id) | last_process_stories_id < 0) 49 | stop("Please provide a positive integer for last process stories id.") 50 | 51 | # check if rows in range 52 | if (rows <= 0 | rows > 1000) 53 | stop("Rows should be larger than 0 and smaller or equal to 1000.") 54 | 55 | # check if api key is passed 56 | if (nchar(api_key) == 0) { 57 | stop("Please define an API key.") 58 | } 59 | 60 | # define and build url ------------------------------------------------------ 61 | # define base url 62 | url <- "https://api.mediacloud.org/api/v2/stories_public/list" 63 | # parse url 64 | url <- httr::parse_url(url = url) 65 | # add api key query parameter 66 |
url$query <- list( 67 | last_process_stories_id = last_process_stories_id, 68 | rows = rows, 69 | feeds_id = feeds_id, 70 | q = q, 71 | fq = fq, 72 | sort = sort, 73 | wc = wc, 74 | show_feeds = show_feeds, 75 | key = api_key 76 | ) 77 | # build url 78 | url <- httr::build_url(url) 79 | 80 | # query and parse api -------------------------------------------------------- 81 | # query api 82 | response <- httr::GET(url) 83 | # parse response 84 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8") 85 | # parse json 86 | parsed_json <- jsonlite::fromJSON(parsed_response) 87 | 88 | # check possible errors ------------------------------------------------------ 89 | # check if any error 90 | if (httr::http_error(response)) { 91 | stop(parsed_json$error) 92 | } 93 | 94 | # define and return result object -------------------------------------------- 95 | # return result set 96 | return(parsed_json) 97 | } 98 | -------------------------------------------------------------------------------- /R/meta_data_html.R: -------------------------------------------------------------------------------- 1 | #' HTML document to test \code{extract_meta_data} 2 | #' 3 | #' A HTML document with basic meta tags for open-graph, twitter and native 4 | #' meta data. 
5 | "meta_data_html" 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | # mediacloudr 16 | 17 | 18 | [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) 19 | [![Build Status](https://travis-ci.org/jandix/mediacloudr.svg?branch=master)](https://travis-ci.org/jandix/mediacloudr) 20 | [![Codecov test coverage](https://codecov.io/gh/jandix/mediacloudr/branch/master/graph/badge.svg)](https://codecov.io/gh/jandix/mediacloudr?branch=master) 21 | 22 | 23 | The goal of mediacloudr is to provide a consistent wrapper for the 24 | mediacloud.org API. The Media Cloud platform is an open-source platform 25 | that collects all kinds of news stories and provides various functionalities to 26 | query, download and analyze them. This package tries to support R users by 27 | providing a set of functions to access various functionalities of 28 | mediacloud.org. 29 | 30 | ## Installation 31 | 32 | The **mediacloudr**-package is now on CRAN :tada:. You can install the released 33 | version of mediacloudr from [CRAN](https://CRAN.R-project.org) with: 34 | 35 | ``` r 36 | install.packages("mediacloudr") 37 | ``` 38 | 39 | 40 | And the development version from [GitHub](https://github.com/) with: 41 | 42 | ``` r 43 | # install.packages("devtools") 44 | devtools::install_github("jandix/mediacloudr") 45 | ``` 46 | 47 | ## API key 48 | 49 | Please [register](https://topics.mediacloud.org/#/user/signup) as a new user.
50 | Afterwards, you can copy your API key from your 51 | [profile page](https://topics.mediacloud.org/#/user/profile). 52 | 53 | I suggest saving the API key to your R environment file. The R environment file 54 | is loaded every time R is started/restarted. You should not add the key to your 55 | scripts, because other users could misuse your key. The following steps show 56 | how to add the key to your R environment file. 57 | 58 | 1. Open your .Renviron file. The file is usually located in 59 | your home directory. If the file does not exist, just create one and name it 60 | `.Renviron`. 61 | 2. Add a new line and enter your API key in the following format: 62 | `MEDIACLOUD_API_KEY=<your_api_key>`. 63 | 3. Save the file and restart your current R session to start using mediacloudr. 64 | 65 | ## Request Limits 66 | 67 | mediacloud.org states the following for API request/rate limits: 68 | 69 | "Each user is limited to 1,000 API calls and 20,000 stories returned in any 7 70 | day period. Requests submitted beyond this limit will result in a status 403 71 | error. Users who need access to more requests should email 72 | [info@mediacloud.org](mailto:info@mediacloud.org)." 73 | 74 | ## Examples 75 | 76 | ### Get a news story by id 77 | 78 | You can query news stories by their ids. The ids can be found using the 79 | graphical interface or using the `get_story_list` function. 80 | 81 | *Note*: You don't have to add the `api_key` argument if you followed the 82 | steps to add the api key to your R environment file. 83 | 84 | ```{r story example, eval=FALSE} 85 | story_id <- 27456565L 86 | story <- get_story(story_id = story_id) 87 | ``` 88 | 89 | ### Get a list with news stories 90 | 91 | You can query a list of news stories using `get_story_list`. You can use the 92 | `q` and `fq` arguments to filter stories. A guide to the query parameters can 93 | be found [here](https://mediacloud.org/support/query-guide/).
94 | 95 | *Note*: You don't have to add the `api_key` argument if you followed the 96 | steps to add the api key to your R environment file. 97 | 98 | ```{r story list example, eval=FALSE} 99 | stories <- get_story_list(q = "trump") 100 | ``` 101 | 102 | ### Get a media source by id 103 | 104 | You can query media sources by their ids. The ids can be found using the 105 | graphical online interface. mediacloud.org provides various meta data for 106 | their media sources. 107 | 108 | *Note*: You don't have to add the `api_key` argument if you followed the 109 | steps to add the api key to your R environment file. 110 | 111 | ```{r media source example, eval=FALSE} 112 | media_id <- 1L 113 | media_source <- get_media_source(media_id = media_id) 114 | ``` 115 | 116 | ### Download article and extract social media meta data 117 | 118 | You can use the article URL to download the complete article and extract 119 | social meta data. The meta data can be analyzed using techniques such as 120 | sentiment analysis or simply compared to the article content. 
121 | 122 | ```{r extract_meta_data, eval=FALSE} 123 | # load httr 124 | library(httr) 125 | # define article url 126 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science" 127 | # download article 128 | response <- GET(url) 129 | # extract article html 130 | html_document <- content(response, type = "text", encoding = "UTF-8") 131 | # extract meta data from html document 132 | meta_data <- extract_meta_data(html_doc = html_document) 133 | ``` 134 | 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # mediacloudr 5 | 6 | 7 | 8 | [![Project Status: Active – The project has reached a stable, usable 9 | state and is being actively 10 | developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) 11 | [![Build 12 | Status](https://travis-ci.org/jandix/mediacloudr.svg?branch=master)](https://travis-ci.org/jandix/mediacloudr) 13 | [![Codecov test 14 | coverage](https://codecov.io/gh/jandix/mediacloudr/branch/master/graph/badge.svg)](https://codecov.io/gh/jandix/mediacloudr?branch=master) 15 | 16 | 17 | The goal of mediacloudr is to provide a consistent wrapper for the 18 | mediacloud.org API. The Media Cloud platform is an open-source platform 19 | that collects all kinds of news stories and provides various 20 | functionalities to query, download and analyze them. This package tries 21 | to support R users by providing a set of functions to access various 22 | functionalities of mediacloud.org. 23 | 24 | ## Installation 25 | 26 | The **mediacloudr**-package is now on CRAN :tada:.
You can install the 27 | released version of mediacloudr from [CRAN](https://CRAN.R-project.org) 28 | with: 29 | 30 | ``` r 31 | install.packages("mediacloudr") 32 | ``` 33 | 34 | And the development version from [GitHub](https://github.com/) with: 35 | 36 | ``` r 37 | # install.packages("devtools") 38 | devtools::install_github("jandix/mediacloudr") 39 | ``` 40 | 41 | ## API key 42 | 43 | Please [register](https://topics.mediacloud.org/#/user/signup) as a new 44 | user. Afterwards, you can copy your API key from your [profile 45 | page](https://topics.mediacloud.org/#/user/profile). 46 | 47 | I suggest saving the API key to your R environment file. The R 48 | environment file is loaded every time R is started/restarted. You should 49 | not add the key to your scripts, because other users could misuse your 50 | key. The following steps show how to add the key to your R environment 51 | file. 52 | 53 | 1. Open your .Renviron file. The file is usually located in your home 54 | directory. If the file does not exist, just create one and name it 55 | `.Renviron`. 56 | 2. Add a new line and enter your API key in the following format: 57 | `MEDIACLOUD_API_KEY=<your_api_key>`. 58 | 3. Save the file and restart your current R session to start using 59 | mediacloudr. 60 | 61 | ## Request Limits 62 | 63 | mediacloud.org states the following for API request/rate limits: 64 | 65 | “Each user is limited to 1,000 API calls and 20,000 stories returned in 66 | any 7 day period. Requests submitted beyond this limit will result in a 67 | status 403 error. Users who need access to more requests should email 68 | <info@mediacloud.org>.” 69 | 70 | ## Examples 71 | 72 | ### Get a news story by id 73 | 74 | You can query news stories by their ids. The ids can be found using the 75 | graphical interface or using the `get_story_list` function. 76 | 77 | *Note*: You don’t have to add the `api_key` argument if you followed the 78 | steps to add the api key to your R environment file.
79 | 80 | ``` r 81 | story_id <- 27456565L 82 | story <- get_story(story_id = story_id) 83 | ``` 84 | 85 | ### Get a list with news stories 86 | 87 | You can query a list of news stories using `get_story_list`. You can use 88 | the `q` and `fq` arguments to filter stories. A guide to the query 89 | parameters can be found 90 | [here](https://mediacloud.org/support/query-guide/). 91 | 92 | *Note*: You don’t have to add the `api_key` argument if you followed the 93 | steps to add the api key to your R environment file. 94 | 95 | ``` r 96 | stories <- get_story_list(q = "trump") 97 | ``` 98 | 99 | ### Get a media source by id 100 | 101 | You can query media sources by their ids. The ids can be found using the 102 | graphical online interface. mediacloud.org provides various meta data 103 | for their media sources. 104 | 105 | *Note*: You don’t have to add the `api_key` argument if you followed the 106 | steps to add the api key to your R environment file. 107 | 108 | ``` r 109 | media_id <- 1L 110 | media_source <- get_media_source(media_id = media_id) 111 | ``` 112 | 113 | ### Download article and extract social media meta data 114 | 115 | You can use the article URL to download the complete article and extract 116 | social meta data. The meta data can be analyzed using techniques such as 117 | sentiment analysis or simply compared to the article content. 
118 | 119 | ``` r 120 | # load httr 121 | library(httr) 122 | # define article url 123 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science" 124 | # download article 125 | response <- GET(url) 126 | # extract article html 127 | html_document <- content(response, type = "text", encoding = "UTF-8") 128 | # extract meta data from html document 129 | meta_data <- extract_meta_data(html_doc = html_document) 130 | ``` 131 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Resubmission 2 | This is a resubmission. In this version I have: 3 | 4 | * Updated the description as suggested by the CRAN maintainers. 5 | 6 | ## Resubmission 7 | This is a resubmission. In this version I have: 8 | 9 | * Updated the package title as suggested by the CRAN maintainers. 10 | 11 | * Replaced a CRAN URL with canonical form. 12 | 13 | ## Test environments 14 | * local ubuntu 18.10, R 3.6.0 15 | * ubuntu 14.04.5 (on travis-ci), R 3.5.3 (oldrel, devel and release) 16 | * win-builder (devel and release) 17 | 18 | ## R CMD check results 19 | There were no ERRORs or WARNINGs. 20 | 21 | There was 1 NOTE: 22 | 23 | * checking CRAN incoming feasibility ... NOTE 24 | 25 | ## Downstream dependencies 26 | There are currently no downstream dependencies for this package. 
27 | -------------------------------------------------------------------------------- /data/meta_data_html.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jandix/mediacloudr/c1d997690e820ac957cdcc4421e9b0e972caea42/data/meta_data_html.rda -------------------------------------------------------------------------------- /man/extract_meta_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_meta_data.R 3 | \name{extract_meta_data} 4 | \alias{extract_meta_data} 5 | \title{Extract meta data} 6 | \usage{ 7 | extract_meta_data(html_doc) 8 | } 9 | \arguments{ 10 | \item{html_doc}{Character string including the html document.} 11 | } 12 | \value{ 13 | List with three sublists for native, open graph and twitter. 14 | } 15 | \description{ 16 | \code{extract_meta_data} extracts native, open graph and twitter meta data 17 | from html documents. The meta data include url, title, description and image. 
18 | The HTML document is parsed within the function. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | library(httr) 23 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science" 24 | response <- GET(url) 25 | html_document <- content(response, type = "text", encoding = "UTF-8") 26 | meta_data <- extract_meta_data(html_doc = html_document) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/get_media_source.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_media_source.R 3 | \name{get_media_source} 4 | \alias{get_media_source} 5 | \title{Get media by id} 6 | \usage{ 7 | get_media_source(media_id, api_key = Sys.getenv("MEDIACLOUD_API_KEY")) 8 | } 9 | \arguments{ 10 | \item{media_id}{Positive integer that contains a valid media id.} 11 | 12 | \item{api_key}{Character string with the API key you get from mediacloud.org. 13 | Passing it is compulsory. Alternatively, the key can be 14 | provided via the global environment.} 15 | } 16 | \value{ 17 | Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#media} for field descriptions. 18 | } 19 | \description{ 20 | \code{get_media_source} returns a media source by its id. A media source 21 | is one publisher. Every story that can be collected via \code{get_story} 22 | or \code{get_story_list} belongs to one media source.
23 | } 24 | \examples{ 25 | \dontrun{ 26 | media_source <- get_media_source(media_id = 604L) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/get_story.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_story.R 3 | \name{get_story} 4 | \alias{get_story} 5 | \title{Get story by id} 6 | \usage{ 7 | get_story(story_id, api_key = Sys.getenv("MEDIACLOUD_API_KEY")) 8 | } 9 | \arguments{ 10 | \item{story_id}{Positive integer that contains a valid story id.} 11 | 12 | \item{api_key}{Character string with the API key you get from mediacloud.org. 13 | Passing it is compulsory. Alternatively, the key can be 14 | provided via the global environment.} 15 | } 16 | \value{ 17 | Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions. 18 | } 19 | \description{ 20 | \code{get_story} returns a news story by its id. One story represents 21 | one online publication. Each story refers to a single URL from any feed 22 | within a single media source.
23 | } 24 | \examples{ 25 | \dontrun{ 26 | story <- get_story(story_id = 604L) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /man/get_story_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_story_list.R 3 | \name{get_story_list} 4 | \alias{get_story_list} 5 | \title{Get story list} 6 | \usage{ 7 | get_story_list(last_process_stories_id = 0L, rows = 100, 8 | feeds_id = NULL, q = NULL, fq = NULL, 9 | sort = "processed_stories_id", wc = FALSE, show_feeds = FALSE, 10 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) 11 | } 12 | \arguments{ 13 | \item{last_process_stories_id}{Return stories whose 14 | processed_stories_id is greater than this 15 | value.} 16 | 17 | \item{rows}{Number of stories to return, max 1000.} 18 | 19 | \item{feeds_id}{Return only stories that match the given feeds_id, sorted 20 | by descending publish date.} 21 | 22 | \item{q}{If specified, return only results that match the given Solr query. 23 | Only one q parameter may be included.} 24 | 25 | \item{fq}{If specified, filter results by the given Solr query. More than one 26 | fq parameter may be included.} 27 | 28 | \item{sort}{Returned results sort order. Supported values: 29 | processed_stories_id, random.} 30 | 31 | \item{wc}{If set to TRUE, include a 'word_count' field with each story that 32 | includes a count of the most common words in the story.} 33 | 34 | \item{show_feeds}{If set to TRUE, include a 'feeds' field with a list of the 35 | feeds associated with this story.} 36 | 37 | \item{api_key}{Character string with the API key you get from mediacloud.org. 38 | Passing it is compulsory. Alternatively, the key can be 39 | provided via the global environment.} 40 | } 41 | \value{ 42 | Data frame with results.
See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions. 43 | } 44 | \description{ 45 | \code{get_story_list} returns a list of stories based on a multifaceted query. One 46 | story represents one online publication. Each story refers to a single URL 47 | from any feed within a single media source. 48 | } 49 | \examples{ 50 | \dontrun{ 51 | stories <- get_story_list() 52 | stories <- get_story_list(q = "Trump") 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /man/meta_data_html.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/meta_data_html.R 3 | \docType{data} 4 | \name{meta_data_html} 5 | \alias{meta_data_html} 6 | \title{HTML document to test \code{extract_meta_data}} 7 | \format{An object of class \code{character} of length 1.} 8 | \usage{ 9 | meta_data_html 10 | } 11 | \description{ 12 | An HTML document with basic meta tags for open-graph, twitter and native 13 | meta data.
14 | } 15 | \keyword{datasets} 16 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(mediacloudr) 3 | test_check("mediacloudr") 4 | -------------------------------------------------------------------------------- /tests/testthat/test_extract_meta_data.R: -------------------------------------------------------------------------------- 1 | context("Extract Meta Data") 2 | 3 | # invalid inputs --------------------------------------------------------------- 4 | testthat::test_that( 5 | "test that function returns error if no html document provided", { 6 | testthat::expect_error(mediacloudr::extract_meta_data(), 7 | regexp = "Please define a html document.") 8 | }) 9 | 10 | # test html document ----------------------------------------------------------- 11 | html_doc <- mediacloudr::meta_data_html 12 | meta_data <- list( 13 | open_graph = list( 14 | url = "https://github.com/jandix/mediacloudr", 15 | type = "article", 16 | title = "Hello world!", 17 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940", 18 | description = "mediacloudr hello world test file" 19 | ), 20 | twitter = list( 21 | url = "https://github.com/jandix/mediacloudr", 22 | title = "Hello world!", 23 | description = "mediacloudr hello world test file", 24 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940", 25 | image_alt = "Trees", 26 | card = "summary_large_image" 27 | ), 28 | native = list ( 29 | title = "Hello world!", 30 | description = "mediacloudr hello world test file", 31 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940", 32 | thumbnail = 
"https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=50" 33 | ) 34 | ) 35 | testthat::test_that( 36 | "test that function parses the document correctly", { 37 | testthat::expect_equal(mediacloudr::extract_meta_data(html_doc = html_doc), meta_data) 38 | }) 39 | 40 | # test result set -------------------------------------------------------------- 41 | testthat::test_that( 42 | "test that function returns exactly 100 rows", { 43 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 44 | message = "API key not available in environment. Skipping test.") 45 | example_result <- mediacloudr::get_story_list() 46 | testthat::expect_equal(nrow(example_result), 100, 47 | info = paste0("100 rows expected, but got ", nrow(example_result), " row(s).")) 48 | }) 49 | testthat::test_that( 50 | "test that function returns 15 columns", { 51 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 52 | message = "API key not available in environment.
Skipping test.") 53 | example_result <- mediacloudr::get_story_list() 54 | testthat::expect_equal(ncol(example_result), 15, 55 | info = paste0("15 cols expected, but got ", ncol(example_result), " col(s).")) 56 | }) 57 | -------------------------------------------------------------------------------- /tests/testthat/test_get_media_source.R: -------------------------------------------------------------------------------- 1 | context("Get Media Source") 2 | 3 | # invalid inputs --------------------------------------------------------------- 4 | testthat::test_that( 5 | "test that function returns error if no media_id provided", { 6 | testthat::expect_error(mediacloudr::get_media_source(), 7 | regexp = "Please define a media id.") 8 | }) 9 | 10 | testthat::test_that( 11 | "test that function returns error if media_id is float", { 12 | testthat::expect_error(mediacloudr::get_media_source(media_id = 6.496), 13 | regexp = "Please provide a positive integer for media id.") 14 | }) 15 | 16 | testthat::test_that( 17 | "test that function returns error if media_id is character", { 18 | testthat::expect_error(mediacloudr::get_media_source(media_id = "abs"), 19 | regexp = "Please provide a positive integer for media id.") 20 | }) 21 | 22 | testthat::test_that( 23 | "test that function returns error if media_id negative", { 24 | testthat::expect_error(mediacloudr::get_media_source(media_id = -2L), 25 | regexp = "Please provide a positive integer for media id.") 26 | }) 27 | 28 | testthat::test_that( 29 | "test that function returns error if no media_id provided", { 30 | testthat::expect_error(mediacloudr::get_media_source(media_id = 604L, api_key = ""), 31 | regexp = "Please define an API key.") 32 | }) 33 | 34 | # test result set -------------------------------------------------------------- 35 | testthat::test_that( 36 | "test that function returns only one row", { 37 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 38 | message = "API key not available in 
environment. Skipping test.") 39 | example_result <- mediacloudr::get_media_source(1L) 40 | testthat::expect_equal(nrow(example_result), 1, 41 | info = paste0("1 row expected, but got ", nrow(example_result), " row(s).")) 42 | }) 43 | testthat::test_that( 44 | "test that function returns 10 columns", { 45 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 46 | message = "API key not available in environment. Skipping test.") 47 | example_result <- mediacloudr::get_media_source(1L) 48 | testthat::expect_equal(ncol(example_result), 10, 49 | info = paste0("10 cols expected, but got ", ncol(example_result), " col(s).")) 50 | }) 51 | -------------------------------------------------------------------------------- /tests/testthat/test_get_story.R: -------------------------------------------------------------------------------- 1 | context("Get Story") 2 | 3 | # invalid inputs --------------------------------------------------------------- 4 | testthat::test_that( 5 | "test that function returns error if no story_id provided", { 6 | testthat::expect_error(mediacloudr::get_story(), 7 | regexp = "Please define a story id.") 8 | }) 9 | 10 | testthat::test_that( 11 | "test that function returns error if story_id is float", { 12 | testthat::expect_error(mediacloudr::get_story(story_id = 6.496), 13 | regexp = "Please provide a positive integer for story id.") 14 | }) 15 | 16 | testthat::test_that( 17 | "test that function returns error if story_id is character", { 18 | testthat::expect_error(mediacloudr::get_story(story_id = "abs"), 19 | regexp = "Please provide a positive integer for story id.") 20 | }) 21 | 22 | testthat::test_that( 23 | "test that function returns error if story_id negative", { 24 | testthat::expect_error(mediacloudr::get_story(story_id = -2L), 25 | regexp = "Please provide a positive integer for story id.") 26 | }) 27 | 28 | testthat::test_that( 29 | "test that function returns error if no api_key provided", { 30 |
testthat::expect_error(mediacloudr::get_story(story_id = 604L, api_key = ""), 31 | regexp = "Please define an API key.") 32 | }) 33 | 34 | # test result set -------------------------------------------------------------- 35 | testthat::test_that( 36 | "test that function returns only one row", { 37 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 38 | message = "API key not available in environment. Skipping test.") 39 | example_result <- mediacloudr::get_story(27456565L) 40 | testthat::expect_equal(nrow(example_result), 1, 41 | info = paste0("1 row expected, but got ", nrow(example_result), " row(s).")) 42 | }) 43 | testthat::test_that( 44 | "test that function returns 15 columns", { 45 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0, 46 | message = "API key not available in environment. Skipping test.") 47 | example_result <- mediacloudr::get_story(27456565L) 48 | testthat::expect_equal(ncol(example_result), 15, 49 | info = paste0("15 cols expected, but got ", ncol(example_result), " col(s).")) 50 | }) 51 | -------------------------------------------------------------------------------- /tests/testthat/test_get_story_list.R: -------------------------------------------------------------------------------- 1 | context("Get Story List") 2 | 3 | # invalid inputs --------------------------------------------------------------- 4 | testthat::test_that( 5 | "test that function returns error if last_process_stories_id is float", { 6 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = 6.496), 7 | regexp = "Please provide a positive integer for last process stories id") 8 | }) 9 | 10 | testthat::test_that( 11 | "test that function returns error if last_process_stories_id is character", { 12 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = "abs"), 13 | regexp = "Please provide a positive integer for last process stories id") 14 | }) 15 | 16 | testthat::test_that( 17 | "test that function returns error if last_process_stories_id
negative", { 18 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = -2L), 19 | regexp = "Please provide a positive integer for last process stories id") 20 | }) 21 | 22 | testthat::test_that( 23 | "test that function returns error if no api_key provided", { 24 | testthat::expect_error(mediacloudr::get_story_list(api_key = ""), 25 | regexp = "Please define an API key.") 26 | }) 27 | 28 | testthat::test_that( 29 | "test that function returns error if rows is smaller than 1", { 30 | testthat::expect_error(mediacloudr::get_story_list(rows = 0), 31 | regexp = "Rows should be larger than 0 and smaller or equal to 1000.") 32 | }) 33 | 34 | testthat::test_that( 35 | "test that function returns error if rows is larger than 1000", { 36 | testthat::expect_error(mediacloudr::get_story_list(rows = 1001), 37 | regexp = "Rows should be larger than 0 and smaller or equal to 1000.") 38 | }) 39 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/extract_meta_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Extract Social Media Meta Data" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{extract_meta_data} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | We start by loading the required packages. `mediacloudr` is used to download 11 | an article based on an article id received from the 12 | [Media Cloud Topic Mapper](https://topics.mediacloud.org/). Furthermore, 13 | `mediacloudr` provides a function to extract social media meta data from 14 | HTML documents. `httr` is used to turn R into an HTTP client to download and 15 | process the corresponding article page.
We use `xml2` to parse the HTML document, 16 | i.e. make it readable for R, and `rvest` to find elements of interest within 17 | the HTML document. 18 | 19 | ```{r setup, eval=FALSE} 20 | # load required packages 21 | library(mediacloudr) 22 | library(httr) 23 | library(xml2) 24 | library(rvest) 25 | ``` 26 | 27 | In the first step, we request the article with the id `1126843780`. It is 28 | important to add the upper case `L` to the number to turn the numeric type into 29 | an integer type. Otherwise the function will throw an error. The article was 30 | selected with the help of the 31 | [Media Cloud Topic Mapper](https://topics.mediacloud.org/) online tool. If you 32 | created an account, you can create and analyze your own topics. 33 | 34 | ```{r get_article, eval=FALSE} 35 | # define story id as integer 36 | story_id <- 1126843780L 37 | # download article 38 | article <- get_story(story_id = story_id) 39 | ``` 40 | 41 | The USA Today [news article](https://eu.usatoday.com/story/news/2018/12/27/ice-drops-off-migrants-phoenix-greyhound-bus-station/2429545002/) comes with a URL which we can use to download the 42 | complete article using the `httr` package. We use the `GET` function to 43 | download the article. Afterwards, we extract the website using the `content` 44 | function. It is important to provide the `type` argument to extract the text 45 | only. Otherwise, the function tries to guess the type and will automatically 46 | parse the content based on the `content-type` HTTP header. The author of the 47 | `httr` package 48 | [suggests](https://CRAN.R-project.org/package=httr/vignettes/quickstart.html) 49 | manually parsing the content. In this case, we use the `read_html` function 50 | which is provided by the `xml2` package.
51 | 52 | ```{r download_website, eval=FALSE} 53 | # download article 54 | response <- GET(article$url[1]) 55 | # extract article html 56 | html_document <- content(response, type = "text", encoding = "UTF-8") 57 | # parse website 58 | parsed_html <- read_html(html_document) 59 | ``` 60 | 61 | After parsing the response into an R-readable format, we extract the actual body 62 | of the article. To do so, we use the `html_nodes` function to find the HTML 63 | tags defined in the `css` argument. A useful open source tool to find the 64 | corresponding tags or CSS classes is the 65 | [Selector Gadget](https://selectorgadget.com/). Alternatively, you can use the 66 | developer tools of the browser you usually use. The `html_text` function provides 67 | us with a character vector. Each element contains a paragraph of the article. We 68 | use the `paste` function to merge the paragraphs into one continuous text. We could 69 | analyze the text using different metrics such as word frequencies or sentiment 70 | analysis. 71 | 72 | ```{r body_content, eval=FALSE} 73 | # extract article body 74 | article_body_nodes <- html_nodes(x = parsed_html, css = ".content-well div p") 75 | article_body <- html_text(x = article_body_nodes) 76 | # collapse character vector into one text 77 | article_body <- paste(article_body, collapse = " ") 78 | ``` 79 | 80 | In the last step, we extract the social media meta data from the article. Social 81 | media meta data are shown if the article URL is shared on social media. The 82 | article representation usually includes a heading, a summary and a small 83 | image/thumbnail. The `extract_meta_data` function expects a raw HTML document and 84 | provides [Open Graph](http://ogp.me/) (a standard introduced by Facebook), [Twitter](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/overview/markup.html) 85 | and native meta data.
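To make concrete what such meta data look like, here is a simplified, regex-based sketch that pulls an Open Graph title out of a hypothetical one-tag HTML snippet. This is illustrative only: the snippet and pattern are assumptions, and `extract_meta_data` itself parses the full document rather than using regular expressions.

```{r og_title_sketch}
# illustrative snippet only; real pages carry many more meta tags
html_snippet <- '<meta property="og:title" content="Hello world!" />'
# pull the content attribute out with a base R regular expression
og_title <- sub('.*property="og:title" content="([^"]*)".*', "\\1", html_snippet)
og_title
```

For real documents, prefer a proper parser as shown in this vignette; regular expressions serve here only to illustrate the tag structure.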
86 | 87 | ```{r get_meta_data, eval=FALSE} 88 | # extract meta data from html document 89 | meta_data <- extract_meta_data(html_doc = html_document) 90 | ``` 91 | 92 | Open Graph Title: *"ICE drops off migrants at Phoenix bus station"* 93 | 94 | Article Title (provided by mediacloud.org): *"Arizona churches working to help 95 | migrants are 'at capacity' or 'tapped out on resources'"* 96 | 97 | The meta data can be compared to the original content of the article. A short 98 | analysis reveals that USA Today chose a different heading to advertise the 99 | article on Facebook. Larger analyses can use quantitative tools such as string 100 | similarity measures, as provided by the `stringdist` package. 101 | --------------------------------------------------------------------------------
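As a closing sketch of such a comparison, base R's `adist()` (Levenshtein edit distance) can quantify how far apart the two headings quoted above are; this is an assumed follow-up, not part of the original vignette, and the `stringdist` package offers many more refined measures.

```{r title_distance}
# the two headings quoted in the vignette
og_title <- "ICE drops off migrants at Phoenix bus station"
article_title <- paste("Arizona churches working to help migrants are",
                       "'at capacity' or 'tapped out on resources'")
# Levenshtein edit distance between the headings (base R)
adist(og_title, article_title)
```

A large distance relative to the heading lengths confirms that the two titles differ substantially rather than by a small edit.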