├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── CRAN-RELEASE
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
│   ├── extract_meta_data.R
│   ├── get_media_source.R
│   ├── get_story.R
│   ├── get_story_list.R
│   └── meta_data_html.R
├── README.Rmd
├── README.md
├── codecov.yml
├── cran-comments.md
├── data
│   └── meta_data_html.rda
├── man
│   ├── extract_meta_data.Rd
│   ├── get_media_source.Rd
│   ├── get_story.Rd
│   ├── get_story_list.Rd
│   └── meta_data_html.Rd
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test_extract_meta_data.R
│       ├── test_get_media_source.R
│       ├── test_get_story.R
│       └── test_get_story_list.R
└── vignettes
    ├── .gitignore
    └── extract_meta_data.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^\.travis\.yml$
5 | ^codecov\.yml$
6 | ^doc$
7 | ^Meta$
8 | ^cran-comments\.md$
9 | ^CRAN-RELEASE$
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # User-specific files
9 | .Ruserdata
10 |
11 | # Example code in package build process
12 | *-Ex.R
13 |
14 | # Output files from R CMD build
15 | /*.tar.gz
16 |
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 |
20 | # RStudio files
21 | .Rproj.user/
22 | *.Rproj
23 |
24 | # produced vignettes
25 | vignettes/*.html
26 | vignettes/*.pdf
27 |
28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
29 | .httr-oauth
30 |
31 | # knitr and R markdown default cache directories
32 | /*_cache/
33 | /cache/
34 |
35 | # Temporary files created by R markdown
36 | *.utf8.md
37 | *.knit.md
38 | inst/doc
39 | doc
40 | Meta
41 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | r:
3 | - oldrel
4 | - release
5 | - devel
6 | after_success:
7 | - Rscript -e 'covr::codecov()'
8 | sudo: false
9 | cache: packages
10 |
--------------------------------------------------------------------------------
/CRAN-RELEASE:
--------------------------------------------------------------------------------
1 | This package was submitted to CRAN on 2019-07-21.
2 | Once it is accepted, delete this file and tag the release (commit e62672467f).
3 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: mediacloudr
2 | Type: Package
3 | Title: Wrapper for the 'mediacloud.org' API
4 | Version: 0.1.1.9000
5 | Depends: R (>= 3.2.0)
6 | Authors@R: c(person("Dix", "Jan", email = "jan.dix@uni-konstanz.de", role = c("cre", "aut")))
7 | Description: API wrapper to gather news stories, media information and tags from the 'mediacloud.org' API, based on a multilevel query. A personal API key is required.
8 | License: MIT + file LICENSE
9 | Encoding: UTF-8
10 | LazyData: true
11 | Imports:
12 | httr,
13 | jsonlite,
14 | rvest,
15 | xml2
16 | Suggests:
17 | testthat,
18 | covr,
19 | knitr,
20 | rmarkdown
21 | RoxygenNote: 6.1.1
22 | VignetteBuilder: knitr
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2019
2 | COPYRIGHT HOLDER: Jan Dix
3 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(extract_meta_data)
4 | export(get_media_source)
5 | export(get_story)
6 | export(get_story_list)
7 | importFrom(httr,GET)
8 | importFrom(httr,build_url)
9 | importFrom(httr,content)
10 | importFrom(httr,http_error)
11 | importFrom(httr,http_status)
12 | importFrom(httr,parse_url)
13 | importFrom(jsonlite,fromJSON)
14 | importFrom(rvest,html_attr)
15 | importFrom(rvest,html_node)
16 | importFrom(rvest,html_text)
17 | importFrom(xml2,read_html)
18 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # version 0.1.0 (2019-07-10)
2 |
3 | - initial publication on GitHub
4 | - prepare CRAN release
5 |
--------------------------------------------------------------------------------
/R/extract_meta_data.R:
--------------------------------------------------------------------------------
1 | #' Extract meta data
2 | #'
3 | #' \code{extract_meta_data} extracts native, open graph and twitter meta data
4 | #' from html documents. The meta data include url, title, description and image.
5 | #' The HTML document is parsed within the function.
6 | #'
7 | #' @param html_doc Character string including the html document.
8 | #'
9 | #' @examples
10 | #' \dontrun{
11 | #' library(httr)
12 | #' url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science"
13 | #' response <- GET(url)
14 | #' html_document <- content(response, type = "text", encoding = "UTF-8")
15 | #' meta_data <- extract_meta_data(html_doc = html_document)
16 | #' }
17 | #'
18 | #' @return List with three sublists for native, open graph and twitter.
19 | #'
20 | #' @importFrom xml2 read_html
21 | #' @importFrom rvest html_attr html_text html_node
22 | #'
23 | #' @export
24 |
25 | extract_meta_data <- function (html_doc) {
26 |
27 | # errors and warnings --------------------------------------------------------
28 | # check if html document is passed
29 | if (missing(html_doc)) stop("Please define an HTML document.")
30 |
31 | # parse document and prepare empty result set --------------------------------
32 | # parse html
33 | parsed_html <- xml2::read_html(html_doc)
34 | # define empty return object
35 | meta_data <- list(
36 | open_graph = list(
37 | url = NA,
38 | type = NA,
39 | title = NA,
40 | image = NA,
41 | description = NA
42 | ),
43 | twitter = list(
44 | url = NA,
45 | title = NA,
46 | description = NA,
47 | image = NA,
48 | image_alt = NA,
49 | card = NA
50 | ),
51 | native = list (
52 | title = NA,
53 | description = NA,
54 | image = NA,
55 | thumbnail = NA
56 | )
57 | )
58 |
59 | # extract meta data ----------------------------------------------------------
60 | # og url
61 | meta_data$open_graph$url <- rvest::html_attr(
62 | rvest::html_node(parsed_html, "meta[property='og:url']"),
63 | "content"
64 | )
65 | # og type
66 | meta_data$open_graph$type <- rvest::html_attr(
67 | rvest::html_node(parsed_html, "meta[property='og:type']"),
68 | "content"
69 | )
70 | # og title
71 | meta_data$open_graph$title <- rvest::html_attr(
72 | rvest::html_node(parsed_html, "meta[property='og:title']"),
73 | "content"
74 | )
75 | # og image
76 | meta_data$open_graph$image <- rvest::html_attr(
77 | rvest::html_node(parsed_html, "meta[property='og:image']"),
78 | "content"
79 | )
80 | # og description
81 | meta_data$open_graph$description <- rvest::html_attr(
82 | rvest::html_node(parsed_html, "meta[property='og:description']"),
83 | "content"
84 | )
85 | # twitter url
86 | meta_data$twitter$url <- rvest::html_attr(
87 | rvest::html_node(parsed_html, "meta[property='twitter:url']"),
88 | "content"
89 | )
90 | # twitter title
91 | meta_data$twitter$title <- rvest::html_attr(
92 | rvest::html_node(parsed_html, "meta[property='twitter:title']"),
93 | "content"
94 | )
95 | # twitter description
96 | meta_data$twitter$description <- rvest::html_attr(
97 | rvest::html_node(parsed_html, "meta[property='twitter:description']"),
98 | "content"
99 | )
100 | # twitter image
101 | meta_data$twitter$image <- rvest::html_attr(
102 | rvest::html_node(parsed_html, "meta[property='twitter:image']"),
103 | "content"
104 | )
105 | # twitter image_alt
106 | meta_data$twitter$image_alt <- rvest::html_attr(
107 | rvest::html_node(parsed_html, "meta[property='twitter:image:alt']"),
108 | "content"
109 | )
110 | # twitter card
111 | meta_data$twitter$card <- rvest::html_attr(
112 | rvest::html_node(parsed_html, "meta[property='twitter:card']"),
113 | "content"
114 | )
115 | # native title
116 | meta_data$native$title <- rvest::html_text(
117 | rvest::html_node(parsed_html, "title")
118 | )
119 | # native description
120 | meta_data$native$description <- rvest::html_attr(
121 | rvest::html_node(parsed_html, "meta[name='description']"),
122 | "content"
123 | )
124 | # native image
125 | meta_data$native$image <- rvest::html_attr(
126 | rvest::html_node(parsed_html, "meta[name='image']"),
127 | "content"
128 | )
129 | # native thumbnail
130 | meta_data$native$thumbnail <- rvest::html_attr(
131 | rvest::html_node(parsed_html, "meta[name='thumbnail']"),
132 | "content"
133 | )
134 |
135 | # process and return result set ----------------------------------------------
136 | # replace empty characters
137 | meta_data <- lapply(meta_data, lapply,
138 | function (x) ifelse(length(x) <= 0, NA, x))
139 | # return results
140 | return(meta_data)
141 | }
142 |
--------------------------------------------------------------------------------
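A quick way to try `extract_meta_data` without any network request is the bundled `meta_data_html` dataset shipped with the package; a minimal sketch (the expected values mirror the fixtures in `tests/testthat/test_extract_meta_data.R`):

```r
# load the package and use its bundled example HTML document
library(mediacloudr)

# extract open graph, twitter and native meta data
meta_data <- extract_meta_data(html_doc = meta_data_html)

# inspect the open graph sublist (url, type, title, image, description)
str(meta_data$open_graph)
```

--------------------------------------------------------------------------------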
/R/get_media_source.R:
--------------------------------------------------------------------------------
1 | #' Get media by id
2 | #'
3 | #' \code{get_media_source} returns a media source by its id. A media source
4 | #' is one publisher. Every story that can be collected via \code{get_story}
5 | #' or \code{get_story_list} belongs to one media source.
6 | #'
7 | #' @param media_id Positive integer that contains a valid media id.
8 | #' @param api_key Character string with the API key you get from mediacloud.org.
9 | #'                An API key is required; it can either be passed explicitly
10 | #'                or read from the MEDIACLOUD_API_KEY environment variable.
11 | #'
12 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#media} for field descriptions.
13 | #'
14 | #' @examples
15 | #' \dontrun{
16 | #' media_source <- get_media_source(media_id = 604L)
17 | #' }
18 | #'
19 | #' @importFrom httr parse_url build_url GET http_error http_status content
20 | #' @importFrom jsonlite fromJSON
21 | #'
22 | #' @export
23 |
24 | get_media_source <- function(media_id,
25 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) {
26 |
27 | # errors and warnings --------------------------------------------------------
28 | # check if media_id is passed
29 | if (missing(media_id)) stop("Please define a media id.")
30 |
31 | # check if media_id is integer and positive
32 | if (!is.integer(media_id) || media_id < 0L)
33 | stop("Please provide a positive integer for media id.")
34 |
35 | # check if api key is passed
36 | if (nchar(api_key) == 0) {
37 | stop("Please define an API key.")
38 | }
39 |
40 | # define and build url ------------------------------------------------------
41 | # define base url
42 | url <- "https://api.mediacloud.org/api/v2/media/single"
43 | # parse url
44 | url <- httr::parse_url(url = url)
45 | # add api key query parameter
46 | url$query <- list(
47 | key = api_key
48 | )
49 | # add media id to path
50 | url$path <- paste(url$path, media_id, sep = "/")
51 | # build url
52 | url <- httr::build_url(url)
53 |
54 | # query and parse api --------------------------------------------------------
55 | # query api
56 | response <- httr::GET(url)
57 | # parse response
58 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8")
59 | # parse json
60 | parsed_json <- jsonlite::fromJSON(parsed_response)
61 |
62 | # check possible errors ------------------------------------------------------
63 | # check if any error
64 | if (httr::http_error(response)) {
65 | stop(parsed_json$error)
66 | }
67 |
68 | # define and return result object --------------------------------------------
69 | # return result set
70 | return(parsed_json)
71 | }
72 |
--------------------------------------------------------------------------------
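If you prefer not to edit `.Renviron`, the default `api_key` argument also picks up a key set for the current session only; a minimal sketch with a placeholder key:

```r
# set the API key for this R session only (placeholder value, not a real key)
Sys.setenv(MEDIACLOUD_API_KEY = "your-api-key-here")

# the default api_key argument now reads the key from the environment
media_source <- get_media_source(media_id = 1L)
```

--------------------------------------------------------------------------------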
/R/get_story.R:
--------------------------------------------------------------------------------
1 | #' Get story by id
2 | #'
3 | #' \code{get_story} returns news stories by their id. One story represents
4 | #' one online publication. Each story refers to a single URL from any feed
5 | #' within a single media source.
6 | #'
7 | #' @param story_id Positive integer that contains a valid story id.
8 | #' @param api_key Character string with the API key you get from mediacloud.org.
9 | #'                An API key is required; it can either be passed explicitly
10 | #'                or read from the MEDIACLOUD_API_KEY environment variable.
11 | #'
12 | #' @examples
13 | #' \dontrun{
14 | #' story <- get_story(story_id = 604L)
15 | #' }
16 | #'
17 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions.
18 | #'
19 | #' @importFrom httr parse_url build_url GET http_error http_status content
20 | #' @importFrom jsonlite fromJSON
21 | #'
22 | #' @export
23 |
24 | get_story <- function(story_id,
25 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) {
26 |
27 | # errors and warnings --------------------------------------------------------
28 | # check if story_id is passed
29 | if (missing(story_id)) stop("Please define a story id.")
30 |
31 | # check if story_id integer and positive
32 | if (!is.integer(story_id) || story_id < 0L)
33 | stop("Please provide a positive integer for story id.")
34 |
35 | # check if api key is passed
36 | if (nchar(api_key) == 0) {
37 | stop("Please define an API key.")
38 | }
39 |
40 | # define and build url ------------------------------------------------------
41 | # define base url
42 | url <- "https://api.mediacloud.org/api/v2/stories_public/single"
43 | # parse url
44 | url <- httr::parse_url(url = url)
45 | # add api key query parameter
46 | url$query <- list(
47 | key = api_key
48 | )
49 | # add story id to path
50 | url$path <- paste(url$path, story_id, sep = "/")
51 | # build url
52 | url <- httr::build_url(url)
53 |
54 | # query and parse api --------------------------------------------------------
55 | # query api
56 | response <- httr::GET(url)
57 | # parse response
58 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8")
59 | # parse json
60 | parsed_json <- jsonlite::fromJSON(parsed_response)
61 |
62 | # check possible errors ------------------------------------------------------
63 | # check if any error
64 | if (httr::http_error(response)) {
65 | stop(parsed_json$error)
66 | }
67 |
68 | # define and return result object --------------------------------------------
69 | # return result set
70 | return(parsed_json)
71 | }
72 |
--------------------------------------------------------------------------------
/R/get_story_list.R:
--------------------------------------------------------------------------------
1 | #' Get story list
2 | #'
3 | #' \code{get_story_list} returns a list of stories based on a multifaceted query. One
4 | #' story represents one online publication. Each story refers to a single URL
5 | #' from any feed within a single media source.
6 | #'
7 | #' @param last_process_stories_id Return stories in which the
8 | #' processed_stories_id is greater than this
9 | #' value.
10 | #' @param rows Number of stories to return, max 1000.
11 | #' @param feeds_id Return only stories that match the given feeds_id, sorted
12 | #'                 by descending publish date.
13 | #' @param q If specified, return only results that match the given Solr query.
14 | #' Only one q parameter may be included.
15 | #' @param fq If specified, filter results by the given Solr query. More than one
16 | #'           fq parameter may be included.
17 | #' @param sort Returned results sort order. Supported values:
18 | #' processed_stories_id, random
19 | #' @param wc If set to TRUE, include a 'word_count' field with each story that
20 | #' includes a count of the most common words in the story
21 | #' @param show_feeds If set to TRUE, include a 'feeds' field with a list of the
22 | #' feeds associated with this story
23 | #' @param api_key Character string with the API key you get from mediacloud.org.
24 | #'                An API key is required; it can either be passed explicitly
25 | #'                or read from the MEDIACLOUD_API_KEY environment variable.
26 | #' @examples
27 | #' \dontrun{
28 | #' stories <- get_story_list()
29 | #' stories <- get_story_list(q = "Trump")
30 | #' }
31 | #'
32 | #' @return Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions.
33 | #' @export
34 |
35 | get_story_list <- function(last_process_stories_id = 0L,
36 | rows = 100,
37 | feeds_id = NULL,
38 | q = NULL,
39 | fq = NULL,
40 | sort = "processed_stories_id",
41 | wc = FALSE,
42 | show_feeds = FALSE,
43 | api_key = Sys.getenv("MEDIACLOUD_API_KEY")) {
44 |
45 | # errors and warnings --------------------------------------------------------
46 | # check if a valid last_process_stories_id is passed:
47 | # it must be a positive integer
48 | if (!is.integer(last_process_stories_id) || last_process_stories_id < 0)
49 | stop("Please provide a positive integer for last process stories id.")
50 |
51 | # check if rows in range
52 | if (rows <= 0 || rows > 1000)
53 | stop("Rows should be larger than 0 and smaller or equal to 1000.")
54 |
55 | # check if api key is passed
56 | if (nchar(api_key) == 0) {
57 | stop("Please define an API key.")
58 | }
59 |
60 | # define and build url ------------------------------------------------------
61 | # define base url
62 | url <- "https://api.mediacloud.org/api/v2/stories_public/list"
63 | # parse url
64 | url <- httr::parse_url(url = url)
65 | # add api key query parameter
66 | url$query <- list(
67 | last_process_stories_id = last_process_stories_id,
68 | rows = rows,
69 | feeds_id = feeds_id,
70 | q = q,
71 | fq = fq,
72 | sort = sort,
73 | wc = wc,
74 | show_feeds = show_feeds,
75 | key = api_key
76 | )
77 | # build url
78 | url <- httr::build_url(url)
79 |
80 | # query and parse api --------------------------------------------------------
81 | # query api
82 | response <- httr::GET(url)
83 | # parse response
84 | parsed_response <- httr::content(response, type = "text", encoding = "UTF-8")
85 | # parse json
86 | parsed_json <- jsonlite::fromJSON(parsed_response)
87 |
88 | # check possible errors ------------------------------------------------------
89 | # check if any error
90 | if (httr::http_error(response)) {
91 | stop(parsed_json$error)
92 | }
93 |
94 | # define and return result object --------------------------------------------
95 | # return result set
96 | return(parsed_json)
97 | }
98 |
--------------------------------------------------------------------------------
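The `q` and `fq` parameters can be combined; a hedged sketch, assuming standard Solr range syntax on a `publish_date` field (the field name and date format are illustrative, see the Media Cloud query guide for the authoritative syntax):

```r
# full-text Solr query filtered by an illustrative publish_date range
stories <- get_story_list(
  q    = "climate",
  fq   = "publish_date:[2019-01-01T00:00:00Z TO 2019-06-30T00:00:00Z]",
  rows = 50
)
```

--------------------------------------------------------------------------------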
/R/meta_data_html.R:
--------------------------------------------------------------------------------
1 | #' HTML document to test \code{extract_meta_data}
2 | #'
3 | #' An HTML document with basic meta tags for open-graph, twitter and native
4 | #' meta data.
5 | "meta_data_html"
6 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/README-",
12 | out.width = "100%"
13 | )
14 | ```
15 | # mediacloudr
16 |
17 |
18 | [](https://www.repostatus.org/#active)
19 | [](https://travis-ci.org/jandix/mediacloudr)
20 | [](https://codecov.io/gh/jandix/mediacloudr?branch=master)
21 |
22 |
23 | The goal of mediacloudr is to provide a consistent wrapper for the
24 | mediacloud.org API. The Media Cloud platform is an open-source platform
25 | that collects all kinds of news stories and provides various functionalities
26 | to query, download and analyze them. This package aims to support R users by
27 | providing a set of functions to access the functionalities of
28 | mediacloud.org.
29 |
30 | ## Installation
31 |
32 | The **mediacloudr** package is now on CRAN :tada:. You can install the released
33 | version of mediacloudr from [CRAN](https://CRAN.R-project.org) with:
34 |
35 | ``` r
36 | install.packages("mediacloudr")
37 | ```
38 |
39 |
40 | And the development version from [GitHub](https://github.com/) with:
41 |
42 | ``` r
43 | # install.packages("devtools")
44 | devtools::install_github("jandix/mediacloudr")
45 | ```
46 |
47 | ## API key
48 |
49 | Please [register](https://topics.mediacloud.org/#/user/signup) as a new user.
50 | Afterwards, you can copy your API key from your
51 | [profile page](https://topics.mediacloud.org/#/user/profile).
52 |
53 | I suggest saving the API key to your R environment file. The R environment file
54 | is loaded every time R is started/restarted. You should not add the key to your
55 | scripts, because other users could misuse your key. The following steps show
56 | how to add the key to your R environment file.
57 |
58 | 1. Open your .Renviron file. The file is usually located in
59 | your home directory. If the file does not exist, just create one and name it
60 | `.Renviron`.
61 | 2. Add a new line and enter your API key in the following format:
62 | `MEDIACLOUD_API_KEY=`.
63 | 3. Save the file and restart your current R session to start using mediacloudr.
64 |
65 | ## Request Limits
66 |
67 | mediacloud.org states the following about API request/rate limits:
68 |
69 | "Each user is limited to 1,000 API calls and 20,000 stories returned in any 7
70 | day period. Requests submitted beyond this limit will result in a status 403
71 | error. Users who need access to more requests should email
72 | [info@mediacloud.org](mailto:info@mediacloud.org)."
73 |
74 | ## Examples
75 |
76 | ### Get a news story by id
77 |
78 | You can query news stories by their ids. The ids can be found using the
79 | graphical interface or using the `get_story_list` function.
80 |
81 | *Note*: You don't have to add the `api_key` argument if you followed the
82 | steps to add the api key to your R environment file.
83 |
84 | ```{r story example, eval=FALSE}
85 | story_id <- 27456565L
86 | story <- get_story(story_id = story_id)
87 | ```
88 |
89 | ### Get a list with news stories
90 |
91 | You can query a list of news stories using `get_story_list`. You can use the
92 | `q` and `fq` arguments to filter stories. A guide to the query parameters can
93 | be found [here](https://mediacloud.org/support/query-guide/).
94 |
95 | *Note*: You don't have to add the `api_key` argument if you followed the
96 | steps to add the api key to your R environment file.
97 |
98 | ```{r story list example, eval=FALSE}
99 | stories <- get_story_list(q = "trump")
100 | ```
101 |
102 | ### Get a media source by id
103 |
104 | You can query media sources by their ids. The ids can be found using the
105 | graphical online interface. mediacloud.org provides various meta data for
106 | their media sources.
107 |
108 | *Note*: You don't have to add the `api_key` argument if you followed the
109 | steps to add the api key to your R environment file.
110 |
111 | ```{r media source example, eval=FALSE}
112 | media_id <- 1L
113 | media_source <- get_media_source(media_id = media_id)
114 | ```
115 |
116 | ### Download article and extract social media meta data
117 |
118 | You can use the article URL to download the complete article and extract
119 | social meta data. The meta data can be analyzed using techniques such as
120 | sentiment analysis or simply compared to the article content.
121 |
122 | ```{r extract_meta_data, eval=FALSE}
123 | # load httr
124 | library(httr)
125 | # define article url
126 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science"
127 | # download article
128 | response <- GET(url)
129 | # extract article html
130 | html_document <- content(response, type = "text", encoding = "UTF-8")
131 | # extract meta data from html document
132 | meta_data <- extract_meta_data(html_doc = html_document)
133 | ```
134 |
135 |
--------------------------------------------------------------------------------
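To confirm the `.Renviron` setup described in the README took effect, a minimal check after restarting R:

```r
# should print your stored key; an empty string means .Renviron was not loaded
Sys.getenv("MEDIACLOUD_API_KEY")
```

--------------------------------------------------------------------------------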
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # mediacloudr
5 |
6 |
7 |
8 | [](https://www.repostatus.org/#active)
11 | [](https://travis-ci.org/jandix/mediacloudr)
13 | [](https://codecov.io/gh/jandix/mediacloudr?branch=master)
15 |
16 |
17 | The goal of mediacloudr is to provide a consistent wrapper for the
18 | mediacloud.org API. The Media Cloud platform is an open-source platform
19 | that collects all kinds of news stories and provides various
20 | functionalities to query, download and analyze them. This package aims
21 | to support R users by providing a set of functions to access the
22 | functionalities of mediacloud.org.
23 |
24 | ## Installation
25 |
26 | The **mediacloudr** package is now on CRAN :tada:. You can install the
27 | released version of mediacloudr from [CRAN](https://CRAN.R-project.org)
28 | with:
29 |
30 | ``` r
31 | install.packages("mediacloudr")
32 | ```
33 |
34 | And the development version from [GitHub](https://github.com/) with:
35 |
36 | ``` r
37 | # install.packages("devtools")
38 | devtools::install_github("jandix/mediacloudr")
39 | ```
40 |
41 | ## API key
42 |
43 | Please [register](https://topics.mediacloud.org/#/user/signup) as a new
44 | user. Afterwards, you can copy your API key from your [profile
45 | page](https://topics.mediacloud.org/#/user/profile).
46 |
47 | I suggest saving the API key to your R environment file. The R
48 | environment file is loaded every time R is started/restarted. You should
49 | not add the key to your scripts, because other users could misuse your
50 | key. The following steps show how to add the key to your R environment
51 | file.
52 |
53 | 1. Open your .Renviron file. The file is usually located in your home
54 | directory. If the file does not exist, just create one and name it
55 | `.Renviron`.
56 | 2. Add a new line and enter your API key in the following format:
57 | `MEDIACLOUD_API_KEY=`.
58 | 3. Save the file and restart your current R session to start using
59 | mediacloudr.
60 |
61 | ## Request Limits
62 |
63 | mediacloud.org states the following about API request/rate limits:
64 |
65 | “Each user is limited to 1,000 API calls and 20,000 stories returned in
66 | any 7 day period. Requests submitted beyond this limit will result in a
67 | status 403 error. Users who need access to more requests should email
68 | <info@mediacloud.org>.”
69 |
70 | ## Examples
71 |
72 | ### Get a news story by id
73 |
74 | You can query news stories by their ids. The ids can be found using the
75 | graphical interface or using the `get_story_list` function.
76 |
77 | *Note*: You don’t have to add the `api_key` argument if you followed the
78 | steps to add the api key to your R environment file.
79 |
80 | ``` r
81 | story_id <- 27456565L
82 | story <- get_story(story_id = story_id)
83 | ```
84 |
85 | ### Get a list with news stories
86 |
87 | You can query a list of news stories using `get_story_list`. You can use
88 | the `q` and `fq` arguments to filter stories. A guide to the query
89 | parameters can be found
90 | [here](https://mediacloud.org/support/query-guide/).
91 |
92 | *Note*: You don’t have to add the `api_key` argument if you followed the
93 | steps to add the api key to your R environment file.
94 |
95 | ``` r
96 | stories <- get_story_list(q = "trump")
97 | ```
98 |
99 | ### Get a media source by id
100 |
101 | You can query media sources by their ids. The ids can be found using the
102 | graphical online interface. mediacloud.org provides various meta data
103 | for their media sources.
104 |
105 | *Note*: You don’t have to add the `api_key` argument if you followed the
106 | steps to add the api key to your R environment file.
107 |
108 | ``` r
109 | media_id <- 1L
110 | media_source <- get_media_source(media_id = media_id)
111 | ```
112 |
113 | ### Download article and extract social media meta data
114 |
115 | You can use the article URL to download the complete article and extract
116 | social meta data. The meta data can be analyzed using techniques such as
117 | sentiment analysis or simply compared to the article content.
118 |
119 | ``` r
120 | # load httr
121 | library(httr)
122 | # define article url
123 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science"
124 | # download article
125 | response <- GET(url)
126 | # extract article html
127 | html_document <- content(response, type = "text", encoding = "UTF-8")
128 | # extract meta data from html document
129 | meta_data <- extract_meta_data(html_doc = html_document)
130 | ```
131 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 | status:
5 | project:
6 | default:
7 | target: auto
8 | threshold: 1%
9 | patch:
10 | default:
11 | target: auto
12 | threshold: 1%
13 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Resubmission
2 | This is a resubmission. In this version I have:
3 |
4 | * Updated the description as suggested by the CRAN maintainers.
5 |
6 | ## Resubmission
7 | This is a resubmission. In this version I have:
8 |
9 | * Updated the package title as suggested by the CRAN maintainers.
10 |
11 | * Replaced a CRAN URL with canonical form.
12 |
13 | ## Test environments
14 | * local ubuntu 18.10, R 3.6.0
15 | * ubuntu 14.04.5 (on travis-ci), R 3.5.3 (oldrel, devel and release)
16 | * win-builder (devel and release)
17 |
18 | ## R CMD check results
19 | There were no ERRORs or WARNINGs.
20 |
21 | There was 1 NOTE:
22 |
23 | * checking CRAN incoming feasibility ... NOTE
24 |
25 | ## Downstream dependencies
26 | There are currently no downstream dependencies for this package.
27 |
--------------------------------------------------------------------------------
/data/meta_data_html.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jandix/mediacloudr/c1d997690e820ac957cdcc4421e9b0e972caea42/data/meta_data_html.rda
--------------------------------------------------------------------------------
/man/extract_meta_data.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/extract_meta_data.R
3 | \name{extract_meta_data}
4 | \alias{extract_meta_data}
5 | \title{Extract meta data}
6 | \usage{
7 | extract_meta_data(html_doc)
8 | }
9 | \arguments{
10 | \item{html_doc}{Character string including the html document.}
11 | }
12 | \value{
13 | List with three sublists for native, open graph and twitter.
14 | }
15 | \description{
16 | \code{extract_meta_data} extracts native, open graph and twitter meta data
17 | from html documents. The meta data include url, title, description and image.
18 | The HTML document is parsed within the function.
19 | }
20 | \examples{
21 | \dontrun{
22 | library(httr)
23 | url <- "https://bits.blogs.nytimes.com/2013/04/07/the-potential-and-the-risks-of-data-science"
24 | response <- GET(url)
25 | html_document <- content(response, type = "text", encoding = "UTF-8")
26 | meta_data <- extract_meta_data(html_doc = html_document)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/man/get_media_source.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_media_source.R
3 | \name{get_media_source}
4 | \alias{get_media_source}
5 | \title{Get media by id}
6 | \usage{
7 | get_media_source(media_id, api_key = Sys.getenv("MEDIACLOUD_API_KEY"))
8 | }
9 | \arguments{
10 | \item{media_id}{Positive integer that contains a valid media id.}
11 |
12 | \item{api_key}{Character string with the API key you get from mediacloud.org.
13 |               An API key is required; it can either be passed explicitly
14 |               or read from the MEDIACLOUD_API_KEY environment variable.}
15 | }
16 | \value{
17 | Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#media} for field descriptions.
18 | }
19 | \description{
20 | \code{get_media_source} returns a media source by its id. A media source
21 | is one publisher. Every story that can be collected via \code{get_story}
22 | or \code{get_story_list} belongs to one media source.
23 | }
24 | \examples{
25 | \dontrun{
26 | media_source <- get_media_source(media_id = 604L)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/man/get_story.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_story.R
3 | \name{get_story}
4 | \alias{get_story}
5 | \title{Get story by id}
6 | \usage{
7 | get_story(story_id, api_key = Sys.getenv("MEDIACLOUD_API_KEY"))
8 | }
9 | \arguments{
10 | \item{story_id}{Positive integer that contains a valid story id.}
11 |
12 | \item{api_key}{Character string with the API key you get from mediacloud.org.
13 |               An API key is required; it can either be passed explicitly
14 |               or read from the MEDIACLOUD_API_KEY environment variable.}
15 | }
16 | \value{
17 | Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions.
18 | }
19 | \description{
20 | \code{get_story} returns news stories by their id. One story represents
21 | one online publication. Each story refers to a single URL from any feed
22 | within a single media source.
23 | }
24 | \examples{
25 | \dontrun{
26 | story <- get_story(story_id = 604L)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/man/get_story_list.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/get_story_list.R
3 | \name{get_story_list}
4 | \alias{get_story_list}
5 | \title{Get story list}
6 | \usage{
7 | get_story_list(last_process_stories_id = 0L, rows = 100,
8 | feeds_id = NULL, q = NULL, fq = NULL,
9 | sort = "processed_stories_id", wc = FALSE, show_feeds = FALSE,
10 | api_key = Sys.getenv("MEDIACLOUD_API_KEY"))
11 | }
12 | \arguments{
13 | \item{last_process_stories_id}{Return stories in which the
14 | processed_stories_id is greater than this
15 | value.}
16 |
17 | \item{rows}{Number of stories to return, max 1000.}
18 |
19 | \item{feeds_id}{Return only stories that match the given feeds_id, sorted
20 |                by descending publish date.}
21 |
22 | \item{q}{If specified, return only results that match the given Solr query.
23 | Only one q parameter may be included.}
24 |
25 | \item{fq}{If specified, filter results by the given Solr query. More than one
26 |          fq parameter may be included.}
27 |
28 | \item{sort}{Returned results sort order. Supported values:
29 | processed_stories_id, random}
30 |
31 | \item{wc}{If set to TRUE, include a 'word_count' field with each story that
32 | includes a count of the most common words in the story}
33 |
34 | \item{show_feeds}{If set to TRUE, include a 'feeds' field with a list of the
35 | feeds associated with this story}
36 |
37 | \item{api_key}{Character string with the API key you get from mediacloud.org.
38 |               An API key is required; it can either be passed explicitly
39 |               or read from the MEDIACLOUD_API_KEY environment variable.}
40 | }
41 | \value{
42 | Data frame with results. See \url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#stories} for field descriptions.
43 | }
44 | \description{
45 | \code{get_story_list} returns a list of stories based on a multifaceted query. One
46 | story represents one online publication. Each story refers to a single URL
47 | from any feed within a single media source.
48 | }
49 | \examples{
50 | \dontrun{
51 | stories <- get_story_list()
52 | stories <- get_story_list(q = "Trump")
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/man/meta_data_html.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/meta_data_html.R
3 | \docType{data}
4 | \name{meta_data_html}
5 | \alias{meta_data_html}
6 | \title{HTML document to test \code{extract_meta_data}}
7 | \format{An object of class \code{character} of length 1.}
8 | \usage{
9 | meta_data_html
10 | }
11 | \description{
12 | An HTML document with basic meta tags for open-graph, twitter and native
13 | meta data.
14 | }
15 | \keyword{datasets}
16 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(mediacloudr)
3 | test_check("mediacloudr")
4 |
--------------------------------------------------------------------------------
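To run this suite locally, a minimal sketch assuming `devtools` is installed and the working directory is the package root:

```r
# run all testthat tests for the package
devtools::test()
```

--------------------------------------------------------------------------------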
/tests/testthat/test_extract_meta_data.R:
--------------------------------------------------------------------------------
1 | context("Extract Meta Data")
2 |
3 | # invalid inputs ---------------------------------------------------------------
4 | testthat::test_that(
5 | "test that function returns error if no html document provided", {
6 | testthat::expect_error(mediacloudr::extract_meta_data(),
7 |                            regexp = "Please define an HTML document.")
8 | })
9 |
10 | # test html document -----------------------------------------------------------
11 | html_doc <- mediacloudr::meta_data_html
12 | meta_data <- list(
13 | open_graph = list(
14 | url = "https://github.com/jandix/mediacloudr",
15 | type = "article",
16 | title = "Hello world!",
17 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940",
18 | description = "mediacloudr hello world test file"
19 | ),
20 | twitter = list(
21 | url = "https://github.com/jandix/mediacloudr",
22 | title = "Hello world!",
23 | description = "mediacloudr hello world test file",
24 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940",
25 | image_alt = "Trees",
26 | card = "summary_large_image"
27 | ),
28 | native = list (
29 | title = "Hello world!",
30 | description = "mediacloudr hello world test file",
31 | image = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940",
32 | thumbnail = "https://images.pexels.com/photos/113338/pexels-photo-113338.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=50"
33 | )
34 | )
35 | testthat::test_that(
36 |   "test that function parses the document correctly", {
37 | testthat::expect_equal(mediacloudr::extract_meta_data(html_doc = html_doc), meta_data)
38 | })
39 |
40 | # test get_story_list result set -----------------------------------------------
41 | testthat::test_that(
42 |   "test that function returns exactly 100 rows", {
43 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
44 | message = "API key not available in environment. Skipping test.")
45 | example_result <- mediacloudr::get_story_list()
46 | testthat::expect_equal(nrow(example_result), 100,
47 |                            info = paste0("100 rows expected, but got ", nrow(example_result), " row(s)."))
48 | })
49 | testthat::test_that(
50 | "test that function returns 15 columns", {
51 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
52 | message = "API key not available in environment. Skipping test.")
53 | example_result <- mediacloudr::get_story_list()
54 | testthat::expect_equal(ncol(example_result), 15,
55 | info = paste0("15 cols expected, but got ", ncol(example_result), " col(s)."))
56 | })
57 |
--------------------------------------------------------------------------------
/tests/testthat/test_get_media_source.R:
--------------------------------------------------------------------------------
1 | context("Get Media Source")
2 |
3 | # invalid inputs ---------------------------------------------------------------
4 | testthat::test_that(
5 | "test that function returns error if no media_id provided", {
6 | testthat::expect_error(mediacloudr::get_media_source(),
7 | regexp = "Please define a media id.")
8 | })
9 |
10 | testthat::test_that(
11 | "test that function returns error if media_id is float", {
12 | testthat::expect_error(mediacloudr::get_media_source(media_id = 6.496),
13 | regexp = "Please provide a positive integer for media id.")
14 | })
15 |
16 | testthat::test_that(
17 | "test that function returns error if media_id is character", {
18 | testthat::expect_error(mediacloudr::get_media_source(media_id = "abs"),
19 | regexp = "Please provide a positive integer for media id.")
20 | })
21 |
22 | testthat::test_that(
23 | "test that function returns error if media_id negative", {
24 | testthat::expect_error(mediacloudr::get_media_source(media_id = -2L),
25 | regexp = "Please provide a positive integer for media id.")
26 | })
27 |
28 | testthat::test_that(
29 |   "test that function returns error if no api_key provided", {
30 | testthat::expect_error(mediacloudr::get_media_source(media_id = 604L, api_key = ""),
31 | regexp = "Please define an API key.")
32 | })
33 |
34 | # test result set --------------------------------------------------------------
35 | testthat::test_that(
36 | "test that function returns only one row", {
37 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
38 | message = "API key not available in environment. Skipping test.")
39 | example_result <- mediacloudr::get_media_source(1L)
40 | testthat::expect_equal(nrow(example_result), 1,
41 | info = paste0("1 row expected, but got ", nrow(example_result), " row(s)."))
42 | })
43 | testthat::test_that(
44 |   "test that function returns 10 columns", {
45 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
46 | message = "API key not available in environment. Skipping test.")
47 | example_result <- mediacloudr::get_media_source(1L)
48 | testthat::expect_equal(ncol(example_result), 10,
49 |                            info = paste0("10 cols expected, but got ", ncol(example_result), " col(s)."))
50 | })
51 |
--------------------------------------------------------------------------------
/tests/testthat/test_get_story.R:
--------------------------------------------------------------------------------
1 | context("Get Story")
2 |
3 | # invalid inputs ---------------------------------------------------------------
4 | testthat::test_that(
5 | "test that function returns error if no story_id provided", {
6 | testthat::expect_error(mediacloudr::get_story(),
7 | regexp = "Please define a story id.")
8 | })
9 |
10 | testthat::test_that(
11 | "test that function returns error if story_id is float", {
12 | testthat::expect_error(mediacloudr::get_story(story_id = 6.496),
13 | regexp = "Please provide a positive integer for story id.")
14 | })
15 |
16 | testthat::test_that(
17 | "test that function returns error if story_id is character", {
18 | testthat::expect_error(mediacloudr::get_story(story_id = "abs"),
19 | regexp = "Please provide a positive integer for story id.")
20 | })
21 |
22 | testthat::test_that(
23 | "test that function returns error if story_id negative", {
24 | testthat::expect_error(mediacloudr::get_story(story_id = -2L),
25 | regexp = "Please provide a positive integer for story id.")
26 | })
27 |
28 | testthat::test_that(
29 | "test that function returns error if no api_key provided", {
30 | testthat::expect_error(mediacloudr::get_story(story_id = 604L, api_key = ""),
31 | regexp = "Please define an API key.")
32 | })
33 |
34 | # test result set --------------------------------------------------------------
35 | testthat::test_that(
36 | "test that function returns only one row", {
37 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
38 | message = "API key not available in environment. Skipping test.")
39 | example_result <- mediacloudr::get_story(27456565L)
40 | testthat::expect_equal(nrow(example_result), 1,
41 | info = paste0("1 row expected, but got ", nrow(example_result), " row(s)."))
42 | })
43 | testthat::test_that(
44 | "test that function returns 15 columns", {
45 | testthat::skip_if(nchar(Sys.getenv("MEDIACLOUD_API_KEY")) == 0,
46 | message = "API key not available in environment. Skipping test.")
47 | example_result <- mediacloudr::get_story(27456565L)
48 | testthat::expect_equal(ncol(example_result), 15,
49 |                            info = paste0("15 cols expected, but got ", ncol(example_result), " col(s)."))
50 | })
51 |
--------------------------------------------------------------------------------
/tests/testthat/test_get_story_list.R:
--------------------------------------------------------------------------------
1 | context("Get Story List")
2 |
3 | # invalid inputs ---------------------------------------------------------------
4 | testthat::test_that(
5 |   "test that function returns error if last_process_stories_id is float", {
6 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = 6.496),
7 | regexp = "Please provide a positive integer for last process stories id")
8 | })
9 |
10 | testthat::test_that(
11 |   "test that function returns error if last_process_stories_id is character", {
12 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = "abs"),
13 | regexp = "Please provide a positive integer for last process stories id")
14 | })
15 |
16 | testthat::test_that(
17 |   "test that function returns error if last_process_stories_id negative", {
18 | testthat::expect_error(mediacloudr::get_story_list(last_process_stories_id = -2L),
19 | regexp = "Please provide a positive integer for last process stories id")
20 | })
21 |
22 | testthat::test_that(
23 | "test that function returns error if no api_key provided", {
24 | testthat::expect_error(mediacloudr::get_story_list(api_key = ""),
25 | regexp = "Please define an API key.")
26 | })
27 |
28 | testthat::test_that(
29 |   "test that function returns error if rows smaller than 1", {
30 | testthat::expect_error(mediacloudr::get_story_list(rows = 0),
31 | regexp = "Rows should be larger than 0 and smaller or equal to 1000.")
32 | })
33 |
34 | testthat::test_that(
35 |   "test that function returns error if rows larger than 1000", {
36 | testthat::expect_error(mediacloudr::get_story_list(rows = 1001),
37 | regexp = "Rows should be larger than 0 and smaller or equal to 1000.")
38 | })
39 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/extract_meta_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Extract Social Media Meta Data"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{extract_meta_data}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | We start by loading the required packages. `mediacloudr` is used to download
11 | an article based on an article id received from the
12 | [Media Cloud Topic Mapper](https://topics.mediacloud.org/). Furthermore,
13 | `mediacloudr` provides a function to extract social media meta data from
14 | HTML documents. `httr` is used to turn R into an HTTP client to download and
15 | process the corresponding article page. We use `xml2` to parse the HTML
16 | document - make it readable for R - and `rvest` to find elements of interest
17 | within the HTML document.
18 |
19 | ```{r setup, eval=FALSE}
20 | # load required packages
21 | library(mediacloudr)
22 | library(httr)
23 | library(xml2)
24 | library(rvest)
25 | ```
26 |
27 | In the first step, we request the article with the id `1126843780`. It is
28 | important to add the upper case `L` to the number to turn the numeric type into
29 | an integer type. Otherwise, the function will throw an error. The article was
30 | selected with the help of the
31 | [Media Cloud Topic Mapper](https://topics.mediacloud.org/) online tool. If you
32 | created an account, you can create and analyze your own topics.
33 |
34 | ```{r get_article, eval=FALSE}
35 | # define media id as integer
36 | story_id <- 1126843780L
37 | # download article
38 | article <- get_story(story_id = story_id)
39 | ```
40 |
41 | The USA Today [news article](https://eu.usatoday.com/story/news/2018/12/27/ice-drops-off-migrants-phoenix-greyhound-bus-station/2429545002/) comes with a URL which we can use to download the
42 | complete article using the `httr` package. We use the `GET` function to
43 | download the article. Afterwards, we extract the website using the `content`
44 | function. It is important to provide the `type` argument to extract the text
45 | only. Otherwise, the function tries to guess the type and will automatically
46 | parse the content based on the `content-type` HTTP header. The author of the
47 | `httr` package
48 | [suggests](https://CRAN.R-project.org/package=httr/vignettes/quickstart.html)
49 | to manually parse the content. In this case, we use the `read_html` function
50 | which is provided in the `xml2` package.
51 |
52 | ```{r download_website, eval=FALSE}
53 | # download article
54 | response <- GET(article$url[1])
55 | # extract article html
56 | html_document <- content(response, type = "text", encoding = "UTF-8")
57 | # parse website
58 | parsed_html <- read_html(html_document)
59 | ```
60 |
61 | After parsing the response into an R-readable format, we extract the actual
62 | body of the article. To do so, we use the `html_nodes` function to find the
63 | HTML tags defined in the `css` argument. A useful open-source tool to find the
64 | corresponding tags or css classes is the
65 | [Selector Gadget](https://selectorgadget.com/). Alternatively, you can use the
66 | developer tools of your browser. The `html_text` function provides us with a
67 | character vector. Each element contains a paragraph of the article. We use the
68 | `paste` function to merge the paragraphs into one continuous text. We could
69 | then analyze the text using metrics such as word frequencies or sentiment
70 | analysis.
71 |
72 | ```{r body_content, eval=FALSE}
73 | # extract article body
74 | article_body_nodes <- html_nodes(x = parsed_html, css = ".content-well div p")
75 | article_body <- html_text(x = article_body_nodes)
76 | # paste character vector to one text
77 | article_body <- paste(article_body, collapse = " ")
78 | ```
79 |
80 | In the last step, we extract the social media meta data from the article. Social
81 | media meta data are shown if the article URL is shared on social media. The
82 | article representation usually includes a heading, a summary and a small
83 | image/thumbnail. The `extract_meta_data` function expects a raw HTML document and
84 | provides [Open Graph](http://ogp.me/) (a standard introduced by Facebook), [Twitter](https://developer.twitter.com/en/docs/tweets/optimize-with-cards/overview/markup.html)
85 | and native meta data.
86 |
87 | ```{r get_meta_data, eval=FALSE}
88 | # extract meta data from html document
89 | meta_data <- extract_meta_data(html_doc = html_document)
90 | ```
91 |
92 | Open Graph Title: *"ICE drops off migrants at Phoenix bus station"*
93 |
94 | Article Title (provided by mediacloud.org): *"Arizona churches working to help
95 | migrants are 'at capacity' or 'tapped out on resources'"*
96 |
97 | The meta data can be compared to the original content of the article. A short
98 | analysis reveals that USA Today chose a different heading to advertise the
99 | article on Facebook. Larger analyses can use quantitative tools such as string
100 | similarity measures, as provided for example by the `stringdist` package.
101 |
--------------------------------------------------------------------------------
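As a sketch of the comparison suggested at the end of the vignette, assuming the `stringdist` package is installed (`og_title` and `article_title` are the two headings quoted above):

```r
library(stringdist)

og_title      <- "ICE drops off migrants at Phoenix bus station"
article_title <- paste("Arizona churches working to help migrants are",
                       "'at capacity' or 'tapped out on resources'")

# Jaro-Winkler similarity in [0, 1]; values far below 1 indicate different headings
stringsim(og_title, article_title, method = "jw")
```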