├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── aaa.r ├── gd-top-trending.R ├── list-chyrons.r ├── networks.r ├── newsflash-package.R ├── newsflash.r ├── third-eye.r ├── top-tending-range.r ├── word-cloud.R └── zzz.R ├── README.Rmd ├── README.md ├── README_cache └── gfm │ ├── __packages │ ├── unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.RData │ ├── unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdb │ ├── unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdx │ ├── unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.RData │ ├── unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdb │ ├── unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdx │ ├── unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.RData │ ├── unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdb │ ├── unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdx │ ├── unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.RData │ ├── unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdb │ ├── unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdx │ ├── unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.RData │ ├── unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdb │ ├── unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdx │ ├── unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.RData │ ├── unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdb │ └── unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdx ├── README_files ├── figure-gfm │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-4-1.png │ ├── unnamed-chunk-6-1.png │ └── unnamed-chunk-7-1.png ├── figure-markdown_github-ascii_identifiers │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-11-1.png │ ├── unnamed-chunk-4-1.png │ └── unnamed-chunk-9-1.png └── figure-markdown_github │ ├── unnamed-chunk-10-1.png │ ├── unnamed-chunk-5-1.png │ ├── unnamed-chunk-6-1.png │ ├── unnamed-chunk-7-1.png │ ├── unnamed-chunk-8-1.png │ └── unnamed-chunk-9-1.png ├── man ├── gd_top_trending.Rd ├── iatv_top_trending.Rd ├── list_chyrons.Rd ├── list_networks.Rd ├── newsflash.Rd ├── query_tv.Rd ├── read_chyrons.Rd └── word_cloud.Rd ├── newsflash.Rproj └── tests ├── test-all.R └── testthat └── test-newsflash.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.*Rmd$ 5 | ^README\.*html$ 6 | ^NOTES\.*Rmd$ 7 | ^NOTES\.*html$ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Rproj 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | warnings_are_errors: true 3 | sudo: required 4 | 5 | cache: packages 6 | 7 | r: 8 | - oldrel 9 | - release 10 | - devel 11 | 12 | apt_packages: 13 | - libv8-dev 14 | - xclip 15 | 16 | env: 17 | global: 18 | - CRAN: http://cran.rstudio.com 19 | 20 | notifications: 21 | email: 22 | - bob@rud.is 23 | irc: 24 | channels: 25 | - "104.236.112.222#builds" 26 | nick: travisci 27 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: newsflash 2 | Type: Package 3 | Title: Tools to Work with the Internet Archive and GDELT Television Explorer 4 | Version: 
0.6.0 5 | Date: 2017-10-01 6 | Author: Bob Rudis (bob@rud.is) 7 | Authors@R: c( 8 | person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 9 | comment = c(ORCID = "0000-0001-5670-2640")), 10 | person("Abe", "Neuwirth", role = c("ctb")), 11 | person("Mike", "Gruszczynski", role = c("ctb")) 12 | ) 13 | Encoding: UTF-8 14 | Maintainer: Bob Rudis <bob@rud.is> 15 | Description: The 'GDELT' Television Explorer () 16 | provides a simple and straightforward interface for searching through current and historical 17 | closed-caption records from television news sources all across the globe. Functions are 18 | provided to query and tidy this data for more in-depth analyses. 19 | URL: https://github.com/hrbrmstr/newsflash 20 | BugReports: https://github.com/hrbrmstr/newsflash/issues 21 | License: AGPL 22 | Suggests: 23 | testthat 24 | Depends: 25 | R (>= 3.2.0) 26 | Imports: 27 | httr, 28 | jsonlite, 29 | dplyr, 30 | tibble, 31 | tidyr, 32 | lubridate, 33 | anytime, 34 | rvest, 35 | xml2, 36 | stringi, 37 | DT, 38 | scales, 39 | purrr, 40 | tidytext, 41 | curl, 42 | txtplot, 43 | readr, 44 | utils 45 | RoxygenNote: 6.0.1.9000 46 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(gd_top_trending) 4 | export(iatv_top_trending) 5 | export(list_chyrons) 6 | export(list_networks) 7 | export(query_tv) 8 | export(read_chyrons) 9 | export(word_cloud) 10 | import(httr) 11 | importFrom(DT,datatable) 12 | importFrom(curl,curl_fetch_memory) 13 | importFrom(dplyr,"%>%") 14 | importFrom(dplyr,arrange) 15 | importFrom(dplyr,as_data_frame) 16 | importFrom(dplyr,count) 17 | importFrom(dplyr,data_frame) 18 | importFrom(dplyr,mutate) 19 | importFrom(dplyr,progress_estimated) 20 | importFrom(dplyr,select) 21 | importFrom(dplyr,tbl_df) 22 | importFrom(jsonlite,fromJSON) 23 | importFrom(lubridate,is.Date) 24 | importFrom(lubridate,ymd_hms) 25 | importFrom(purrr,"%||%") 26 | importFrom(purrr,discard) 27 | importFrom(purrr,keep) 28 | importFrom(purrr,map) 29 | importFrom(purrr,map_df) 30 | importFrom(purrr,safely) 31 | importFrom(readr,cols) 32 | importFrom(readr,read_tsv) 33 | importFrom(rvest,html_attr) 34 | importFrom(rvest,html_nodes) 35 | importFrom(rvest,html_text) 36 | importFrom(scales,comma) 37 | importFrom(stringi,stri_match_all_regex) 38 | importFrom(stringi,stri_read_lines) 39 | importFrom(stringi,stri_replace_all_regex) 40 | importFrom(stringi,stri_split_fixed) 41 | importFrom(tidyr,unnest) 42 | importFrom(tidytext,unnest_tokens) 43 | importFrom(txtplot,txtbarchart) 44 | importFrom(utils,browseURL) 45 | importFrom(xml2,read_html) 46 | importFrom(xml2,read_xml) 47 | importFrom(xml2,xml_attr) 48 | importFrom(xml2,xml_find_all) 49 | importFrom(xml2,xml_find_first) 50 | importFrom(xml2,xml_text) 51 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | * add functions to work with the Third Eye chyron scraper archive 3 | 4 | 0.5.0 5 | * add `top_trending()` 6 | * add `top_trending_ranged()` 7 | 8 | 0.4.1 9 | * add `txtplot` to `DESCRIPTION`; Fixes #2 10 | 11 | 0.4.0 12 | * had to switch to `curl` direct calls since `httr` was being silly on large JSON results 13 | * sub out `anytime` for `lubridate` to handle hour resolution in `top_matches` 14 | * Handle support for new query features 15 | 16 | 0.3.0 17 | * `top_text()`
returns a tidy data frame by default 18 | 19 | 0.2.0 20 | * Some extra helper functions 21 | 22 | 0.1.0 23 | * Initial release 24 | -------------------------------------------------------------------------------- /R/aaa.r: -------------------------------------------------------------------------------- 1 | utils::globalVariables(c("station_values", "date_start", "date_end", "keyword", "network", "date_range", 2 | "station", "show", "show_date", "word", "snippet", ".x")) 3 | 4 | sfj <- purrr::safely(jsonlite::fromJSON) 5 | 6 | s_head <- purrr::safely(httr::HEAD) -------------------------------------------------------------------------------- /R/gd-top-trending.R: -------------------------------------------------------------------------------- 1 | #' Top Trending (GDELT) 2 | #' 3 | #' Retrieve current (last 15 minute) "top topics" being discussed on stations 4 | #' @export 5 | gd_top_trending <- function() { 6 | query_tv("", mode = "TrendingTopics") 7 | } 8 | 9 | -------------------------------------------------------------------------------- /R/list-chyrons.r: -------------------------------------------------------------------------------- 1 | #' Retrieve Third Eye chyron index 2 | #' 3 | #' Returns a data frame with available chyron dates & selected metadata. 4 | #' 5 | #' @md 6 | #' @return data frame with three columns: 7 | #' - `ts` (`Date`) chyron archive date 8 | #' - `type` (`character`) `raw` or `cleaned` 9 | #' - `size` (`numeric`) size of the feed file in bytes 10 | #' @export 11 | list_chyrons <- function() { 12 | 13 | doc <- xml2::read_xml("https://archive.org/download/third-eye/third-eye_files.xml") 14 | fils <- xml_find_all(doc, ".//file[contains(@name, 'tsv') and (contains(@name, '20'))]") 15 | 16 | fname <- xml_attr(fils, "name") 17 | 18 | data_frame( 19 | ts = as.Date(substr(fname, 1, 10)), 20 | type = ifelse(grepl("twe", fname), "cleaned", "raw"), 21 | size = as.numeric(xml_text(xml_find_first(fils, ".//size"))) 22 | ) %>% arrange(desc(ts)) 23 | 24 | } 25 | -------------------------------------------------------------------------------- /R/networks.r: -------------------------------------------------------------------------------- 1 | #' Helper function to identify station/network keyword and corpus date range for said market 2 | #' 3 | #' The \code{Station:}/\code{Network:}/\code{Market:} query qualifiers of \code{query_tv()} are picky so this helps you identify the 4 | #' keyword to use for the particular network/station. 5 | #' 6 | #' The list also shows the date ranges available for the captions, so you can use that as 7 | #' a guide when picking dates. 8 | #' 9 | #' In interactive mode it uses \code{DT::datatable()}.
You can force it to just display to 10 | #' the console by passing in \code{widget=FALSE} 11 | #' 12 | #' @export 13 | #' @param widget if `TRUE` then an HTML widget will be displayed to make it easier to 14 | #' sift through stations/networks 15 | #' @return data frame 16 | #' @examples 17 | #' list_networks() # widget 18 | #' print(list_networks(FALSE)) # no widget 19 | list_networks <- function(widget = interactive()) { 20 | 21 | xdf <- jsonlite::fromJSON("https://api.gdeltproject.org/api/v2/tv/tv?mode=stationdetails&format=json") 22 | 23 | xdf$station_details %>% 24 | mutate(StartDate = as.Date(anytime::anytime(StartDate))) %>% 25 | mutate(EndDate = as.Date(anytime::anytime(EndDate))) -> xdf 26 | 27 | if (widget) print(DT::datatable(xdf)) # htmlwidgets must be print()ed to display from inside a function 28 | 29 | class(xdf) <- c("tbl_df", "tbl", "data.frame") 30 | 31 | xdf 32 | 33 | } 34 | 35 | -------------------------------------------------------------------------------- /R/newsflash-package.R: -------------------------------------------------------------------------------- 1 | #' Tools to Work with the Internet Archive and GDELT Television Explorer 2 | #' 3 | #' @name newsflash 4 | #' @docType package 5 | #' @author Bob Rudis (bob@@rud.is) 6 | #' @import httr 7 | #' @importFrom readr read_tsv cols 8 | #' @importFrom rvest html_nodes html_attr html_text 9 | #' @importFrom stringi stri_match_all_regex stri_replace_all_regex stri_split_fixed stri_read_lines 10 | #' @importFrom xml2 read_html read_xml xml_find_all xml_text xml_attr xml_find_first 11 | #' @importFrom lubridate ymd_hms is.Date 12 | #' @importFrom tidyr unnest 13 | #' @importFrom dplyr tbl_df %>% mutate data_frame count as_data_frame select progress_estimated arrange 14 | #' @importFrom purrr map_df %||% safely map discard keep 15 | #' @importFrom jsonlite fromJSON 16 | #' @importFrom DT datatable 17 | #' @importFrom scales comma 18 | #' @importFrom txtplot txtbarchart 19 | #' @importFrom tidytext unnest_tokens 20 | #' @importFrom curl curl_fetch_memory 21 | #' @importFrom utils browseURL 22 | NULL 23 | -------------------------------------------------------------------------------- /R/newsflash.r: -------------------------------------------------------------------------------- 1 | #' Issue a query to the TV Explorer 2 | #' 3 | #' NOTE: The `mode` parameter controls what is returned. See the section on `Mode` for more information on available modes. 4 | #' 5 | #' @section Mode: 6 | #' 7 | #' This specifies the specific output you would like from the API, ranging from timelines to word clouds to clip galleries. 8 | #' 9 | #' - `TimelineVol`. (Default) This tracks how many results your search generates by day/hour over the selected time period, allowing you to assess the relative attention each station is paying to the topic and how that attention has varied over time. Using the DATANORM parameter you can control whether this reports results as raw clip counts or as normalized percentages of all coverage (the most robust way of comparing stations). By default, the timeline will not display the most recent 24 hours, since those results are still being generated (it can take up to 2-12 hours for a show to be processed by the Internet Archive and ready for analysis), but you can include those if needed via the LAST24 option. You can also smooth the timeline using the TIMELINESMOOTH option and combine all selected stations into a single time series using the DATACOMB option. 10 | #' - `StationChart`.
This compares how many results your search generates from each of the selected stations over the selected time period, allowing you to assess the relative attention each is paying to the topic. Using the DATANORM parameter you can control whether this reports results as raw clip counts or as normalized percentages of all coverage (the most robust way of comparing stations). 11 | #' - `TimelineVolNorm`. This displays the total airtime (in terms of 15 second clips) monitored from each of the stations in your query. It must be combined with a valid query, since it displays the airtime for the stations queried in the search. This mode can be used to identify brief monitoring outages or for advanced normalization, since it reports the total amount of clips monitored overall from each station in each day/hour. 12 | #' 13 | #' @section Queries: 14 | #' 15 | #' The GDELT TV API supports keyword and keyphrase searches, OR statements and a variety of advanced operators. NOTE – all of the operators below must be used as part of the value of the QUERY field, separated by spaces, and cannot be used as URL parameters on their own. 16 | #' 17 | #' - `""`. Anything found inside of quote marks is treated as an exact phrase search. Thus, you can search for "Donald Trump" to find all matches of his name. (e.g. `"donald trump"`) 18 | #' - `(a OR b)`. You can specify a list of keywords to be boolean OR'd together by enclosing them in parentheses and placing the capitalized word "OR" between each keyword or phrase. Boolean OR blocks cannot be nested at this time. For example, to search for mentions of Clinton, Sanders or Trump, you would use "`(clinton OR sanders OR trump)`" 19 | #' - `-`. You can place a minus sign in front of any operator, word or phrase to exclude it. For example "-sanders" would exclude results that contained "sanders" from your results. (e.g. `-sanders`) 20 | #' - `Context`. By default all of your keywords/phrases must appear in a single 15 second clip. (Phrases are allowed to span across two clips and are counted towards the clip they started in.) The "context" operator allows you to require that a given keyword/phrase appears either in the 15 second clip or in the 15 second clips immediately before or after it. This gives you a bit of additional search fuzziness. Even when searching for a single word, it must appear in quote marks. (e.g. `context:"russia"`) 21 | #' - `Market`. This narrows your search to a particular geographic market. The list of available markets can be found via the Station Details mode (look for the city name in the description of local stations). Example markets include "San Francisco" and "Philadelphia". The market name must be enclosed in quote marks. You can also use the special reserved market "National" to search the major national networks together. (e.g. `market:"San Francisco"`) 22 | #' - `Network`. This narrows your search to a particular television network. The list of available networks can be found via the Station Details mode (look for the network name in the description of local stations). Example networks include "CBS" and "NBC". Do not use quote marks around the network name. (e.g. `network:CBS`) 23 | #' - `Show`. This narrows your search to a particular television show. This must be the complete show name as returned by the TV API. To find a particular show, search the API and use the "clipgallery" mode to display matching clips and their source show.
For example, to limit your search to the show Hardball With Chris Matthews, you'd search for "show:"Hardball With Chris Matthews"". Note that you must surround the show name with quote marks. Remember that the TV API only searches shows monitored by the Internet Archive's Television News Archive, which may not include all shows. (e.g. `show:"Hardball With Chris Matthews"`) 24 | #' - `Station`. This narrows your search to a particular television station. Remember that the TV API only searches stations monitored by the Internet Archive's Television News Archive and not all of those stations have been monitored for the entire 2009-present time period. Do not use quote marks around the name of the station. To find the Station ID of a particular station, use the Station Details mode. (e.g. `station:CNN`) 25 | #' 26 | #' @md 27 | #' @param query query string in GDELT format. See `QUERY` in https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/ 28 | #' for details; use [list_networks()] to obtain valid station/network identifiers. If 29 | #' no `Network:`, `Market:` or `Station:` qualifiers are found `Market:"National"` is automatically added. 30 | #' @param mode See `Mode` section 31 | #' @param start_date,end_date start/end dates. Leaving both `NULL` searches all archive history. 32 | #' Leaving just `start_date` `NULL` sets the start date to July 2009. Leaving just `end_date` 33 | #' `NULL` sets the end date to today. 34 | #' @param datanorm normalized ("`perc`") vs "`raw`" counts; defaults to `perc`. 35 | #' @param timelinesmooth a smoothing value applying moving averages over 15-minute increments 36 | #' @param datacomb if "`combined`", all network volume is combined into a single value. 37 | #' Defaults to "`separate`". 38 | #' @param last_24 It can take the Internet Archive up to 24 hours to process a broadcast once 39 | #' it concludes. Thus, by default the TV API does not return results from the most recent 40 | #' 24 hours to ensure that analyses are not skewed by partial results. However, when 41 | #' tracking breaking news events, it may be desirable to view partial results with the 42 | #' understanding that any time or station-based trends may not accurately reflect the 43 | #' totality of their coverage. To include results from the most recent 24 hours, 44 | #' set this URL parameter to "yes". 
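#'
#' As an illustration of how these parameters combine, a fuller call might look
#' like the following (a sketch only; the query, station, and date values here
#' are arbitrary examples, not defaults):
#'
#' ```
#' query_tv(
#'   query = 'comey Station:CNN',
#'   mode = "TimelineVol",
#'   start_date = "2018-01-01",
#'   end_date = "2018-04-01",
#'   datanorm = "raw",
#'   timelinesmooth = 5,
#'   datacomb = "combined",
#'   last_24 = "no"
#' )
#' ```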
45 | #' @return Different objects for different `mode`s: 46 | #' - `TimelineVol` : a data frame with stations & counts (raw or normalized) 47 | #' - `TimelineVolNorm` : a data frame of station & topic airtime 48 | #' - `StationChart` : a data frame of stations and search result counts (raw or normalized) 49 | #' @references <https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/> 50 | #' @export 51 | #' @examples 52 | #' query_tv("(terror OR isis)") 53 | #' query_tv("british prime minister") 54 | #' query_tv("mexican president") 55 | query_tv <- function(query, 56 | mode = c("TimelineVol", "StationChart", "TimelineVolNorm"), 57 | start_date = NULL, 58 | end_date = NULL, 59 | datanorm = c("perc", "raw"), 60 | timelinesmooth = 0, 61 | datacomb = c("separate", "combined"), 62 | last_24 = c("yes", "no")) { 63 | 64 | if (!grepl("Network:|Market:|Station:", query, ignore.case = TRUE)) { 65 | query <- sprintf('%s Market:"National"', query) 66 | } 67 | 68 | mode <- mode[1] 69 | 70 | if (!(mode %in% c("TimelineVol", "ClipGallery", "StationChart", 71 | "TimelineVolNorm", "TrendingTopics", "WordCloud"))) { 72 | stop("Invalid 'mode'", call.=FALSE) 73 | } 74 | 75 | datanorm <- match.arg(datanorm, c("perc", "raw")) 76 | 77 | datacomb <- match.arg(datacomb, c("separate", "combined")) 78 | if (datacomb == "separate") datacomb <- NULL 79 | 80 | last_24 <- match.arg(last_24, c("yes", "no")) 81 | if (last_24 == "no") last_24 <- NULL 82 | 83 | if (is.null(start_date)) start_date <- as.Date("2009-07-02") 84 | if (is.null(end_date)) end_date <- Sys.Date() 85 | 86 | start_date <- as.POSIXct(start_date) 87 | end_date <- as.POSIXct(end_date) 88 | 89 | start_date <- format(start_date, "%Y%m%d%H%M%S") 90 | end_date <- format(end_date, "%Y%m%d%H%M%S") 91 | 92 | list( 93 | query = query, 94 | mode = mode, 95 | format = "json", 96 | datanorm = datanorm, 97 | datacomb = datacomb, 98 | startdatetime = start_date, 99 | enddatetime = end_date, 100 | timelinesmooth = timelinesmooth, 101 | last24 = last_24 102 | ) -> query 103 | 104 | if (mode == "ClipGallery") query$maxresults <- 3000L 105 | 106 | httr::GET( 107 | url = "https://api.gdeltproject.org/api/v2/tv/tv", 108 | query = query 109 | ) -> res 110 | 111 | if (!(res$status_code < 300)) { 112 | stop(sprintf("[%s] Query or API error on request [%s]", res$status_code, res$url), call.=FALSE) 113 | } 114 | 115 | res <- httr::content(res) 116 | 117 | if (mode %in% c("TimelineVol", "TimelineVolNorm")) { 118 | 119 | tibble::data_frame( 120 | network = res$timeline %>% purrr::map_chr("series"), 121 | data = res$timeline %>% purrr::map("data")) %>% 122 | tidyr::unnest(data) %>% 123 | dplyr::mutate( 124 | date = data %>% 125 | purrr::map_chr("date") %>% 126 | # hourly data doesn't have times - API doesn't return time values - split off non-hms data from date-time 127 | #sapply(., function(x) strsplit(x, "T")[[1]][1]) %>% 128 | lubridate::ymd_hms(), 129 | value = data %>% purrr::map_dbl("value") 130 | ) %>% 131 | dplyr::select(-data) 132 | 133 | } else if (mode == "ClipGallery") { 134 | 135 | purrr::map_df(res$clips, ~.x) %>% 136 | dplyr::mutate(date = anytime::anydate(date)) %>% 137 | dplyr::mutate(show_date = anytime::anydate(show_date)) 138 | 139 | } else if (mode == "StationChart") { 140 | 141 | purrr::map_df(res$stationchart, ~.x) 142 | 143 | } else if (mode == "TrendingTopics") { 144 | 145 | list( 146 | overall_trending_topics = unlist(res$OverallTrendingTopics, use.names = FALSE), 147 | station_trending_topics = purrr::map_df(res$StationTrendingTopics, ~{ 148 | dplyr::data_frame( 149 | station = .x$Station, 150 | topic = unlist(.x$Topics,
use.names = FALSE) 151 | ) 152 | }), 153 | station_top_topics = purrr::map_df(res$StationTopTopics, ~{ 154 | dplyr::data_frame( 155 | station = .x$Station, 156 | topic = unlist(.x$Topics, use.names = FALSE) 157 | ) 158 | }), 159 | overall_trending_phrases = unlist(res$OverallTrendingPhrases, use.names=FALSE) 160 | ) 161 | 162 | } else if (mode == "WordCloud") { 163 | 164 | purrr::map_df(res$wordcloud, ~.x) 165 | 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /R/third-eye.r: -------------------------------------------------------------------------------- 1 | readr::cols( 2 | ts = readr::col_datetime(format = ""), 3 | channel = readr::col_character(), 4 | duration = readr::col_integer(), 5 | details = readr::col_character(), 6 | text = readr::col_character() 7 | ) -> .third_eye_cols 8 | 9 | .third_eye_col_names <- c("ts", "channel", "duration", "details", "text") 10 | .third_eye_url_tmpl <- "https://archive.org/download/third-eye/%s%s.tsv" 11 | 12 | #' Retrieve TV News Archive chyrons from the Internet Archive's Third Eye project 13 | #' 14 | #' The TV News Archive's Third Eye project captures the chyrons–or narrative text–that appear on the lower third of TV news screens and turns them into downloadable data and a Twitter feed for research, journalism, online tools, and other projects. At project launch (September 2017) we are collecting chyrons from BBC News, CNN, Fox News, and MSNBC–more than four million collected over just two weeks. Chyrons have public value because: 15 | #' - Breaking news often appears on chyrons before TV newscasters begin reporting or video is available, whether it's a hurricane or a breaking political story. 16 | #' - Which chyrons a TV news network chooses to display can reveal editorial decisions that can inform public understanding of how news is filtered for different audiences. 17 | #' - Providing chyrons as data–and also on Twitter–in near real-time can serve as an alert system, showing how TV news stations are reporting the news. Often the chyrons are ahead of the general conversation on Twitter. 18 | #' 19 | #' Some notes on the data: 20 | #' 21 | #' - chyrons are derived in near real-time from the TV News Archive's collection of TV news. The constantly updating public collection contains 1.4 million TV news shows, some dating back to 2009. 22 | #' - At launch, Third Eye captures four TV cable news channels: BBC News, CNN, Fox News, and MSNBC. 23 | #' - Data can be affected by temporary collection outages, which typically can last minutes or hours, but rarely more. 24 | #' - Dates/times are in UTC (Coordinated Universal Time). 25 | #' - Because the size of the raw data is so large (about 20 megabytes per day), results are limited to seven days per request. 26 | #' - Raw data collection began on August 25, 2017; the clean feed begins on September 7, 2017. 27 | #' - "`duration`" column is in seconds–the amount of time that particular chyron appeared on the screen. 28 | #' 29 | #' @md 30 | #' @note It is _highly_ recommended that you use the "clean" feed unless you're researching 31 | #' how to correct text. This package does its best to read in the raw feed but 32 | #' it often contains embedded nulls and non-standard text encodings which 33 | #' make it difficult to process. 34 | #' @param chyron_day archive day (`Date` or `character`; if `character` should be 35 | #' in `YYYY-mm-dd` format) 36 | #' @param cleaned logical, default `TRUE`.
The "raw feed" option provides all of the 37 | #' OCR'ed text from chyrons at the rate of approximately one entry per second. 38 | #' The "clean feed" download provides the data feed that fuels the Third Eye 39 | #' Twitter bots; this has been filtered to find the most representative, 40 | #' clearest chyrons from a 60-second period, with no more than one entry/tweet per 41 | #' minute (though the duration may be shorter than 60 seconds). The clean feed 42 | #' relies on algorithms that are a work in progress. 43 | #' @return `NULL` on irrecoverable errors, otherwise a data frame with five columns: 44 | #' - `ts` (`POSIXct`) chyron timestamp 45 | #' - `channel` (`character`) news channel the chyron appeared on 46 | #' - `duration` (`integer`) see Description 47 | #' - `details` (`character`) Internet Archive details path 48 | #' - `text` (`character`) the chyron text 49 | #' @export 50 | read_chyrons <- function(chyron_day = Sys.Date()-1, cleaned = TRUE) { 51 | 52 | if (length(chyron_day) > 1) { 53 | message("Can only retrieve one day's archive at a time. Using first value.") 54 | chyron_day <- chyron_day[1] 55 | } 56 | 57 | if (inherits(chyron_day, "character")) { 58 | chyron_day <- as.Date(chyron_day) # ensure it's valid 59 | } 60 | 61 | chyron_day <- format(chyron_day, "%Y-%m-%d") 62 | 63 | archive_type <- if (cleaned) "-tweets" else "" 64 | 65 | archive_url <- sprintf(.third_eye_url_tmpl, chyron_day, archive_type) 66 | 67 | # see if it's there (s_head() is a safely() wrapper, so check its $error slot) 68 | res <- s_head(archive_url) 69 | if (!is.null(res$error)) { 70 | message(sprintf("Error reaching the Internet Archive [%s]", res$error$message)) 71 | return(NULL) 72 | } 73 | 74 | if (httr::status_code(res$result) != 200) { 75 | message(sprintf("Chyron archive request failed: [%s]", httr::http_status(res$result)$message)) 76 | return(NULL) 77 | } 78 | 79 | tf <- tempfile() 80 | utils::download.file(archive_url, tf, quiet = TRUE) 81 | if (cleaned) { 82 | third_eye <- read_tsv(tf, col_names = .third_eye_col_names, .third_eye_cols) 83 | } else { 84 | suppressWarnings(stri_read_lines(tf)) %>% 85 | stri_split_fixed("\t", simplify = TRUE) %>% 86 | as_data_frame() %>% 87 | purrr::set_names(c("ts", "channel", "duration", "details", "text")) %>% 88 | mutate(ts = lubridate::ymd_hms(ts)) -> third_eye 89 | } 90 | 91 | unlink(tf) 92 | 93 | third_eye 94 | 95 | } 96 | -------------------------------------------------------------------------------- /R/top-tending-range.r: -------------------------------------------------------------------------------- 1 | #' Top Trending Topics (Internet Archive TV Archive) 2 | #' 3 | #' Provide start & end times in current time zone and this function will generate 4 | #' the proper "every 15 minutes" snapshot timestamps, convert them to GMT values and issue the queries, 5 | #' returning a nested data frame of results. If you want more control, fetch the individual 15-minute snapshot JSON files directly. 6 | #' 7 | #' GDELT now generates a snapshot every 15 minutes that records all of the "top trending" 8 | #' tables into a single archive enabling users to look back over time at what was trending 9 | #' in 15 minute increments historically back to midnight on 2017-09-07. 10 | #' 11 | #' Note that the archives are generated every 15 minutes based on the television shows that 12 | #' have completed processing at that time.
It can take several hours for a show to be fully 13 | #' processed by the Internet Archive and available for processing, thus the presence/absence 14 | #' of a topic in these files should not be used to date it precisely to that 15 minute mark, 15 | #' but rather as a rough temporal indicator of what topics were trending up/down in that 16 | #' general time frame. For precise timelines, you should take a topic from this archive and 17 | #' run a search on it using the main Television Explorer interface, select a timeframe of 18 | #' 72 hours and use the resulting timeline to precisely date the topic's coverage (since 19 | #' the Explorer timeline is based on the broadcast timestamp of the show, even if it is 20 | #' processed hours later). 21 | #' 22 | #' @md 23 | #' @param from,to start and end date/time ranges (will auto-convert if properly formatted strings) 24 | #' @param .progress show a progress bar? Defaukts to `TRUE` if in an interactive session. 25 | #' @note The times are auto-converted to GMT 26 | #' @export 27 | #' @examples 28 | #' top_trending("2017-09-08 18:00", "2017-09-09 06:00") 29 | iatv_top_trending <- function(from, to, .progress=interactive()) { 30 | 31 | from <- anytime::anytime(from) 32 | to <- anytime::anytime(to) 33 | 34 | base_url <- "http://data.gdeltproject.org/gdeltv3/iatv_trending/%s.tvtrending.v3.15min.json" 35 | 36 | start_ymd <- format(from, "%Y-%m-%d") 37 | end_ymd <- format(to, "%Y-%m-%d") 38 | 39 | start_hr <- as.numeric(format(from, "%H")) 40 | end_hr <- as.numeric(format(to, "%H")) 41 | 42 | start_min <- as.numeric(format(from, "%M")) 43 | if (!start_min %in% c(0, 15, 30, 45)) start_min <- 0 44 | 45 | end_min <- as.numeric(format(to, "%M")) 46 | if (!end_min %in% c(0, 15, 30, 45)) end_min <- 45 47 | 48 | from <- as.POSIXct(sprintf("%s %02d:%02d:00", start_ymd, start_hr, start_min)) 49 | to <- as.POSIXct(sprintf("%s %02d:%02d:00", end_ymd, end_hr, end_min)) 50 | 51 | full_range <- seq(from, to, "15 mins") 52 | 53 | attr(full_range, "tzone") <- "GMT" 54 | 55 | url_list <- sprintf(base_url, format(full_range, "%Y%m%d%H%M00")) 56 | 57 | pb <- dplyr::progress_estimated(length(url_list)) 58 | purrr::map(url_list, ~{ 59 | if (.progress) pb$tick()$print() 60 | res <- sfj(.x, flatten=TRUE) 61 | res$result 62 | }) -> res 63 | 64 | res <- purrr::discard(res, is.null) 65 | 66 | purrr::map_df(res, ~{ 67 | 68 | date_gen <- .x[["DateGenerated:"]] 69 | suppressWarnings(date_gen <- lubridate::ymd_hms(date_gen)) 70 | suppressWarnings(attr(date_gen, "tzone") <- Sys.timezone()) 71 | 72 | dplyr::data_frame( 73 | ts = date_gen, 74 | overall_trending_topics = list(.x[["OverallTrendingTopics"]]), 75 | station_trending_topics = list(.x[["StationTrendingTopics"]]), 76 | station_top_topics = list(.x[["StationTopTopics"]]), 77 | overall_trending_phrases = list(.x[["OverallTrendingPhrases"]]) 78 | ) 79 | 80 | }) -> out 81 | 82 | out 83 | 84 | } 85 | 86 | -------------------------------------------------------------------------------- /R/word-cloud.R: -------------------------------------------------------------------------------- 1 | #' Retrieve top words that appear most frequently in clips matching your search 2 | #' 3 | #' The API takes the 200 most relevant clips that match your search and returns the 4 | #' terms for a "word cloud" of up to the top 200 most frequent words that appeared in 5 | #' those clips (common stop words are automatically removed). 
This is a powerful way 6 | #' of understanding the topics and words dominating the relevant coverage and 7 | #' suggesting additional contextual search terms to narrow or evolve your search. 8 | #' Note that if there are too few matching clips for your query, the word cloud may 9 | #' be blank. 10 | #' 11 | #' @md 12 | #' @param query query string in GDELT format. See `QUERY` in https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/ 13 | #' for details; use [list_networks()] to obtain valid station/network identifiers 14 | #' @param start_date,end_date start/end dates. Leaving both `NULL` searches all archive history. 15 | #' Leaving just `start_date` `NULL` sets the start date to July 2009. Leaving just `end_date` 16 | #' `NULL` sets the end date to today. 17 | #' @export 18 | word_cloud <- function(query, start_date = NULL, end_date = NULL) { 19 | 20 | query_tv( 21 | query = query, 22 | mode = "WordCloud", 23 | start_date = start_date, 24 | end_date = end_date 25 | ) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | 3 | packageStartupMessage("NOTE: There are breaking changes to the package API due to GDELT's v2 API") 4 | 5 | } -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: rmarkdown::github_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | *** BREAKING CHANGES *** 8 | 9 | # newsflash 10 | 11 | Tools to Work with the Internet Archive and GDELT Television Explorer 12 | 13 | ## Description 14 | 15 | Ref: 16 | 17 | - 18 | - 19 | 20 | TV Explorer: 21 | >_"In collaboration with the Internet Archive's Television News Archive, GDELT's Television Explorer allows you to keyword search the closed captioning streams of the Archive's 6 years of American television news and explore macro-level trends in how America's television news is shaping the conversation around key societal issues. Unlike the Archive's primary Television News interface, which returns results at the level of an hour or half-hour "show," the interface here reaches inside of those six years of programming and breaks the more than one million shows into individual sentences and counts how many of those sentences contain your keyword of interest. Instead of reporting that CNN had 24 hour-long shows yesterday that mentioned Donald Trump, the interface here will count how many sentences uttered on CNN yesterday mentioned his name - a vastly more accurate metric for assessing media attention."_ 22 | 23 | Third Eye: 24 | >_The TV News Archive's Third Eye project captures the chyrons–or narrative text–that appear on the lower third of TV news screens and turns them into downloadable data and a Twitter feed for research, journalism, online tools, and other projects. At project launch (September 2017) we are collecting chyrons from BBC News, CNN, Fox News, and MSNBC–more than four million collected over just two weeks."_ 25 | 26 | An advantage of using this over the TV Explorer interactive selector & downloader or Third Eye API is that you get tidy tibbles with this package, ready to use in R. 
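For example, the tidy (long) shape means a `query_tv()` result can be piped straight into a dplyr/ggplot2 workflow with no reshaping. A minimal sketch (the query and date below are arbitrary examples):

```r
library(newsflash)
library(dplyr)

# query_tv() returns one row per network/date with columns: network, date, value
query_tv("comey", start_date = "2018-04-01") %>%
  group_by(network) %>%
  summarise(total_volume = sum(value)) %>%
  arrange(desc(total_volume))
```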
27 | 28 | NOTE: While I don't claim that this alpha-package is anywhere near perfect, the IA/GDELT TV API hiccups every so often so when there are critical errors run the same query in their web interface before submitting an issue. I kept getting errors when searching all affiliate markets for the "mexican president" query that also generate errors on the web site when JSON is selected as output (it's fine on the web site if the choice is interactive browser visualizations). Submit those errors to them, not here. 29 | 30 | ## What's Inside The Tin 31 | 32 | The following functions are implemented: 33 | 34 | - `list_chyrons`: Retrieve Third Eye chyron index 35 | - `list_networks`: Helper function to identify station/network keyword and corpus date range for said market 36 | - `newsflash`: Tools to Work with the Internet Archive and GDELT Television Explorer 37 | - `query_tv`: Issue a query to the TV Explorer 38 | - `read_chyrons`: Retrieve TV News Archive chyrons from the Internet Archive's Third Eye project 39 | - `gd_top_trending`: Top Trending (GDELT) 40 | - `iatv_top_trending`: Top Trending Topics (Internet Archive TV Archive) 41 | - `word_cloud`: Retrieve top words that appear most frequently in clips matching your search 42 | 43 | ## Installation 44 | 45 | ```{r eval=FALSE} 46 | devtools::install_github("hrbrmstr/newsflash") 47 | ``` 48 | 49 | ```{r message=FALSE, warning=FALSE, error=FALSE} 50 | options(width=120) 51 | ``` 52 | 53 | ## Usage 54 | 55 | ```{r message=FALSE, warning=FALSE, error=FALSE} 56 | library(newsflash) 57 | library(ggalt) 58 | library(hrbrthemes) 59 | library(tidyverse) 60 | 61 | # current version 62 | packageVersion("newsflash") 63 | ``` 64 | 65 | ### "Third Eye" Chyrons are simpler so we'll start with them first: 66 | 67 | ```{r fig.width=8, fig.height=5, cache=TRUE} 68 | list_chyrons() 69 | 70 | ch <- read_chyrons("2018-04-13") 71 | 72 | mutate( 73 | ch, 74 | hour = lubridate::hour(ts), 75 | text = tolower(text), 76 | mention = grepl("comey", text) 77 | ) %>% 78 | filter(mention) %>% 79 | count(hour, channel) %>% 80 | ggplot(aes(hour, n)) + 81 | geom_segment(aes(xend=hour, yend=0), color = "lightslategray", size=1) + 82 | scale_x_continuous(name="Hour (GMT)", breaks=seq(0, 23, 6), 83 | labels=sprintf("%02d:00", seq(0, 23, 6))) + 84 | scale_y_continuous(name="# Chyrons", limits=c(0,20)) + 85 | facet_wrap(~channel, scales="free") + 86 | labs(title="Chyrons mentioning 'Comey' per hour per channel", 87 | caption="Source: Internet Archive Third Eye project & ") + 88 | theme_ipsum_rc(grid="Y") 89 | ``` 90 | 91 | ## Now for the TV Explorer: 92 | 93 | ### See what networks & associated corpus date ranges are available: 94 | 95 | ```{r} 96 | list_networks(widget=FALSE) 97 | ``` 98 | 99 | ### Basic search: 100 | 101 | ```{r fig.width=8, fig.height=7, cache=TRUE} 102 | comey <- query_tv('comey', start_date = "2018-04-01") 103 | 104 | comey 105 | 106 | query_tv('comey', start_date = "2018-04-01") %>% 107 | arrange(date) %>% 108 | ggplot(aes(date, value, group=network)) + 109 | ggalt::geom_xspline(aes(color=network)) + 110 | ggthemes::scale_color_tableau(name=NULL) + 111 | labs(x=NULL, y="Volume Metric", title="'Comey' Trends Across National Networks") + 112 | facet_wrap(~network) + 113 | theme_ipsum_rc(grid="XY") + 114 | theme(legend.position="none") 115 | ``` 116 | 117 | ```{r cache=TRUE} 118 | query_tv("comey Network:CNN", mode = "TimelineVol", start_date = "2018-01-01") %>% 119 | arrange(date) %>% 120 | ggplot(aes(date, value, group=network)) + 121 |
ggalt::geom_xspline(color="lightslategray") + 122 | ggthemes::scale_color_tableau(name=NULL) + 123 | labs(x=NULL, y="Volume Metric", title="'Comey' Trend on CNN") + 124 | theme_ipsum_rc(grid="XY") 125 | ``` 126 | 127 | ### Relative Network Attention To Syria since January 1, 2018 128 | 129 | ```{r cache=TRUE} 130 | query_tv('syria Market:"National"', mode = "StationChart", start_date = "2018-01-01") %>% 131 | arrange(desc(count)) %>% 132 | knitr::kable("markdown") 133 | ``` 134 | 135 | ### Video Clips 136 | 137 | ```{r cache=TRUE} 138 | clips <- query_tv('comey Market:"National"', mode = "ClipGallery", start_date = "2018-01-01") 139 | 140 | clips 141 | ``` 142 | 143 | `r clips$show_date[1]` | `r clips$station[1]` | `r clips$show[1]` 144 | 145 | 146 | 147 | `r clips$snippet[1]` 148 | 149 | ### "Word Cloud" (top associated words to the query) 150 | 151 | ```{r fig.height=8, fig.width=8, cache=TRUE} 152 | wc <- query_tv('hannity Market:"National"', mode = "WordCloud", start_date = "2018-04-13") 153 | 154 | ggplot(wc, aes(x=1, y=1)) + 155 | ggrepel::geom_label_repel(aes(label=label, size=count), segment.colour="#00000000", segment.size=0) + 156 | scale_size_continuous(trans="sqrt") + 157 | labs(x=NULL, y=NULL) + 158 | theme_ipsum_rc(grid="") + 159 | theme(axis.text=element_blank()) + 160 | theme(legend.position="none") 161 | ``` 162 | 163 | ### Last 15 Minutes Top Trending 164 | 165 | ```{r} 166 | gd_top_trending() 167 | ``` 168 | 169 | ### Top Overall Trending from the Internet Archive TV Archive (2017 and earlier) 170 | 171 | ```{r} 172 | iatv_top_trending("2017-12-01 18:00", "2017-12-02 06:00") 173 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | \*\*\* BREAKING CHANGES \*\*\* 3 | 4 | # newsflash 5 | 6 | Tools to Work with the Internet Archive and GDELT Television 7 | Explorer 8 | 9 | ## Description 10 | 11 | Ref: 12 | 13 | - 14 | - 15 | 16 | TV Explorer: 17 | 18 | >_“In collaboration with the Internet Archive’s 19 | Television News Archive, GDELT’s Television Explorer allows you to 20 | keyword search the closed captioning streams of the Archive’s 6 years of 21 | American television news and explore macro-level trends in how America’s 22 | television news is shaping the conversation around key societal issues. 23 | Unlike the Archive’s primary Television News interface, which returns 24 | results at the level of an hour or half-hour “show,” the interface here 25 | reaches inside of those six years of programming and breaks the more 26 | than one million shows into individual sentences and counts how many of 27 | those sentences contain your keyword of interest. Instead of reporting 28 | that CNN had 24 hour-long shows yesterday that mentioned Donald Trump, 29 | the interface here will count how many sentences uttered on CNN 30 | yesterday mentioned his name - a vastly more accurate metric for 31 | assessing media attention.”_ 32 | 33 | Third Eye: 34 | 35 | >_The TV News Archive’s Third Eye project captures the 36 | chyrons–or narrative text–that appear on the lower third of TV news 37 | screens and turns them into downloadable data and a Twitter feed for 38 | research, journalism, online tools, and other projects.
At project 39 | launch (September 2017) we are collecting chyrons from BBC News, CNN, 40 | Fox News, and MSNBC–more than four million collected over just two 41 | weeks.”_ 42 | 43 | An advantage of using this over the TV Explorer interactive selector & 44 | downloader or Third Eye API is that you get tidy tibbles with this 45 | package, ready to use in R. 46 | 47 | NOTE: While I don’t claim that this alpha-package is anywhere near 48 | perfect, the IA/GDELT TV API hiccups every so often so when there are 49 | critical errors run the same query in their web interface before 50 | submitting an issue. I kept getting errors when searching all affiliate 51 | markets for the “mexican president” query that also generate errors on 52 | the web site when JSON is selected as output (it’s fine on the web site 53 | if the choice is interactive browser visualizations). Submit those 54 | errors to them, not here. 55 | 56 | ## What’s Inside The Tin 57 | 58 | The following functions are implemented: 59 | 60 | - `list_chyrons`: Retrieve Third Eye chyron index 61 | - `list_networks`: Helper function to identify station/network keyword 62 | and corpus date range for said market 63 | - `newsflash`: Tools to Work with the Internet Archive and GDELT 64 | Television Explorer 65 | - `query_tv`: Issue a query to the TV Explorer 66 | - `read_chyrons`: Retrieve TV News Archive chyrons from the Internet 67 | Archive’s Third Eye project 68 | - `gd_top_trending`: Top Trending (GDELT) 69 | - `iatv_top_trending`: Top Trending Topics (Internet Archive TV 70 | Archive) 71 | - `word_cloud`: Retrieve top words that appear most frequently in 72 | clips matching your search 73 | 74 | ## Installation 75 | 76 | ``` r 77 | devtools::install_github("hrbrmstr/newsflash") 78 | ``` 79 | 80 | ``` r 81 | options(width=120) 82 | ``` 83 | 84 | ## Usage 85 | 86 | ``` r 87 | library(newsflash) 88 | library(ggalt) 89 | library(hrbrthemes) 90 | library(tidyverse) 91 | 92 | # current version 93 | packageVersion("newsflash") 94 | ``` 95 | 96 | ## [1] '0.6.0' 97 | 98 | ### “Third Eye” Chyrons are simpler so we’ll start with them first: 99 | 100 | ``` r 101 | list_chyrons() 102 | ``` 103 | 104 | ## # A tibble: 457 x 3 105 | ## ts type size 106 | ## 107 | ## 1 2018-04-16 cleaned 297177. 108 | ## 2 2018-04-16 raw 10436998. 109 | ## 3 2018-04-15 cleaned 347063. 110 | ## 4 2018-04-15 raw 9884284. 111 | ## 5 2018-04-14 cleaned 470448. 112 | ## 6 2018-04-14 raw 13709682. 113 | ## 7 2018-04-13 cleaned 410976. 114 | ## 8 2018-04-13 raw 12058117. 115 | ## 9 2018-04-12 cleaned 384796. 116 | ## 10 2018-04-12 raw 11750908. 117 | ## # ...
with 447 more rows 118 | 119 | ``` r 120 | ch <- read_chyrons("2018-04-13") 121 | 122 | mutate( 123 | ch, 124 | hour = lubridate::hour(ts), 125 | text = tolower(text), 126 | mention = grepl("comey", text) 127 | ) %>% 128 | filter(mention) %>% 129 | count(hour, channel) %>% 130 | ggplot(aes(hour, n)) + 131 | geom_segment(aes(xend=hour, yend=0), color = "lightslategray", size=1) + 132 | scale_x_continuous(name="Hour (GMT)", breaks=seq(0, 23, 6), 133 | labels=sprintf("%02d:00", seq(0, 23, 6))) + 134 | scale_y_continuous(name="# Chyrons", limits=c(0,20)) + 135 | facet_wrap(~channel, scales="free") + 136 | labs(title="Chyrons mentioning 'Comey' per hour per channel", 137 | caption="Source: Internet Archive Third Eye project & ") + 138 | theme_ipsum_rc(grid="Y") 139 | ``` 140 | 141 | ![](README_files/figure-gfm/unnamed-chunk-4-1.png) 142 | 143 | ## Now for the TV Explorer: 144 | 145 | ### See what networks & associated corpus date ranges are available: 146 | 147 | ``` r 148 | list_networks(widget=FALSE) 149 | ``` 150 | 151 | ## # A tibble: 159 x 6 152 | ## StationID Description Market Network StartDate EndDate 153 | ## 154 | ## 1 ALJAZ Al Jazeera International ALJAZ 2017-09-11 2017-09-11 155 | ## 2 ALJAZAM Al Jazeera America NationalDiscontinued ALJAZAM 2013-08-20 2013-08-20 156 | ## 3 BBCNEWS BBC News International BBCNEWS 2017-01-01 2017-01-01 157 | ## 4 BETW BET - San Francisco (BETW) San Francisco BET 2016-12-13 2016-12-13 158 | ## 5 BLOOMBERG Bloomberg National BLOOMBERG 2013-12-05 2013-12-05 159 | ## 6 CNBC CNBC National CNBC 2009-07-02 2009-07-02 160 | ## 7 CNN CNN National CNN 2009-07-02 2009-07-02 161 | ## 8 COM Comedy Central NationalSpecialty COM 2011-05-10 2011-05-10 162 | ## 9 CSPAN CSPAN National CSPAN 2009-06-04 2009-06-04 163 | ## 10 CSPAN2 CSPAN2 National CSPAN 2009-06-04 2009-06-04 164 | ## # ... with 149 more rows 165 | 166 | ### Basic search: 167 | 168 | ``` r 169 | comey <- query_tv('comey', start_date = "2018-04-01") 170 | 171 | comey 172 | ``` 173 | 174 | ## # A tibble: 144 x 3 175 | ## network date value 176 | ## 177 | ## 1 CSPAN3 2018-04-01 0.0273 178 | ## 2 CSPAN3 2018-04-02 0. 179 | ## 3 CSPAN3 2018-04-03 0. 180 | ## 4 CSPAN3 2018-04-04 0.0241 181 | ## 5 CSPAN3 2018-04-05 0. 182 | ## 6 CSPAN3 2018-04-06 0. 183 | ## 7 CSPAN3 2018-04-07 0. 184 | ## 8 CSPAN3 2018-04-08 0. 185 | ## 9 CSPAN3 2018-04-09 0. 186 | ## 10 CSPAN3 2018-04-10 0. 187 | ## # ... 
with 134 more rows 188 | 189 | ``` r 190 | query_tv('comey', start_date = "2018-04-01") %>% 191 | arrange(date) %>% 192 | ggplot(aes(date, value, group=network)) + 193 | ggalt::geom_xspline(aes(color=network)) + 194 | ggthemes::scale_color_tableau(name=NULL) + 195 | labs(x=NULL, y="Volume Metric", title="'Comey' Trends Across National Networks") + 196 | facet_wrap(~network) + 197 | theme_ipsum_rc(grid="XY") + 198 | theme(legend.position="none") 199 | ``` 200 | 201 | ![](README_files/figure-gfm/unnamed-chunk-6-1.png) 202 | 203 | ``` r 204 | query_tv("comey Network:CNN", mode = "TimelineVol", start_date = "2018-01-01") %>% 205 | arrange(date) %>% 206 | ggplot(aes(date, value, group=network)) + 207 | ggalt::geom_xspline(color="lightslategray") + 208 | ggthemes::scale_color_tableau(name=NULL) + 209 | labs(x=NULL, y="Volume Metric", title="'Comey' Trend on CNN") + 210 | theme_ipsum_rc(grid="XY") 211 | ``` 212 | 213 | ![](README_files/figure-gfm/unnamed-chunk-7-1.png) 214 | 215 | ### Relative Network Attention To Syria since January 1, 2018 216 | 217 | ``` r 218 | query_tv('syria Market:"National"', mode = "StationChart", start_date = "2018-01-01") %>% 219 | arrange(desc(count)) %>% 220 | knitr::kable("markdown") 221 | ``` 222 | 223 | | station | count | 224 | | :----------- | -----: | 225 | | FOX News | 1.0148 | 226 | | CNN | 0.8804 | 227 | | MSNBC | 0.7668 | 228 | | CSPAN | 0.6192 | 229 | | FOX Business | 0.5121 | 230 | | CSPAN2 | 0.3346 | 231 | | Bloomberg | 0.3208 | 232 | | CSPAN3 | 0.2392 | 233 | | CNBC | 0.2171 | 234 | 235 | ### Video Clips 236 | 237 | ``` r 238 | clips <- query_tv('comey Market:"National"', mode = "ClipGallery", start_date = "2018-01-01") 239 | 240 | clips 241 | ``` 242 | 243 | ## # A tibble: 32 x 8 244 | ## preview_url ia_show_id date station show show_date preview_thumb snippet 245 | ## 246 | ## 1 https://archive.… FOXNEWSW_201… 2018-04-13 FOX Ne… Shepa… 2018-04-13 https://archive.org/do… comey -- i mention it … 247 | ## 2 https://archive.… MSNBCW_20180… 2018-03-20 MSNBC MTP D… 2018-03-20 https://archive.org/do… donald trump ousted co… 248 | ## 3 https://archive.… CNNW_2018041… 2018-04-16 CNN CNN S… 2018-04-16 https://archive.org/do… comey versus comey or … 249 | ## 4 https://archive.… MSNBCW_20180… 2018-04-12 MSNBC The R… 2018-04-12 https://archive.org/do… and the president of c… 250 | ## 5 https://archive.… FOXNEWSW_201… 2018-04-13 FOX Ne… The I… 2018-04-13 https://archive.org/do… comey announced when h… 251 | ## 6 https://archive.… FBC_20180413… 2018-04-13 FOX Bu… After… 2018-04-13 https://archive.org/do… untethered to the trut… 252 | ## 7 https://archive.… FBC_20180415… 2018-04-15 FOX Bu… The J… 2018-04-15 https://archive.org/do… that we haven't alread… 253 | ## 8 https://archive.… CNNW_2018031… 2018-03-18 CNN New D… 2018-03-18 https://archive.org/do… media. after comey lea… 254 | ## 9 https://archive.… MSNBCW_20180… 2018-02-20 MSNBC The B… 2018-02-20 https://archive.org/do… trump caused this inve… 255 | ## 10 https://archive.… CNBC_2018041… 2018-04-13 CNBC Power… 2018-04-13 https://archive.org/do… he is ego different an… 256 | ## # ... with 22 more rows 257 | 258 | 2018-04-13 | FOX News | Shepard Smith 259 | Reporting 260 | 261 | 262 | 263 | comey – i mention it because comey is in the news. 
treats comey like a 264 | white knight and points out that director comey would have a vested 265 | interest in distancing himself from andrew mccabe because the inspector 266 | general was also looking at comey and 267 | his 268 | 269 | ### “Word Cloud” (top associated words to the query) 270 | 271 | ``` r 272 | wc <- query_tv('hannity Market:"National"', mode = "WordCloud", start_date = "2018-04-13") 273 | 274 | ggplot(wc, aes(x=1, y=1)) + 275 | ggrepel::geom_label_repel(aes(label=label, size=count), segment.colour="#00000000", segment.size=0) + 276 | scale_size_continuous(trans="sqrt") + 277 | labs(x=NULL, y=NULL) + 278 | theme_ipsum_rc(grid="") + 279 | theme(axis.text=element_blank()) + 280 | theme(legend.position="none") 281 | ``` 282 | 283 | ![](README_files/figure-gfm/unnamed-chunk-10-1.png) 284 | 285 | ### Last 15 Minutes Top Trending 286 | 287 | ``` r 288 | gd_top_trending() 289 | ``` 290 | 291 | ## $overall_trending_topics 292 | ## [1] "commonwealth" "shirley" "caribbean" "florida" 293 | ## [5] "jim comey" "boston" "sandra" "nell" 294 | ## [9] "george stephanopoulos" "vincent kompany" "pallab ghosh" "brighthouse financial" 295 | ## [13] "islamic state" "wetherspoon" "europe" "sorrell" 296 | ## [17] "north carolina" "nasa" "starbucks" "pakistan" 297 | ## [21] "whitbread" "cliff richard" "asia" "hilary clinton" 298 | ## [25] "ghouta" "kevin johnson" "west" "philadelphia" 299 | ## [29] "renee" "zimbabwe" "city" "bill chaplin" 300 | ## [33] "james" "grassley" "quetta" "myrbetriq" 301 | ## [37] "barbara" "john heilemann" "carrie underwood" "joe" 302 | ## [41] "houston" "balochistan" "ibm" "medicare" 303 | ## [45] "barclays" "fidelity" "jason aldean" "rhonda" 304 | ## [49] "michael flynn" "belfast" "kohler" 305 | ## 306 | ## $station_trending_topics 307 | ## # A tibble: 112 x 2 308 | ## station topic 309 | ## 310 | ## 1 CNN brilinta 311 | ## 2 CNN jim comey 312 | ## 3 CNN christine 313 | ## 4 CNN michael flynn 314 | ## 5 CNN tremfya 315 | ## 6 CNN tal 316 | ## 7 CNN nick paton walsh 317 | ## 8 CNN geico 318 | ## 9 CNN vladimir putin 319 | ## 10 CNN lynch 320 | ## # ... with 102 more rows 321 | ## 322 | ## $station_top_topics 323 | ## # A tibble: 112 x 2 324 | ## station topic 325 | ## 326 | ## 1 CNN fbi 327 | ## 2 CNN russia 328 | ## 3 CNN donald trump 329 | ## 4 CNN james comey 330 | ## 5 CNN mueller 331 | ## 6 CNN syria 332 | ## 7 CNN united states 333 | ## 8 CNN michael cohen 334 | ## 9 CNN clinton 335 | ## 10 CNN cnn 336 | ## # ... 
with 102 more rows 337 | ## 338 | ## $overall_trending_phrases 339 | ## [1] "morally unfit" "unfit to be president" "good morning" 340 | ## [4] "medically unfit" "president of the united" "islamic state group" 341 | ## [7] "night sky" "bank of america" "xfinity delivers gig" 342 | ## [10] "give this guy gig-" "delivers gig speed" "give this guy" 343 | ## [13] "gig speed" "speed to more homes" "xfinity delivers gig speed" 344 | ## [16] "guy gig-" "treats women" "xfinity delivers" 345 | ## [19] "donald trump" "gig-speed internet" "kennedy space centre" 346 | ## [22] "people watching" "threatens new sanctions" "donald trump unfit" 347 | ## [25] "exclusive interview" "evidence of obstruction" "sees moral equivalence" 348 | ## [28] "100 years" "air strikes" "fit to be president" 349 | ## [31] "new york" "maintaining a level" "shield annuity" 350 | ## [34] "growth opportunities" "lies constantly" "time to make" 351 | ## [37] "level of protection" "support for president assad" "removing donald trump" 352 | ## [40] "support for president" "buy the stuff" "2700 journalists" 353 | ## [43] "pallab ghosh" "brighthouse financial- established" "mission to scan" 354 | ## [46] "stars resonate" "voting booth" "star makes" 355 | ## [49] "james comey comments" "embody respect" "adhere to the values" 356 | 357 | ### Top Overall Trending from the Internet Archive TV Archive (2017 and earlier) 358 | 359 | ``` r 360 | iatv_top_trending("2017-12-01 18:00", "2017-12-02 06:00") 361 | ``` 362 | 363 | ## # A tibble: 49 x 5 364 | ## ts overall_trending_topics station_trending_topics station_top_topics overall_trending_phrases 365 | ## 366 | ## 1 2017-12-01 18:00:00 367 | ## 2 2017-12-01 18:15:00 368 | ## 3 2017-12-01 18:30:00 369 | ## 4 2017-12-01 18:45:00 370 | ## 5 2017-12-01 19:00:00 371 | ## 6 2017-12-01 19:15:00 372 | ## 7 2017-12-01 19:30:00 373 | ## 8 2017-12-01 19:45:00 374 | ## 9 2017-12-01 20:00:00 375 | ## 10 2017-12-01 20:15:00 376 | ## # ... 
with 39 more rows 377 | -------------------------------------------------------------------------------- /README_cache/gfm/__packages: -------------------------------------------------------------------------------- 1 | base 2 | newsflash 3 | ggplot2 4 | ggalt 5 | hrbrthemes 6 | tidyverse 7 | tibble 8 | tidyr 9 | readr 10 | purrr 11 | dplyr 12 | stringr 13 | forcats 14 | bindrcpp 15 | -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-10_b71663ca7f74ee9c9e0993f800680bdc.rdx -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-4_202c6a4374c7d2d43d1df0021f5e1de3.rdx -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-6_60e162ac3d416f213d19662cf1a02510.rdx -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-7_2f3c308173042d1baf25844e64d232cb.rdx -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-8_63ed08ea6bddbf23012e183bdb415c89.rdx -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.RData -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdb: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdb -------------------------------------------------------------------------------- /README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_cache/gfm/unnamed-chunk-9_8b52c64d46d2221a5b0cbdaefa9e655b.rdx -------------------------------------------------------------------------------- /README_files/figure-gfm/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-gfm/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /README_files/figure-gfm/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-gfm/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /README_files/figure-gfm/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-gfm/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /README_files/figure-gfm/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-gfm/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github-ascii_identifiers/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /README_files/figure-markdown_github/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/newsflash/5075be6afdd214bdf257ebe87f525a09c9cb1b80/README_files/figure-markdown_github/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /man/gd_top_trending.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gd-top-trending.R 3 | \name{gd_top_trending} 4 | \alias{gd_top_trending} 5 | \title{Top Trending (GDELT)} 6 | \usage{ 7 | gd_top_trending() 8 | } 9 | \description{ 10 | Retrieve current (last 15 minutes) "top topics" being discussed on stations 11 | } 12 | -------------------------------------------------------------------------------- /man/iatv_top_trending.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/top-tending-range.r 3 | \name{iatv_top_trending} 4 | \alias{iatv_top_trending} 5 | \title{Top Trending Topics (Internet Archive TV Archive)} 6 | \usage{ 7 | iatv_top_trending(from, to, .progress = interactive()) 8 | } 9 | \arguments{ 10 | \item{from, to}{start and end date/time ranges (will auto-convert if properly formatted
strings)} 11 | 12 | \item{.progress}{show a progress bar? Defaults to \code{TRUE} if in an interactive session.} 13 | } 14 | \description{ 15 | Provide start & end times in the current time zone and this function will generate 16 | the proper "every 15-minute" values, convert them to GMT values and issue the queries, 17 | returning a nested data frame of results. If you want more control, use \code{\link[=top_trending]{top_trending()}}. 18 | } 19 | \details{ 20 | GDELT now generates a snapshot every 15 minutes that records all of the "top trending" 21 | tables into a single archive enabling users to look back over time at what was trending 22 | in 15 minute increments historically back to midnight on 2017-09-07. 23 | 24 | Note that the archives are generated every 15 minutes based on the television shows that 25 | have completed processing at that time. It can take several hours for a show to be fully 26 | processed by the Internet Archive and available for analysis, thus the presence/absence 27 | of a topic in these files should not be used to date it precisely to that 15 minute mark, 28 | but rather as a rough temporal indicator of what topics were trending up/down in that 29 | general time frame. For precise timelines, you should take a topic from this archive and 30 | run a search on it using the main Television Explorer interface, select a timeframe of 31 | 72 hours and use the resulting timeline to precisely date the topic's coverage (since 32 | the Explorer timeline is based on the broadcast timestamp of the show, even if it is 33 | processed hours later). 34 | } 35 | \note{ 36 | The times are auto-converted to GMT. 37 | } 38 | \examples{ 39 | iatv_top_trending("2017-09-08 18:00", "2017-09-09 06:00") 40 | } 41 | -------------------------------------------------------------------------------- /man/list_chyrons.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/list-chyrons.r 3 | \name{list_chyrons} 4 | \alias{list_chyrons} 5 | \title{Retrieve Third Eye chyron index} 6 | \usage{ 7 | list_chyrons() 8 | } 9 | \value{ 10 | data frame with three columns: 11 | \itemize{ 12 | \item \code{ts} (\code{POSIXct}) chyron timestamp 13 | \item \code{type} (\code{character}) \code{raw} or \code{cleaned} 14 | \item \code{size} (\code{numeric}) size of the feed file in bytes 15 | } 16 | } 17 | \description{ 18 | Returns a data frame with available chyron dates & selected metadata. 19 | } 20 | -------------------------------------------------------------------------------- /man/list_networks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/networks.r 3 | \name{list_networks} 4 | \alias{list_networks} 5 | \title{Helper function to identify station/network keyword and corpus date range for said market} 6 | \usage{ 7 | list_networks(widget = interactive()) 8 | } 9 | \arguments{ 10 | \item{widget}{if `TRUE` then an HTML widget will be displayed to make it easier to 11 | sift through stations/networks} 12 | } 13 | \value{ 14 | data frame 15 | } 16 | \description{ 17 | The \code{filter_network} of \code{query_tv()} is picky so this helps you identify the 18 | keyword to use for the particular network/station. 19 | } 20 | \details{ 21 | The list also shows the date ranges available for the captions, so you can use that as 22 | a guide when picking dates.
23 | 24 | In interactive mode it uses \code{DT::datatable()}. You can force it to just display to 25 | the console by passing in \code{widget=FALSE} 26 | } 27 | \examples{ 28 | list_networks() # widget 29 | print(list_networks(FALSE)) # no widget 30 | } 31 | -------------------------------------------------------------------------------- /man/newsflash.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/newsflash-package.R 3 | \docType{package} 4 | \name{newsflash} 5 | \alias{newsflash} 6 | \alias{newsflash-package} 7 | \title{Tools to Work with the Internet Archive and GDELT Television Explorer} 8 | \description{ 9 | Tools to Work with the Internet Archive and GDELT Television Explorer 10 | } 11 | \author{ 12 | Bob Rudis (bob@rud.is) 13 | } 14 | -------------------------------------------------------------------------------- /man/query_tv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/newsflash.r 3 | \name{query_tv} 4 | \alias{query_tv} 5 | \title{Issue a query to the TV Explorer} 6 | \usage{ 7 | query_tv(query, mode = c("TimelineVol", "StationChart", "TimelineVolNorm"), 8 | start_date = NULL, end_date = NULL, datanorm = c("perc", "raw"), 9 | timelinesmooth = 0, datacomb = c("separate", "combined"), 10 | last_24 = c("yes", "no")) 11 | } 12 | \arguments{ 13 | \item{query}{query string in GDELT format. See \code{QUERY} in https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/ 14 | for details; use \code{\link[=list_networks]{list_networks()}} to obtain valid station/network identifiers. If 15 | no \code{Network:}, \code{Market:} or \code{Station:} qualifiers are found \code{Market:"National"} is automatically added.} 16 | 17 | \item{mode}{See \code{Mode} section} 18 | 19 | \item{start_date, end_date}{start/end dates. Leaving both \code{NULL} searches all archive history. 20 | Leaving just \code{start_date} \code{NULL} sets the start date to July 2009. Leaving just \code{end_date} 21 | \code{NULL} sets the end date to today.} 22 | 23 | \item{datanorm}{normalized ("\code{perc}") vs "\code{raw}" counts; defaults to \code{perc}.} 24 | 25 | \item{timelinesmooth}{a smoothing value applying moving averages over 15-minute increments} 26 | 27 | \item{datacomb}{if "\code{combined}", all network volume is combined into a single value. 28 | Defaults to "\code{separate}".} 29 | 30 | \item{last_24}{It can take the Internet Archive up to 24 hours to process a broadcast once 31 | it concludes. Thus, by default the TV API does not return results from the most recent 32 | 24 hours to ensure that analyses are not skewed by partial results. However, when 33 | tracking breaking news events, it may be desirable to view partial results with the 34 | understanding that any time or station-based trends may not accurately reflect the 35 | totality of their coverage. 
To include results from the most recent 24 hours, 36 | set this URL parameter to "yes".} 37 | } 38 | \value{ 39 | Different objects for different \code{mode}s: 40 | \itemize{ 41 | \item \code{TimelineVol} : a data frame with stations & counts (raw or normalized) 42 | \item \code{TimelineVolNorm} : a data frame of total monitored airtime per station 43 | \item \code{StationChart} : a data frame of stations and search result counts (raw or normalized) 44 | } 45 | } 46 | \description{ 47 | NOTE: The \code{mode} parameter controls what is returned. See the section on \code{Mode} for more information on available modes. 48 | } 49 | \section{Mode}{ 50 | 51 | 52 | This specifies the output you would like from the API, ranging from timelines to word clouds to clip galleries. 53 | \itemize{ 54 | \item \code{TimelineVol}. (Default) This tracks how many results your search generates by day/hour over the selected time period, allowing you to assess the relative attention each station is paying to the topic and how that attention has varied over time. Using the DATANORM parameter you can control whether this reports results as raw clip counts or as normalized percentages of all coverage (the most robust way of comparing stations). By default, the timeline will not display the most recent 24 hours, since those results are still being generated (it can take up to 2-12 hours for a show to be processed by the Internet Archive and ready for analysis), but you can include those if needed via the LAST24 option. You can also smooth the timeline using the TIMELINESMOOTH option and combine all selected stations into a single time series using the DATACOMB option. 55 | \item \code{StationChart}. This compares how many results your search generates from each of the selected stations over the selected time period, allowing you to assess the relative attention each is paying to the topic. Using the DATANORM parameter you can control whether this reports results as raw clip counts or as normalized percentages of all coverage (the most robust way of comparing stations). 56 | \item \code{TimelineVolNorm}. This displays the total airtime (in terms of 15 second clips) monitored from each of the stations in your query. It must be combined with a valid query, since it displays the airtime for the stations queried in the search. This mode can be used to identify brief monitoring outages or for advanced normalization, since it reports the total amount of clips monitored overall from each station in each day/hour. 57 | } 58 | } 59 | 60 | \section{Queries}{ 61 | 62 | 63 | The GDELT TV API supports keyword and keyphrase searches, OR statements and a variety of advanced operators. NOTE – all of the operators below must be used as part of the value of the QUERY field, separated by spaces, and cannot be used as URL parameters on their own. 64 | \itemize{ 65 | \item \code{""}. Anything found inside of quote marks is treated as an exact phrase search. Thus, you can search for "Donald Trump" to find all matches of his name. (e.g. \code{"donald trump"}) 66 | \item \code{(a OR b)}. You can specify a list of keywords to be boolean OR'd together by enclosing them in parentheses and placing the capitalized word "OR" between each keyword or phrase. Boolean OR blocks cannot be nested at this time. For example, to search for mentions of Clinton, Sanders or Trump, you would use "\code{(clinton OR sanders OR trump)}" 67 | \item \code{-}. You can place a minus sign in front of any operator, word or phrase to exclude it.
For example "-sanders" would exclude results that contained "sanders" from your results. (e.g. \code{-sanders}) 68 | \item \code{Context}. By default all of your keywords/phrases must appear in a single 15 second clip. (Phrases are allowed to span across two clips and are counted towards the clip they started in). The "context" operator allows you to require that a given keyword/phrase appears either in the 15 second clip or in the 15 second clips immediately before or after it. This gives you a bit of additional search fuzziness. Even when searching for a single word, it must appear in quote marks. (e.g. \code{context:"russia"}) 69 | \item \code{Market}. This narrows your search to a particular geographic market. The list of available markets can be found via the Station Details mode (look for the city name in the description of local stations). Example markets include "San Francisco" and "Philadelphia". The market name must be enclosed in quote marks. You can also use the special reserved market "National" to search the major national networks together. (e.g. \code{market:"San Francisco"}) 70 | \item \code{Network}. This narrows your search to a particular television network. The list of available networks can be found via the Station Details mode (look for the network name in the description of local stations). Example markets include "CBS" and "NBC". Do not use quote marks around the network name. (e.g. \code{network:CBS}) 71 | \item Show. This narrows your search to a particular television show. This must be the complete show name as returned by the TV API. To find a particular show, search the API and use the "clipgallery" mode to display matching clips and their source show. For example, to limit your search to the show Hardball With Chris Matthews, you'd search for "show:"Hardball With Chris Matthews"". Note that you must surround the show name with quote marks. Remember that the TV API only searches shows monitored by the Internet Archive's Television News Archive, which may not include all shows. (e.g. \code{show:"Hardball With Chris Matthews"}) 72 | \item \code{Station}. This narrows your search to a particular television station. Remember that the TV API only searches stations monitored by the Internet Archive's Television News Archive and not all of those stations have been monitored for the entire 2009-present time period. Do not use quote marks around the name of the station. To find the Station ID of a particular station, use the Station Details mode. (e.g. \code{station:CNN}) 73 | } 74 | } 75 | 76 | \examples{ 77 | query_tv("(terror isis") 78 | query_tv("british prime minister") 79 | query_tv("mexican president") 80 | } 81 | \references{ 82 | \url{https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/} 83 | } 84 | -------------------------------------------------------------------------------- /man/read_chyrons.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/third-eye.r 3 | \name{read_chyrons} 4 | \alias{read_chyrons} 5 | \title{Retrieve TV News Archive chyrons from the Internet Archive's Third Eye project} 6 | \usage{ 7 | read_chyrons(chyron_day = Sys.Date() - 1, cleaned = TRUE) 8 | } 9 | \arguments{ 10 | \item{chyron_day}{archive day (\code{Date} or \code{character}; if \code{character} should be 11 | in \code{YYYY-mm-dd} format)} 12 | 13 | \item{cleaned}{logical, default \code{TRUE}. 
The "raw feed" option provides all of the 14 | OCR'ed text from chyrons at the rate of approximately one entry per second. 15 | The "clean feed" download provides the data feed that fuels the Third Eye 16 | Twitter bots; this has been filtered to find the most representative, 17 | clearest chyrons from a 60-second period, with no more than one entry/tweet per 18 | minute (though the duration may be shorter than 60 seconds.) The clean feed 19 | relies on algorithms that are a work in progress.} 20 | } 21 | \value{ 22 | \code{NULL} on irrecoverable errors, otherwise a data frame with five columns: 23 | \itemize{ 24 | \item \code{ts} (\code{POSIXct}) chyron timestamp 25 | \item \code{channel} (\code{character}) news channel the chyron appeared on 26 | \item \code{duration} (\code{integer}) see Description 27 | \item \code{details} (\code{character}) Internet Archive details path 28 | \item \code{text} (\code{character}) the chyron text 29 | } 30 | } 31 | \description{ 32 | The TV News Archive's Third Eye project captures the chyrons–or narrative text–that appear on the lower third of TV news screens and turns them into downloadable data and a Twitter feed for research, journalism, online tools, and other projects. At project launch (September 2017) we are collecting chyrons from BBC News, CNN, Fox News, and MSNBC–more than four million collected over just two weeks. Chyrons have public value because: 33 | \itemize{ 34 | \item Breaking news often appears on chyrons before TV newscasters begin reporting or video is available, whether it's a hurricane or a breaking political story. 35 | \item Which chyrons a TV news network chooses to display can reveal editorial decisions that can inform public understanding of how news is filtered for different audiences. 36 | \item Providing chyrons as data–and also on Twitter–in near real-time can serve as a alert system, showing how TV news stations are reporting the news. Often the chyrons are ahead of the general conversation on Twitter. 37 | } 38 | } 39 | \details{ 40 | Some notes on the data 41 | \itemize{ 42 | \item chyrons are derived in near real-time from the TV News Archive's collection of TV news. The constantly updating public collection contains 1.4 million TV news shows, some dating back to 2009. 43 | \item At launch, Third Eye captures four TV cable news channels: BBC News, CNN, Fox News, and MSNBC. 44 | \item Data can be affected by temporary collection outages, which typically can last minutes or hours, but rarely more. 45 | \item Dates/times are in UTC (Coordinated Universal Time). 46 | \item Because the size of the raw data is so large (about 20 megabytes per day), results are limited to seven days per request. 47 | \item Raw data collection began on August 25, 2017; the clean feed begins on September 7, 2017. 48 | \item "\code{duration}" column is in seconds–the amount of time that particular chyron appeared on the screen. 49 | } 50 | } 51 | \note{ 52 | It is \emph{highly} recommended that you use the "clean" feed unless you're researching 53 | how to correct text. This package does it's best to read in the raw feed but 54 | it often contains embedded nulls and non-standard text encodings which 55 | make it difficult to process. 
56 | } 57 | -------------------------------------------------------------------------------- /man/word_cloud.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/word-cloud.R 3 | \name{word_cloud} 4 | \alias{word_cloud} 5 | \title{Retrieve top words that appear most frequently in clips matching your search} 6 | \usage{ 7 | word_cloud(query, start_date = NULL, end_date = NULL) 8 | } 9 | \arguments{ 10 | \item{query}{query string in GDELT format. See \code{QUERY} in https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/ 11 | for details; use \code{\link[=list_networks]{list_networks()}} to obtain valid station/network identifiers} 12 | 13 | \item{start_date, end_date}{start/end dates. Leaving both \code{NULL} searches all archive history. 14 | Leaving just \code{start_date} \code{NULL} sets the start date to July 2009. Leaving just \code{end_date} 15 | \code{NULL} sets the end date to today.} 16 | } 17 | \description{ 18 | The API takes the 200 most relevant clips that match your search and returns the 19 | terms for a "word cloud" of up to the top 200 most frequent words that appeared in 20 | those clips (common stop words are automatically removed). This is a powerful way 21 | of understanding the topics and words dominating the relevant coverage and 22 | suggesting additional contextual search terms to narrow or evolve your search. 23 | Note that if there are too few matching clips for your query, the word cloud may 24 | be blank. 25 | } 26 | -------------------------------------------------------------------------------- /newsflash.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageBuildArgs: --resave-data 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/test-all.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check("newsflash") 3 | -------------------------------------------------------------------------------- /tests/testthat/test-newsflash.R: -------------------------------------------------------------------------------- 1 | context("API functionality") 2 | test_that("API functionality", { 3 | 4 | #expect_that(some_function(), is_a("data.frame")) 5 | 6 | }) 7 | --------------------------------------------------------------------------------
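A quick orientation to the man pages above: the sketch below strings the documented entry points together. It is a minimal, unverified sketch, not part of the package sources: it assumes the package is installed and the GDELT TV API / Internet Archive endpoints are reachable, and it assumes `query_tv()` accepts `YYYY-mm-dd` date strings (only `read_chyrons()` documents its date format explicitly). Result columns are taken from the `\value{}` sections of the man pages, not re-checked here.

``` r
library(newsflash)

# Find a usable station/network keyword without the HTML widget
networks <- list_networks(widget = FALSE)

# Compose the documented query operators: an OR block, an exclusion, and a
# market qualifier, returning a normalized, smoothed, combined timeline
res <- query_tv(
  query = '(clinton OR sanders OR trump) -debate market:"National"',
  mode = "TimelineVol",
  start_date = "2017-09-01",  # date-string format assumed, not documented
  end_date = "2017-09-30",
  datanorm = "perc",
  timelinesmooth = 5,
  datacomb = "combined"
)

# Third Eye chyrons: check what's available, then pull one day's clean feed
# (raw collection starts 2017-08-25; the clean feed starts 2017-09-07)
avail <- list_chyrons()
chy <- read_chyrons("2017-10-01", cleaned = TRUE)
head(chy[, c("ts", "channel", "duration", "text")])
```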