├── .github
│   └── workflows
│       └── validate-ctv.yml
├── .gitignore
├── CONTRIBUTING.md
├── NEWS.md
├── README.md
├── WebTechnologies.Rproj
├── WebTechnologies.md
└── scripts
    ├── check-if-elevated-to-cran.R
    ├── install-test-omega-hat.R
    ├── spelling.csv
    ├── task-view-maintenance.R
    └── url-db-from-ctv-md.R

/.github/workflows/validate-ctv.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |       - master
 6 |     paths:
 7 |       - '.github/workflows/validate-ctv.yml'
 8 |       - 'WebTechnologies.md'
 9 |   pull_request:
10 |     branches:
11 |       - main
12 |       - master
13 |     paths:
14 |       - '.github/workflows/validate-ctv.yml'
15 |       - 'WebTechnologies.md'
16 | 
17 | name: Validate task view
18 | 
19 | jobs:
20 |   validate-ctv:
21 |     runs-on: ubuntu-latest
22 |     steps:
23 |       - uses: cran-task-views/ctv/validate-ctv@main
24 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 00-check-links.R
6 | WebTechnologies.html
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Contributions to the Web Technologies task view are welcome from anyone and are best sent as pull requests on [the GitHub repository](https://github.com/cran-task-views/WebTechnologies). This page provides some instructions for potential contributors about how to contribute to the task view.
 4 | 
 5 | 1. Contributions can be submitted as [a pull request](https://help.github.com/articles/creating-a-pull-request/) on GitHub by forking or cloning the [repo](https://github.com/cran-task-views/WebTechnologies), making changes, and submitting the pull request.
 6 | 
 7 | 1. Pull requests should involve only one commit per substantive change. This means if you change multiple files (e.g., code and documentation), these changes should be committed together. If you don't know how to do this (e.g., you are making changes in the GitHub web interface), just submit anyway and the maintainer will clean things up.
 8 | 
 9 | 1. Before submitting the pull request, please
10 | 
11 |    * run `ctv::ctv2html("WebTechnologies.md")` and open "WebTechnologies.html" (in the root directory) to inspect the rendered html
12 |    * run `ctv::check_ctv_packages("WebTechnologies.md")` to ensure consistency in all the links you added.
13 | 
14 | Some specific types of changes that you might make are:
15 | 
16 | 1. *Fix broken links*. Great!
17 | 
18 | 1. *Add new packages*. This is fine!
19 | 
20 | 1. *Remove packages retired from CRAN*. OK, but this should be discussed on [the GitHub issues page](https://github.com/cran-task-views/WebTechnologies/issues) before submitting a pull request. My criterion is to look for an alternative GitHub link and to try to email the package author before retiring a package.
21 | 
22 | Any questions you have can be opened as GitHub issues or directed to hello + r (at) pacha . dev.
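As a convenience, the two pre-submission checks above can be run together from the repository root; a minimal sketch using the `ctv` functions already mentioned:

```r
# Render the task view and validate it before opening a pull request.
# Assumes the working directory is the repository root.
ctv::ctv2html("WebTechnologies.md")            # writes WebTechnologies.html
utils::browseURL("WebTechnologies.html")       # visually inspect the rendering
ctv::check_ctv_packages("WebTechnologies.md")  # check link/package consistency
```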
23 | 
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | CHANGES
 2 | =======
 3 | 
 4 | # 2022-06-30
 5 | 
 6 | ## Removed
 7 | 
 8 | * leafletR, osmar, RGA, rromeo, scraper, splashr, tm.plugin.webmining, urlshorteneR (archived on CRAN)
 9 | 
10 | ## Updated
11 | 
12 | * gdns (moved to GitHub link)
13 | 
14 | # 2022-04-18
15 | 
16 | ## Added
17 | 
18 | * nanonext
19 | 
20 | # 2022-03-11
21 | 
22 | ## Removed
23 | 
24 | * translate (not on CRAN anymore)
25 | 
26 | # 2022-03-01
27 | 
28 | ## Added
29 | 
30 | * googlesheets4, replacing googlesheets.
31 | 
32 | ## Removed
33 | 
34 | * MediaWiki; in Wikipedia's words, "This extension has been archived. require php 5.6 or more deprecated."
35 | * tweet2r, archived on 2021-03-29.
36 | * googlesheets, which is superseded by googlesheets4.
37 | * Outdated references to yajl (jsonlite).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## CRAN Task View: WebTechnologies
2 | 
3 | **URL:** <https://CRAN.R-project.org/view=WebTechnologies>
4 | 
5 | **Source file:** [WebTechnologies.md](WebTechnologies.md)
6 | 
7 | **Contributions:** Suggestions and improvements for this task view are very welcome and can be made through issues or pull requests here on GitHub or via e-mail to the maintainer address. For further details see the [Web Technologies Contributing](https://github.com/cran-task-views/WebTechnologies/blob/main/CONTRIBUTING.md) and/or the [Task View Contributing](https://github.com/cran-task-views/ctv/blob/main/Contributing.md)
8 | guide. All contributions must adhere to the [code of conduct](https://github.com/cran-task-views/ctv/blob/main/CodeOfConduct.md).
--------------------------------------------------------------------------------
/WebTechnologies.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
--------------------------------------------------------------------------------
/WebTechnologies.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: WebTechnologies
 3 | topic: Web Technologies and Services
 4 | maintainer: Mauricio Vargas Sepulveda, Will Beasley
 5 | email: m.sepulveda@mail.utoronto.ca
 6 | version: 2024-10-27
 7 | source: https://github.com/cran-task-views/WebTechnologies/
 8 | ---
 9 | 
10 | ## 0. Introduction
11 | 
12 | ### Tools for Working with the Web
13 | 
14 | This task view recommends packages and strategies for efficiently interacting
15 | with resources over the internet with R.
16 | This task view focuses on:
17 | 
18 | 1. [Direct data download and ingestion](#direct),
19 | 1. [Online services](#online),
20 | 1. [Frameworks for building web-based R applications](#frameworks),
21 | 1. [Low-level operations](#low), and
22 | 1. [Resources](#resources).
23 | 
24 | If you have suggestions for improving or growing this task view,
25 | please submit an issue or a pull request in the GitHub repository linked above.
26 | If you can't contribute on GitHub,
27 | please e-mail the task view maintainer.
28 | If you have an issue with a package discussed below,
29 | please contact the package's maintainer.
30 | 
31 | Thanks to all contributors to this task view, especially to
32 | Scott Chamberlain, Thomas Leeper, Patrick Mair, Karthik Ram, and Christopher Gandrud,
33 | who maintained this task view up to 2021.
34 | 
35 | ### Core Tools For HTTP Requests
36 | 
37 | The bulk of R's capabilities are supplied by CRAN packages
38 | that are layered on top of [libcurl](https://curl.se/libcurl/).
39 | A handful of packages provide the foundation for most modern approaches.
40 | 
41 | 1. `r pkg("httr2", priority = "core")`
42 | and its predecessor `r pkg("httr", priority = "core")`
43 | are user-facing clients for HTTP requests.
44 | They leverage the curl package for most operations.
45 | If you are developing a package that calls a web service,
46 | we recommend reading their vignettes.
47 | 
48 | 1. `r pkg("crul", priority = "core")` is another package that leverages curl.
49 | It is an [R6](https://r6.r-lib.org/)-based client that supports
50 | asynchronous HTTP requests,
51 | a pagination helper,
52 | HTTP mocking via `r pkg("webmockr")`,
53 | and request caching for unit tests via `r pkg("vcr")`.
54 | crul is intended to be called by other packages, rather than directly by R users.
55 | Unlike httr2,
56 | crul's [current version](https://docs.ropensci.org/crul/reference/auth.html#details)
57 | does not support OAuth.
58 | Additional options may be passed to curl when instantiating crul's R6 classes.
59 | 
60 | 1. `r pkg("curl", priority = "core")` is the lower-level
61 | package that provides a close interface between R and the
62 | [libcurl C library](https://curl.se/libcurl/).
63 | It is not intended to be called directly by typical R users.
64 | curl may be useful for operations on web-based XML or with FTP
65 | (as crul and httr2 are focused primarily on HTTP).
66 | 
67 | 1. [utils](https://stat.ethz.ch/R-manual/R-devel/library/utils/html/00Index.html) and
68 | [base](https://stat.ethz.ch/R-manual/R-devel/library/base/html/00Index.html)
69 | are the base R packages
70 | that provide `download.file()`, `url()`, and related functions.
71 | These functions also use libcurl.
72 | 
73 | 
78 | 
79 | ### Before you Start Using Web Scraping Tools
80 | 
81 | You may have code that performs web scraping, and it may be very efficient in terms of time and resource usage, but first consider whether it is legal and ethical for you to scrape the site in question.
82 | 
83 | You can use the ['polite'](https://cran.r-project.org/package=polite) package, which builds upon the principles of seeking permission, taking slowly, and never asking twice (see the sketch below). The package builds on awesome toolkits for defining and managing http sessions (['httr'](https://cran.r-project.org/package=httr) and ['rvest'](https://cran.r-project.org/package=rvest)), declaring the user agent string and investigating site policies ('robots.txt'), and utilizing rate-limiting and response caching (['ratelimitr'](https://cran.r-project.org/package=ratelimitr) and ['memoise'](https://cran.r-project.org/package=memoise)).
84 | 
85 | The problem is not only technical, but also ethical and legal. You can technically log into an art auction site and scrape the prices of all the paintings, but if you need an account and use 'RSelenium' to extract the information by automating clicks in the browser, you are subject to the site's Terms of Service (ToS).
86 | 
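Where scraping is permitted, a polite session takes only a few lines to set up. A minimal sketch; the target URL and user-agent string are illustrative only:

```r
library(polite)

# bow() introduces the scraper to the host and consults its robots.txt
# before anything is downloaded.
session <- bow(
  "https://en.wikipedia.org/wiki/R_(programming_language)",
  user_agent = "polite example (me@example.com)"  # identify yourself
)

# scrape() fetches the page politely (rate-limited and cached) and
# returns a parsed document that rvest functions can work with.
page <- scrape(session)
```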
87 | Another problem is that some websites require specific connections. You can connect to a site from a university or government building and access content for free, but if you connect from home, you may find that you require a paid subscription to access the same content. If you scrape a site from a university, you might be breaking some laws if you are not careful about the goal and scope of the scraping.
88 | 
89 | ## 1. [Direct data download and ingestion]{#direct}
90 | 
91 | In recent years,
92 | many functions have been updated to accommodate web pages that are protected with TLS/SSL.
93 | Consequently, you can usually download a file if its URL starts with "http" or "https".
94 | 
95 | If the data file is not accessible via a simple url, you probably want to skip to the [Online services](#online) section. It describes how to work with specific web services such as AWS, Google Documents, Twitter, REDCap, PubMed, and Wikipedia.
96 | 
97 | If the information is served by a database engine,
98 | please review the cloud services in the [Online services](#online) section below,
99 | as well as the *`r view("Databases")` with R* CRAN Task View.
100 | 
101 | ### Ingest a remote file directly
102 | 
103 | Many base and CRAN packages provide functions that accept a [url](https://en.wikipedia.org/wiki/URL) and
104 | return a `data.frame` or `list`.
105 | 
106 | - For tabular/rectangular plain-text structures:
107 |   - [utils](https://stat.ethz.ch/R-manual/R-devel/library/utils/html/00Index.html)'s
108 |     `read.csv()`, `read.table()`, and friends
109 |     return a `base::data.frame`.
110 |   - `r pkg("readr")`'s `read_csv()`, `read_delim()`, and friends
111 |     return a `tibble::tibble`, which derives from `base::data.frame`.
112 |   - `r pkg("data.table")`'s `fread()`
113 |     returns a `data.table::data.table`, which derives from `base::data.frame`.
114 |   - `r pkg("arrow")`'s `read_csv_arrow()`
115 |     returns a `tibble::tibble` or other [Arrow](https://arrow.apache.org/) structures.
116 | - For hierarchical/nested plain-text structures:
117 |   - `r pkg("jsonlite")`'s `fromJSON()` converts JSON into a `list`.
118 |   - `r pkg("yaml")`'s `yaml.load_file()` converts YAML into a `list`.
119 |   - `r pkg("XML")`'s `xmlToList()` converts XML into a `list`.
120 |   - For HTML, see the "Parsing Structured Web Data" section below.
121 | - For structures in the Spark ecosystem:
122 |   - `r pkg("arrow")`: interacts with a variety of file types used with big data,
123 |     including parquet, feather, and arrow IPC streams.
124 | - For other file structures:
125 |   - `r pkg("rio")` and `r pkg("repmis")`: accommodate many plain-text and proprietary formats.
126 | 
127 | ### Download a remote file, then ingest it
128 | 
129 | If you need to process a different type of file,
130 | you can accomplish this in two steps.
131 | First, download the file from a server to your local computer;
132 | second, pass the path of the new local file to a function in a
133 | package like
134 | [haven](https://CRAN.R-project.org/package=haven)
135 | or
136 | [foreign](https://cran.r-project.org/package=foreign).
137 | 
138 | Many base and CRAN packages provide functions that download files:
139 | 
140 | - [utils](https://stat.ethz.ch/R-manual/R-devel/library/utils/html/00Index.html): `download.file()`.
141 | - `r pkg("curl")`: `curl_download()`, `curl_fetch_multi()`, and friends.
142 | - `r pkg("httr2")`: `req_perform(path = )`, or alternatively `req_perform()` piped to `resp_body_string()`.
143 | - `r pkg("httr")`: `GET()`.
144 | - `r pkg("RCurl")`: `getURL()`.
145 | 
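For example, a download-then-ingest workflow using only base R might look like this sketch (the URL is a placeholder):

```r
# Download a remote CSV to a temporary file, then read it locally.
tf <- tempfile(fileext = ".csv")
utils::download.file(
  "https://example.com/data.csv",  # placeholder URL
  destfile = tf,
  mode     = "wb"                  # binary-safe on all platforms
)
dat <- utils::read.csv(tf)

# The equivalent download with the curl package:
# curl::curl_download("https://example.com/data.csv", destfile = tf)
```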
146 | ### Parsing Structured Web Data
147 | 
148 | The vast majority of web-based data is structured as
149 | plain text, HTML, XML, or JSON.
150 | Web service APIs increasingly rely on JSON, but XML is still prevalent in many applications.
151 | There are several packages for working specifically with these formats.
152 | These functions can be used to interact directly with insecure web pages,
153 | or to parse locally stored or in-memory web files.
154 | Colloquially, these activities are called
155 | [web scraping](https://en.wikipedia.org/wiki/Web_scraping).
156 | 
157 | - *XML*:
158 | There are two foundational packages for working with XML: `r pkg("XML")` and `r pkg("xml2")`
159 | (see the parsing sketch after this list).
160 | Both support general XML (and HTML) parsing, including XPath queries.
161 | `r pkg("xml2")` is less fully featured, but more user friendly with respect to memory management,
162 | classes (e.g., XML node vs. node set vs. document), and namespaces.
163 | Of the two, only `r pkg("XML")` supports *de novo* creation of XML nodes and documents.
164 | 
165 | Other XML tools include:
166 | - `r pkg("XML2R")`, a collection of convenient functions for coercing XML into data frames.
167 | - `r pkg("selectr")`, which parses CSS3 selectors and translates them to XPath 1.0 expressions,
168 | so documents parsed with `r pkg("XML")` or `r pkg("xml2")` can be queried
169 | with CSS selectors instead of XPath.
170 | - `r github("omegahat/XMLSchema")`, which provides facilities in R for reading XML schema documents and
171 | processing them to create definitions for R classes and functions for converting XML nodes
172 | to instances of those classes.
173 | It provides the framework for meta-computing with XML schema in R.
174 | - `r pkg("xslt")`, an extension for `r pkg("xml2")` to transform XML documents by applying an XSLT style-sheet.
175 | This may be useful for web scraping, as well as for
176 | transforming XML markup into another human- or machine-readable format
177 | (e.g., HTML, JSON, plain text, etc.).
178 | 
179 | - *HTML*:
180 | All of the tools that work with XML also work for HTML, though HTML tends to be more prone to malformation, so `xml2::read_html()` is a good first function to use for importing HTML. Other tools are designed specifically to work with HTML.
181 | - For capturing static content of web pages, `r pkg("postlightmercury")` is a client
182 | for the web service 'Mercury', which turns web pages into structured and clean text.
183 | - `r pkg("rvest")` is another higher-level alternative that expresses common web scraping tasks
184 | with [pipes](https://r4ds.hadley.nz/workflow-pipes.html)
185 | (like base R's `|>` and magrittr's `%>%`).
186 | - `r pkg("boilerpipeR")` provides generic extraction of main text content from HTML files, with
187 | removal of ads, sidebars, and headers, using the boilerpipe Java library.
188 | - PhantomJS (which was [archived in 2018](https://github.com/ariya/phantomjs/issues/15344)): `r pkg("webshot")` uses PhantomJS to provide screenshots of web pages without a browser.
189 | It can be useful for testing websites (such as Shiny applications).
190 | `r github("cpsievert/rdom")` uses PhantomJS to access a webpage's Document Object Model (DOM).
193 | - `r pkg("htmltools")` provides functions to create HTML elements.
194 | 
195 | - `r github("omegahat/RHTMLForms")` reads HTML documents and obtains a description of each of the forms it contains,
196 | along with the different elements and hidden fields.
197 | `r pkg("htm2txt")` uses regular expressions to convert HTML documents to plain text by removing all HTML tags.
198 | `r pkg("Rcrawler")` does crawling and scraping of web pages.
199 | - *HTML Utilities*:
200 | These tools don't extract content, but they can help you develop and debug.
201 | - `r pkg("W3CMarkupValidator")` provides an R interface to W3C Markup Validation Services for validating HTML documents.
202 | - The [selectorgadget browser extension](https://selectorgadget.com/) can be used to identify page elements.
203 | 
204 | - *JSON*:
205 | There are several packages for reading and writing JSON:
206 | `r pkg("rjson")`,
207 | `r pkg("RJSONIO")`, and
208 | `r pkg("jsonlite")`.
209 | We recommend using `r pkg("jsonlite")` (see the parsing sketch after this list).
210 | Check out the paper describing jsonlite by Jeroen Ooms.
211 | `r pkg("jqr")` provides bindings for the fast JSON library 'jq'.
212 | `r pkg("jsonvalidate")` validates JSON against a schema using the "is-my-json-valid" JavaScript library;
213 | `r pkg("ajv")` does the same using the 'ajv' JavaScript library.
214 | `r pkg("ndjson")` supports the "ndjson" format.
215 | 
216 | - *RSS/Atom*:
217 | `r github("datawookie/feedeR")` can be used to parse RSS or Atom feeds.
218 | `r pkg("tidyRSS")` parses RSS, Atom XML/JSON, and geoRSS into a tidy data.frame.
219 | - `r pkg("swagger")` can be used to automatically generate functions for working with a web service API
220 | that provides documentation in [Swagger.io](https://swagger.io/) format.
221 | 
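As referenced in the list above, a short self-contained sketch of parsing XML with xml2 and round-tripping a structure through JSON with jsonlite:

```r
library(xml2)
library(jsonlite)

# Parse a small XML fragment and query it with XPath.
doc <- read_xml("<books><book id='1'><title>R</title></book></books>")
xml_text(xml_find_all(doc, "//title"))         # "R"
xml_attr(xml_find_first(doc, "//book"), "id")  # "1"

# Serialize an R list to JSON and read it back.
txt <- toJSON(list(pkg = "jsonlite", core = TRUE), auto_unbox = TRUE)
txt            # {"pkg":"jsonlite","core":true}
fromJSON(txt)  # back to an R list
```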
222 | ## 2. [Online services]{#online}
223 | 
224 | ### Cloud Computing and Storage
225 | 
226 | - *Amazon Web Services (AWS)*:
227 |   - `r pkg("paws")` is an interface to nearly all AWS APIs,
228 |     including compute, storage, databases, and machine learning.
229 |     It also requires no external system dependencies.
230 |   - `r pkg("aws.signature")` provides functionality for generating AWS API request signatures.
231 |   - *Elastic Cloud Compute (EC2)* is a cloud computing service.
232 |     `r gcode("segue")` manages EC2 instances and S3 storage,
233 |     and includes a parallel version of `lapply()`
234 |     for the Elastic Map Reduce (EMR) engine called `emrlapply()`.
235 |     It uses Hadoop Streaming on Amazon's EMR in order to get simple parallel computation.
236 | - *Microsoft Azure*: Azure and Microsoft 365 are Microsoft's cloud computing services.
237 |   - The Azure platform provides PaaS, SaaS, and IaaS and supports many different tools and frameworks,
238 |     including both Microsoft-specific and third-party systems,
239 |     while Microsoft 365 is a unified framework for accessing cloud data from
240 |     Microsoft's Office services, Windows, and Dynamics.
241 |     The [AzureR package family](https://github.com/Azure/AzureR)
242 |     aims to provide a suite of lightweight, powerful tools for working with Azure in R.
243 |     The packages listed below are part of the family, and are also mirrored at the cloudyr project.
244 |   - *Azure Active Directory (AAD)* is a centralized directory and identity service.
245 |     `r pkg("AzureAuth")` is an R client for AAD;
246 |     use this to obtain OAuth tokens for authenticating with other Azure services,
247 |     including Resource Manager and storage (see next).
248 |   - *Microsoft Graph* is the API framework for the Microsoft 365 platform,
249 |     including Azure Active Directory and Office.
250 |     `r pkg("AzureGraph")` is a low-level extensible R6-based interface to Graph.
251 |     `r pkg("Microsoft365R")` is an interface to the Office part of Microsoft 365,
252 |     including OneDrive and SharePoint Online.
253 |   - *Azure Resource Manager (ARM)* is a service for deploying other Azure services.
254 |     `r pkg("AzureRMR")` is an R interface to ARM, and allows managing
255 |     subscriptions, resource groups, resources, and templates.
256 |     It exposes a general R6 class framework that can be extended to provide extra
257 |     functionality for specific services (see next).
258 |   - *Azure Storage Accounts* are a general-purpose data storage facility.
259 |     Different types of storage are available: file, blob, table, Data Lake, and more.
260 |     `r pkg("AzureStor")` provides an R interface to storage.
261 |     Features include clients for file, blob, and Data Lake Gen2 storage,
262 |     parallelized file transfers,
263 |     and an interface to Microsoft's cross-platform AzCopy command line utility.
264 |     Also supplied is an ARM interface, to allow creation and management of storage accounts.
265 |     `r pkg("AzureTableStor")` and `r pkg("AzureQstor")` extend AzureStor
266 |     to provide interfaces to table storage and queue storage, respectively.
267 |   - `r pkg("AzureVM")` creates and manages virtual machines in Azure.
268 |     It includes templates for a wide variety of common VM specifications and
269 |     operating systems, including Windows, Ubuntu, Debian, and RHEL.
270 |   - `r pkg("AzureContainers")` provides a unified facility for working with containers in Azure.
271 |     Specifically, it includes R interfaces to
272 |     *Azure Container Instances (ACI)*,
273 |     *Azure Docker Registry (ACR)*, and
274 |     *Azure Kubernetes Service (AKS)*.
275 |     Create Docker images and push them to an ACR repository;
276 |     spin up ACI containers;
277 |     deploy Kubernetes services in AKS.
278 |   - *Azure Data Explorer*, also known as *Kusto*, is a fast, scalable
279 |     data exploration and analytics service.
280 |     `r pkg("AzureKusto")` is an R interface to ADE/Kusto.
281 |     It includes a dplyr client interface similar to that provided
282 |     by dbplyr for SQL databases, a DBI client interface, and an
283 |     ARM interface for deploying and managing Kusto clusters and databases.
284 |   - *Azure Cosmos DB* is a multi-model NoSQL database service,
285 |     previously known as Document DB.
286 |     `r pkg("AzureCosmosR")` is an interface to the core/SQL API for Cosmos DB.
287 |     It also includes simple bridges to the table storage and MongoDB APIs.
288 |   - *Azure Computer Vision* and *Azure Custom Vision* are AI services for
289 |     image recognition and analysis.
290 |     Computer Vision is a pre-trained service for handling commonly-encountered tasks,
291 |     while Custom Vision allows you to train your own image recognition model on a custom dataset.
292 |     `r pkg("AzureVision")` provides an interface to both of these services.
293 |   - *Application Insights* provides application performance monitoring and usage tracking
294 |     of live web applications.
295 |     `r pkg("AzureAppInsights")` allows developers of Shiny apps to include the Application Insights JS SDK
296 |     in their apps for tracking performance. It is not part of the cloudyr project or the AzureR package family.
297 | - *Google Cloud and Google Drive*:
298 |   - `r pkg("googledrive")` interfaces with Google Drive.
299 |   - `r pkg("googleComputeEngineR")` interacts with the Google Compute Engine API,
300 |     and lets you create, start, and stop instances in the Google Cloud.
301 |   - `r pkg("googleCloudStorageR")` interfaces with Google Cloud Storage.
302 |   - `r pkg("bigrquery")`: an interface to Google's BigQuery.
303 |   - `r pkg("rrefine")` provides a client for the 'OpenRefine' (formerly 'Google Refine') data cleaning service.
304 |   - `r pkg("gargle")`: an interface to [Google APIs](https://developers.google.com/apis-explorer).
305 |   - Look in other sections of this task view for packages interfacing with other Google products.
306 | - *Dropbox*:
307 | `r pkg("repmis")`'s `source_Dropbox()` function supports downloading/caching plain-text data
308 | from non-public Dropbox folders.
309 | - *Other Cloud Storage*:
310 | `r pkg("boxr")` is a lightweight, high-level interface for the
311 | [box.com API](https://developer.box.com/reference/).
312 | 
313 | - *Docker*:
314 | `r pkg("analogsea")` is a general purpose client for the Digital Ocean v2 API.
315 | In addition, it includes functions to install various R tools,
316 | including base R, RStudio server, and more.
317 | It also provides an evolving interface
318 | for interacting with Docker on your remote droplets.
319 | - `r pkg("crunch")` provides an interface to the [crunch.io](https://crunch.io/)
320 | storage and analytics platform.
321 | `r pkg("crunchy")` facilitates making Shiny apps on Crunch.
322 | - [The cloudyr project](https://cloudyr.github.io/) aims to provide interfaces to popular
323 | Amazon, Azure, and Google cloud services without the need for external system dependencies.
324 | Amazon Web Services is a popular, proprietary cloud service offering a
325 | suite of computing, storage, and infrastructure tools.
326 | - `r pkg("pins")` can be used to publish data, models, and other R objects across a range of
327 | backends, including AWS, Azure, Google Cloud Storage, and Posit Connect.
328 | 
329 | ### Software Development
330 | 
331 | - [*R-hub*](https://builder.r-hub.io/) is a collection of free services to help R package development across all architectures.
332 | `r pkg("rhub")` interfaces with R-hub to allow you to check a package on the platform.
333 | - [*GitHub*](https://github.com/):
334 | `r pkg("gistr")` works with GitHub gists ([gist.github.com](https://gist.github.com/discover)) from R,
335 | allowing you to create new gists, update gists with new files, rename files, delete files,
336 | get and delete gists, star and un-star gists, fork gists, open a gist in your default browser,
337 | get embed code for a gist, list gist commits, and get rate limit information when authenticated.
338 | `r pkg("git2r")` provides bindings to the git version control system and
339 | `r pkg("gh")` is a client for the GitHub API.
340 | - [*GitLab*](https://about.gitlab.com/):
341 | `r pkg("gitlabr")` is a GitLab-specific client.
342 | 
343 | ### Documents and Images
344 | 
345 | - *Data archiving*:
346 | `r pkg("dataverse")` provides access to [Dataverse](https://dataverse.org/), the open source research data repository software.
347 | `r pkg("rfigshare")` connects with [Figshare.com](https://figshare.com/).
348 | `r pkg("dataone")` provides a client for 'DataONE' repositories.
349 | - *Google Sheets*:
350 | `r pkg("googlesheets4")` (which replaces `googlesheets`) can access private or public 'Google Sheets'
351 | by title, key, or URL (see the sketch at the end of this section). Extract or edit data.
352 | Create, delete, rename, copy, upload, or download spreadsheets and worksheets.
353 | `r pkg("gsheet")` can download Google Sheets using just the sharing link.
354 | Spreadsheets can be downloaded as a data frame, or as plain text to parse manually.
355 | - `r pkg("imguR")` shares plots using the image hosting service [Imgur.com](https://imgur.com/).
356 | knitr also has a function `imgur_upload()` to upload images from literate programming documents.
357 | - *Teams*, *SharePoint*, and *OneDrive*:
358 | `r pkg("Microsoft365R")` provides an interface to these services,
359 | which form part of the Microsoft 365 (formerly known as Office 365) suite.
360 | 
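As referenced in the Google Sheets item above, reading a public sheet with googlesheets4 can be as simple as this sketch (the spreadsheet URL is a placeholder):

```r
library(googlesheets4)

# For a public ("anyone with the link") sheet, no login is needed.
gs4_deauth()

# read_sheet() accepts a URL, a sheet ID, or a googledrive dribble.
dat <- read_sheet("https://docs.google.com/spreadsheets/d/PLACEHOLDER_ID")
```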
361 | ### Data Processing and Visualization
362 | 
363 | - *Document Processing*:
364 | `r pkg("pdftables")` uses [the PDFTables.com webservice](https://pdftables.com/) to extract tables from PDFs.
365 | - *Visualization*:
366 | Plotly is a service for creating web-based visualizations from R (and Python);
367 | it is accessible via `r pkg("plotly")`.
368 | `r pkg("googleVis")` provides an interface between R and the Google chart tools.
369 | - *Other*:
370 | `r pkg("rrefine")` can import to and export from the 'OpenRefine' data cleaning service.
371 | 
372 | ### Machine Learning and Translation
373 | 
374 | This list describes online services. For a more complete treatment of the topic, please see the *`r view("MachineLearning")`* CRAN Task View.
375 | 
376 | - *Machine Learning as a Service*:
377 | Several packages provide access to cloud-based machine learning services.
378 | `r pkg("OpenML")` is the official client for [the OpenML API](https://www.openml.org/apis).
379 | `r pkg("clarifai")` is a [Clarifai.com](https://www.clarifai.com/) client that enables automated image description.
380 | `r pkg("rLTP")` accesses the [ltp-cloud service](https://www.ltp-cloud.com/).
381 | `r pkg("languagelayeR")` is a client for Languagelayer, a language detection API.
382 | `r pkg("yhatr")` lets you deploy, maintain, and invoke models via the Yhat REST API.
383 | `r pkg("datarobot")` works with Data Robot's predictive modeling platform.
384 | `r pkg("mscsweblm4r")` interfaces with the Microsoft Cognitive Services Web Language Model API and
385 | `r pkg("mscstexta4r")` uses the Microsoft Cognitive Services Text Analytics REST API.
386 | `r pkg("rosetteApi")` links to the 'Rosette' text analysis API.
387 | `r pkg("googleLanguageR")` provides interfaces to Google's
388 | Cloud Translation API, Natural Language API, Cloud Speech API, and Cloud Text-to-Speech API.
389 | `r pkg("AzureVision")` provides interfaces to the Azure Computer Vision and Custom Vision image recognition services.
390 | - *Machine Translation*:
391 | `r pkg("RYandexTranslate")` connects to Yandex Translate.
392 | 
393 | ### Spatial Analysis
394 | 
395 | This list describes online services. For a more complete treatment of the topic, please see the *Analysis of `r view("Spatial")` Data* CRAN Task View.
396 | 
397 | - *Geolocation/Geocoding*:
398 | Services that translate between addresses and longitude/latitude coordinates ('longlats').
399 | `r pkg("rgeolocate")` offers several online and offline tools.
400 | `r github("trestletech/rydn")` is an interface to the
401 | Yahoo Developers network geolocation APIs, and
402 | `r github("hrbrmstr/ipapi")` can be used to geolocate IPv4/6 addresses and/or domain names
403 | using the API.
404 | `r pkg("opencage")` provides access to the 'OpenCage' geocoding service.
405 | `r pkg("nominatimlite")` and
406 | `r github("hrbrmstr/nominatim")` connect to the
407 | OpenStreetMap Nominatim API for reverse geocoding.
408 | `r pkg("PostcodesioR")` provides post code lookup and
409 | geocoding for the United Kingdom.
410 | `r pkg("geosapi")` is an R client for the 'GeoServer' REST API,
411 | an open source implementation used widely for serving spatial data.
412 | `r pkg("geonapi")` provides an interface to the 'GeoNetwork' legacy API,
413 | an open source catalogue for managing geographic metadata.
414 | `r pkg("ows4R")` is a new R client for the 'OGC' standard web services,
415 | such as Web Feature Service (WFS) for data and Catalogue Service for the Web (CSW) for metadata.
416 | 
417 | - *Mapping*:
418 | Services that help create visual maps.
419 |   - *OpenStreetMap*:
420 |     `r github("ropensci/osmplotr")` extracts customizable map images.
421 |   - *Google Maps*:
422 |     `r pkg("RgoogleMaps")` serves two purposes:
423 |     it provides a comfortable R interface to query the Google server for static maps, and
424 |     uses the map as a background image to overlay plots within R.
425 |     `r pkg("mapsapi")` is an sf-compatible interface to the Google Maps API.
426 | 
427 | - *Routing*:
428 | Services that calculate and optimize distances and routes.
429 |   - *OpenStreetMap*:
430 |     `r pkg("osrm")` assists with the computation of routes, trips, isochrones, and travel distance matrices.
431 | 
432 | ### Social Media Clients
433 | 
434 | Each of the following packages provides an interface to its associated service, unless noted otherwise.
435 | 
436 | - *Twitter*:
437 | `r pkg("rtweet")` provides an interface to the Twitter API.
438 | `r github("gvegayon/twitterreport")` focuses on report generation based on Twitter data.
439 | `r pkg("streamR")` allows users to access Twitter's filter,
440 | sample, and user streams, and to parse the output into data frames.
441 | OAuth authentication is supported.
442 | `r pkg("graphTweets")` produces a network graph from a data.frame of tweets.
443 | `r github("pablobarbera/twitter_ideology")` implements a political ideology scaling measure for specified Twitter users.
444 | - *Facebook*:
445 | `r pkg("Rfacebook")`
446 | - *Instagram*:
447 | `r pkg("instaR")`
448 | - *LinkedIn*:
449 | `r pkg("Rlinkedin")`
450 | - *Stack Exchange*:
451 | `r github("dgrtwo/stackr")`
452 | - *Pinterest*:
453 | `r pkg("rpinterest")`
454 | - *VK*:
455 | `r pkg("vkR")` is a client for VK, a social networking site based in Russia.
456 | - *Meetup*:
457 | `r github("rladies/meetupr")`
458 | - *Brandwatch*:
459 | `r pkg("brandwatchR")`
460 | - *Hacker News*:
461 | `r pkg("hackeRnews")`
462 | - *Mastodon*:
463 | `r pkg("rtoot")`
464 | - *Slack*:
465 | `r pkg("slackr")`
466 | - *Discourse*:
467 | `r github("sckott/discgolf")`
468 | provides an interface to an instance of Discourse, not to the Discourse site itself.
469 | 
470 | ### Survey, Questionnaire, and Data Capture Tools
471 | 
472 | - *REDCap*:
473 |   - `r pkg("REDCapR")` and `r pkg("redcapAPI")` export and import data from REDCap,
474 |     a web application for building and managing online surveys and research databases.
475 |   - Another layer of packages provides extensions that streamline many common operations, including
476 |     `r pkg("REDCapTidieR")`,
477 |     `r pkg("tidyREDCap")`,
478 |     `r pkg("ReviewR")`,
479 |     `r pkg("REDCapCAST")`, and
480 |     `r pkg("REDCapDM")`.
481 | - *Qualtrics*:
482 | `r pkg("qualtRics")` provides functions to interact with Qualtrics,
483 | an online survey and data collection software platform.
484 | - *Wufoo*:
485 | `r pkg("WufooR")` retrieves data from Wufoo,
486 | which is another data collection tool from the SurveyMonkey company.
487 | - *formr*:
488 | `r github("rubenarslan/formr")` facilitates use of the formr online survey framework,
489 | which relies on R via OpenCPU.
490 | - *Experigen*:
491 | `r pkg("Rexperigen")` is a client for Experigen,
492 | which is a platform for creating phonology experiments.
493 | - *Usersnap*:
494 | `r github("nealrichardson/useRsnap")` connects to Usersnap,
495 | a tool for collecting feedback from web application users.
496 | - *KoboToolbox*:
497 | `r pkg("robotoolbox")` is a suite of utilities for accessing and manipulating data from the [KoboToolbox](https://www.kobotoolbox.org/) API.
498 | 
499 | ### Web Analytics
500 | 
501 | The following packages interface with *online services* that facilitate web analytics.
502 | 
503 | - *Google*
504 |   - *Google Adwords*:
505 |     `r pkg("RAdwords")`
506 |   - *Google Analytics*:
507 |     `r pkg("googleAnalyticsR")`
508 |   - *Google Trends*:
509 |     `r pkg("gtrendsR")`
510 | 
511 | - *Azure*
512 |   - *Application Insights*:
513 |     `r pkg("AzureAppInsights")`
514 | - *Facebook Marketing*:
515 | `r pkg("fbRads")`
516 | - *Smartly.io*:
517 | `r pkg("RSmartlyIO")` loads Facebook and Instagram advertising data via the advertising service.
518 | 
519 | The following packages interface with *tools* that facilitate web analytics.
520 | 
521 | - `r pkg("webreadr")` can process various common forms of request logs,
522 | including the Common and Combined Web Log formats and AWS logs.
523 | - `r pkg("WebAnalytics")` provides tools for analysis of
524 | web application performance, workload, and user population.
525 | There is some overlap with `webreadr`, but webreadr focuses on reading log files,
526 | while WebAnalytics focuses on analysing them.
527 | 
528 | ### Publications
529 | 
530 | - *Reference/bibliography/citation management*:
531 | `r pkg("rorcid")` connects to the [ORCID.org](https://orcid.org/) API,
532 | which can identify scientific authors and their publications (e.g., by DOI).
533 | `r pkg("rdatacite")` connects to [DataCite](https://datacite.org/),
534 | which manages DOIs and metadata for scholarly datasets.
535 | `r pkg("scholar")` extracts citation data from [Google Scholar](https://scholar.google.com/).
536 | `r pkg("rscopus")` extracts citation data from [Elsevier Scopus](https://www.elsevier.com/solutions/scopus).
537 | Convenience functions are also provided for comparing multiple scholars and
538 | predicting future h-index values.
539 | `r pkg("mathpix")` converts an image of a formula (typeset or handwritten)
540 | into 'LaTeX' code via the Mathpix webservice.
541 | `r pkg("zen4R")` connects to the [Zenodo](https://zenodo.org/) API,
542 | including management of depositions, attribution of DOIs, and upload of files.
543 | 
544 | - *Literature*:
545 | `r pkg("europepmc")` connects to the Europe PubMed Central service.
546 | `r pkg("pubmed.mineR")` is for text mining of [PubMed Abstracts](https://pubmed.ncbi.nlm.nih.gov/)
547 | and supports fetching text and XML from PubMed.
548 | `r pkg("jstor")` retrieves
549 | metadata, ngrams, and full texts from JSTOR's Data for Research service.
550 | `r pkg("aRxiv")` connects to arXiv, a repository of electronic preprints for
551 | computer science, mathematics, physics, quantitative biology, quantitative finance, and statistics.
552 | `r pkg("roadoi")` connects to the [Unpaywall API](https://unpaywall.org/products/api)
553 | for finding free full-text versions of academic papers.
554 | `r pkg("rcrossref")` is an interface to Crossref's API (see the sketch below).
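For instance, querying Crossref for works matching a phrase might look like the following sketch (the column names reflect rcrossref's tidy output and may vary by version):

```r
library(rcrossref)

# Search Crossref for works whose metadata matches a query string.
res <- cr_works(query = "web scraping", limit = 5)

# cr_works() returns a list; its $data element is a tibble of matches.
res$data[, c("doi", "title")]
```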
555 | 
556 | ### Generating Synthetic Data
557 | 
558 | - *MockaRoo API*:
559 | `r github("stephlocke/mockaRoo")` generates mock or fake data based on an input schema.
560 | - *RandomAPI*:
561 | `r github("karthik/randNames")` generates random names and personal identifying information.
562 | 
563 | ### Sports Analytics
564 | 
565 | Many CRAN packages interact with services facilitating sports analysis. For a more complete treatment of the topic, please see the *`r view("SportsAnalytics")`* CRAN Task View.
566 | 
567 | ### Reproducible Research
568 | 
569 | Using packages in this Web Technologies task view can help you acquire data programmatically, which can facilitate reproducible research.
570 | Please see the *`r view("ReproducibleResearch")`* CRAN Task View for more tools and information:
571 | 
572 | > "The goal of reproducible research is to tie specific instructions to data analysis and experimental data so that scholarship can be recreated, understood, and verified."
573 | 
574 | ### Other Web Services
575 | 
576 | - *Push Notifications*:
577 | `r pkg("RPushbullet")` provides an easy-to-use interface for the Pushbullet service,
578 | which provides fast and efficient notifications between
579 | computers, phones, and tablets.
580 | `r pkg("pushoverr")` can send push notifications to mobile devices (iOS and Android) and desktops
581 | using 'Pushover'.
582 | `r pkg("notifyme")` can control Philips Hue lighting.
583 | 
584 | - *Automated Metadata Harvesting*:
585 | `r pkg("oai")` and `r pkg("OAIHarvester")` harvest metadata
586 | using the Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH) standard.
587 | 
588 | - *Wikipedia*:
589 | `r pkg("WikipediR")` is a wrapper for the 'MediaWiki' API,
590 | aimed particularly at the 'Wikimedia' "production" wikis, such as 'Wikipedia'.
591 | `r pkg("WikidataR")` can request data from
592 | [Wikidata.org](https://www.wikidata.org/wiki/Wikidata:Main_Page), the free knowledge base.
593 | `r pkg("WikidataQueryServiceR")` is a client for the [Wikidata Query Service](https://query.wikidata.org/).
594 | 
595 | - `r pkg("rerddap")`: a generic R client for interacting with any ERDDAP instance,
596 | which is a special case of OPeNDAP,
597 | the *Open-source Project for a Network Data Access Protocol*.
598 | It allows the user to swap out the base URL to work with any ERDDAP instance.
599 | 
600 | - `r pkg("duckduckr")` is an R interface to [DuckDuckGo](https://duckduckgo.com/).
601 | 
602 | ## 3. [Frameworks for building web-based R applications]{#frameworks}
603 | 
604 | - [Model Operationalization](https://docs.microsoft.com/en-us/machine-learning-server/what-is-operationalization)
605 | (previously DeployR)
606 | is a Microsoft product that provides support for deploying R and Python models and
607 | code to a server as a web service for later consumption.
608 | - `r pkg("shiny")` makes it easy to build interactive web applications with R.
609 | - `r github("plotly/dashR")` is a web framework that is available for
610 | Python, R, and Julia, with components written in React.js.
611 | - Other web frameworks include:
612 | `r pkg("fiery")`, which is meant to be more flexible, though less easy to use, than shiny
613 | (`r pkg("reqres")` and `r pkg("routr")` are utilities used by fiery
614 | that provide HTTP request and response classes, and HTTP routing, respectively);
615 | `r github("att/rcloud")`, which provides an iPython notebook-style web-based R interface; and
616 | `r pkg("Rook")`, which contains the specification and convenience software
617 | for building and running Rook applications.
618 | - The `r pkg("opencpu")` framework for embedded statistical computation and reproducible research
619 | exposes a web API interfacing R, LaTeX, and Pandoc.
620 | This API is used, for example, to integrate statistical functionality into systems,
621 | share and execute scripts or reports on centralized servers,
622 | and build R-based apps.
623 | - Several general purpose server/client frameworks for R exist.
624 | `r pkg("Rserve")` and
625 | `r pkg("RSclient")`
626 | provide server and client functionality for TCP/IP or local socket interfaces.
627 | `r pkg("httpuv")` provides low-level socket and protocol support
628 | for handling HTTP and WebSocket requests directly within R.
629 | A related package, which `r pkg("httpuv")` arguably supersedes, is `websockets` (retired from CRAN).
630 | `r pkg("servr")` provides a simple HTTP server, based on httpuv, to serve files under a given directory.
631 | - Several packages offer functionality for turning R code into a web API.
632 | `r pkg("FastRWeb")` provides some basic infrastructure for this.
633 | `r pkg("plumber")` allows you to create a REST API by decorating existing R source code.
634 | `r pkg("beakr")` provides an R version of functionality found in Python's Flask and JavaScript's Express.js.
635 | - `r github("omegahat/RDCOMClient")` provides user-level access from R to COM servers.
636 | - `r pkg("radiant")` is a Shiny-based GUI for R that runs in a browser from a server or local machine.
637 | - The 'Tiki' Wiki CMS/Groupware framework has an R plugin (`PluginR`) to run R code from wiki pages,
638 | and use data from their own collected web databases (trackers).
639 | A demo: .
640 | - `r pkg("whisker")`: an implementation of logicless templating based on 'Mustache' in R.
641 |   - Mustache syntax is described in
642 | 
643 | ### Other Useful Packages and Functions
644 | 
645 | - *JavaScript*:
646 | `r pkg("V8")` is an R interface to Google's open source, high performance JavaScript engine.
647 | It can wrap JavaScript libraries as well as NPM packages.
648 | `r pkg("js")` wraps `r pkg("V8")` and validates, reformats, optimizes, and analyzes JavaScript code.
649 | - *Email*:
650 | `r pkg("mailR")` is an interface to Apache Commons Email to send emails from within R.
651 | `r pkg("sendmailR")` provides a simple SMTP client.
652 | `r pkg("gmailr")` provides access to Google's Gmail RESTful API.
653 | `r pkg("Microsoft365R")` provides a client for Microsoft's Outlook email service,
654 | both personal (outlook.com) and
655 | as part of the Microsoft 365 (formerly known as Office 365) suite.
656 | - *Mocking*:
657 | `r pkg("webmockr")` stubs and sets expectations on HTTP requests.
658 | It is inspired by Ruby's `webmock`.
659 | `r pkg("webmockr")` only helps mock HTTP requests, and returns nothing when requests match expectations.
660 | It integrates with `r pkg("crul")` and `r pkg("httr")`.
661 | See *Testing* for mocking with returned responses.
662 | - *Testing*:
663 | `r pkg("vcr")` provides an interface to easily cache HTTP requests in R package test suites
664 | (but can be used outside of testing use cases as well).
665 | vcr relies on `r pkg("webmockr")` to do the HTTP request mocking.
666 | vcr integrates with `r pkg("crul")` and `r pkg("httr")`.
667 | `r pkg("httptest")` provides a framework for testing packages that communicate with HTTP APIs,
668 | offering tools for mocking APIs, for recording real API responses for use as mocks,
669 | and for making assertions about HTTP requests,
670 | all without requiring a live connection to the API server at runtime.
671 | httptest only works with httr.
672 | - *Miscellaneous*:
673 | `r pkg("webutils")` contains various functions for developing web applications,
674 | including parsers for `application/x-www-form-urlencoded` as well as `multipart/form-data`.
675 | `r pkg("mime")` guesses the MIME type for a file from its extension.
676 | `r pkg("rsdmx")` provides tools to read data and metadata documents exchanged through the
677 | Statistical Data and Metadata Exchange (SDMX) framework;
678 | it focuses on the SDMX XML standard format (SDMX-ML).
679 | `r pkg("robotstxt")` provides functions and classes for parsing robots.txt files and
680 | checking access permissions;
681 | `r pkg("spiderbar")` does the same.
682 | `r pkg("uaparserjs")` uses the JavaScript ["ua-parser" library](https://github.com/ua-parser)
683 | to parse User-Agent HTTP headers.
684 | `r pkg("rapiclient")` is a client for consuming APIs that follow the
685 | [Open API format](https://www.openapis.org/).
686 | `r pkg("restfulr")` models a RESTful service as if it were a nested R list.
687 | 
688 | ## 4. [Low-level operations]{#low}
689 | 
690 | ### Tools for Working with URLs
691 | 
692 | - The `httr::parse_url()` function can be used to extract portions of a URL.
693 | The `RCurl::URLencode()` and `utils::URLencode()` functions can be used to encode character strings for use in URLs.
694 | `utils::URLdecode()` decodes back to the original strings.
695 | `r pkg("urltools")` can also handle URL encoding, decoding, parsing, and parameter extraction.
696 | - `r pkg("ipaddress")` facilitates working with IP addresses and networks.
697 | - `r pkg("urlshorteneR")` offers URL expansion and analysis for Bit.ly, Goo.gl, and is.gd.
698 | `r pkg("longurl")` uses the longurl.org API to provide similar functionality.
699 | - `r github("hrbrmstr/gdns")` provides access to Google's secure HTTP-based DNS resolution service.
700 | 
701 | ### Additional tools for internet communication
702 | 
703 | For specialized situations, the following resources may be useful:
704 | 
705 | - `r pkg("RCurl")` is another low-level client for libcurl.
706 | Of the two low-level curl clients, we recommend using `r pkg("curl")`.
707 | `r pkg("httpRequest")` is another low-level package for HTTP requests that implements
708 | the GET, POST, and multipart POST verbs,
709 | but we do not recommend its use.
710 | - `r pkg("request")` provides a high-level package that is useful for developing other API client packages.
711 | `r pkg("httping")` provides simplified tools to ping and time HTTP requests, built around `r pkg("httr")` calls.
712 | `r pkg("httpcache")` provides a mechanism for caching HTTP requests.
713 | - `r pkg("nanonext")` is an alternative low-level sockets implementation that can be used to perform HTTP and
714 | streaming WebSocket requests synchronously or asynchronously over its own concurrency framework.
715 | It uses the NNG/mbedTLS libraries as a backend.
716 | - For dynamically generated webpages (i.e., those requiring user interaction to display results),
717 | `r pkg("RSelenium")` can be used to automate those interactions and extract page contents.
718 | It provides a set of bindings for the Selenium 2.0 webdriver using the 'JsonWireProtocol'.
719 | It can also aid in automated application testing, load testing, and web scraping.
720 | `r pkg("seleniumPipes")` provides a "pipe"-oriented interface to the same.
721 | - *Authentication*: Using web resources can require authentication,
722 | either via API keys, OAuth, a username:password combination, or other means.
723 | Additionally, authentication details sometimes must be sent in the header of an HTTP call,
724 | which requires a little bit of extra work. API keys and username:password combos can be combined
725 | within a URL for a call to a web resource, or can be specified via commands in
726 | `r pkg("RCurl")` or `r pkg("httr2")`.
727 | OAuth is the most complicated authentication process,
728 | and can be most easily done using `r pkg("httr2")` (a minimal sketch follows this list).
729 | 
730 | See the 6 demos within `r pkg("httr")`,
731 | three for OAuth 1.0 (LinkedIn, Twitter, Vimeo) and
732 | three for OAuth 2.0 (Facebook, GitHub, Google).
733 | `r pkg("ROAuth")` provides a separate R interface to OAuth.
734 | OAuth is easier to do in `r pkg("httr")`, so start there.
735 | `r pkg("googleAuthR")` provides an OAuth 2.0 setup specifically for Google web services,
736 | and `r pkg("AzureAuth")` provides similar functionality for Azure Active Directory.
737 | 
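As referenced in the authentication item above, a minimal httr2 sketch of a request authenticated with a bearer token; the endpoint and environment variable are placeholders:

```r
library(httr2)

# Build the request, attach credentials, perform it, and parse the body.
resp <- request("https://api.example.com/v1/items") |>  # placeholder URL
  req_auth_bearer_token(Sys.getenv("EXAMPLE_API_TOKEN")) |>
  req_perform()

resp_body_json(resp)
```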
738 | ### Handling HTTP Errors/Codes
739 | 
740 | - `r pkg("fauxpas")` brings a set of Ruby- or Python-like R6 classes for each individual HTTP status code,
741 | allowing simple and verbose messages, with a choice of using messages, warnings, or stops.
742 | - `r pkg("httpcode")` is a simple package to help a user/package find HTTP status codes and
743 | associated messages by name or number.
744 | 
745 | ### Security
746 | 
747 | - `r github("hrbrmstr/securitytxt")` identifies and parses web security policy ('security.txt') files.
748 | 
749 | ## 5. Resources
750 | 
751 | ### Links
752 | 
753 | - [Omega Project for Statistical Computing](https://OmegaHat.net/):
754 | Open-source packages from authors in (or close to) the R Core Team,
755 | especially for web-based technologies, actively developed 1998-2013.
756 | -------------------------------------------------------------------------------- /scripts/check-if-elevated-to-cran.R: -------------------------------------------------------------------------------- 1 | # See https://github.com/cran-task-views/WebTechnologies/issues/505 2 | 3 | library(stringr) 4 | library(available) 5 | library(purrr) 6 | library(rvest) 7 | 8 | file_content <- readLines("WebTechnologies.md") 9 | 10 | github_packages <- unlist(str_extract_all(file_content, "r github\\(\"([^\"]+)\"\\)")) 11 | github_packages <- str_match(github_packages, "r github\\(\"([^\"]+)\"\\)")[, 2] 12 | github_packages <- str_replace(github_packages, ".*/", "") 13 | 14 | packages <- list() 15 | 16 | # Check each package and print results 17 | for (pkg in github_packages) { 18 | packages[[pkg]] <- available_on_cran(pkg) 19 | } 20 | 21 | on_cran <- map_lgl(packages, function(x) isFALSE(as.logical(x))) 22 | on_cran <- github_packages[on_cran] 23 | 24 | on_cran %>% 25 | rlang::set_names() %>% 26 | purrr::map_lgl(available::available_on_cran) %>% 27 | tibble::enframe( 28 | value = "available_on_cran" 29 | ) 30 | 31 | urls <- sprintf("https://CRAN.R-project.org/package=%s", on_cran) 32 | 33 | # read the urls 34 | 35 | cran_status <- map(urls, ~{ 36 | tryCatch({ 37 | url <- .x 38 | read_html(url) 39 | }, error = function(e) { 40 | e 41 | }) 42 | }) 43 | 44 | cran_status <- map(cran_status, ~{ 45 | if (inherits(.x, "error")) { 46 | NA 47 | } else { 48 | y <- .x %>% 49 | html_nodes("p") %>% 50 | html_text() %>% 51 | str_detect("removed") %>% 52 | any() 53 | y <- ifelse(y, "removed", "available") 54 | } 55 | }) 56 | 57 | on_cran[!grep("removed", unlist(cran_status))] 58 | -------------------------------------------------------------------------------- /scripts/install-test-omega-hat.R: -------------------------------------------------------------------------------- 1 | # install.packages("CGIwithR", repos="http://www.omegahat.net/R") 2 | # library_temp <- base::tempdir(check = TRUE) 3 | # cat(library_temp) 4 | # libraries_existing <- .libPaths() 5 | # .libPaths(c(libraries_existing, library_temp)) 6 | 7 | # cran_dependencies <- c( 8 | # "RCurl", 9 | # "RJSONIO" 10 | # ) 11 | # 12 | # install.packages(cran_dependencies)#, lib = library_temp) 13 | 14 | package_omegahat <- c( 15 | "CGIwithR", 16 | "R2GoogleMaps", 17 | "RAmazonDBREST", 18 | "Rflickr", 19 | "RGoogleDocs", 20 | "RGoogleStorage", 21 | "SSOAP", # Is on GitHub, but doesn't have a description file: https://github.com/omegahat/SSOAP 22 | "SXalan", 23 | "WADL", 24 | "XMLRPC" 25 | ) 26 | package_github <- c( 27 | "Rcompression", 28 | "RDCOMClient", 29 | "RDCOMServer", 30 | "RGoogleTrends", 31 | "RHTMLForms", 32 | "RTidyHTML", 33 | "RUbigraph", 34 | "SpiderMonkey", 35 | "Sxslt", 36 | "XMLSchema" 37 | ) 38 | 39 | outcome <- list() 40 | for(p in package_github) { 41 | message("Installing ", p) 42 | # install.packages( 43 | # p, 44 | # repos = "http://www.omegahat.net/R", 45 | # type = "source" 46 | # #, lib = library_temp 47 | # ) 48 | remotes::install_github( 49 | repo = paste0("omegahat/", p) 50 | ) 51 | 52 | success <- requireNamespace(p, quietly = T) 53 | outcome[[p]] <- success 54 | 55 | message("success: ", success) 56 | if (success) remove.packages(p)#, lib = library_temp) 57 | } 58 | 59 | outcome |> 60 | tibble::enframe( 61 | name = "package", 62 | value = "install_success" 63 | ) |> 64 | dplyr::mutate( 65 | install_success = as.logical(install_success) 66 | ) 67 | 68 | # print(outcome) 69 | 70 | # Remove all the CRAN packages used to test 
Omega Hat 71 | # remove.packages(cran_dependencies)#, lib = library_temp) 72 | 73 | # Restore session's libraries and remove temp library 74 | # .libPaths(libraries_existing) 75 | # unlink(library_temp, recursive = TRUE, force = TRUE) 76 | 77 | # install.packages("SXalan", repos = "http://www.omegahat.net/R", type = "source") 78 | # remotes::install_github("omegahat/SSOAP") 79 | 80 | # === From GitHub ======= 81 | # # A tibble: 10 × 2 82 | # package install_success 83 | # 84 | # 1 Rcompression FALSE # Please define LIB_BZIP2 85 | # 2 RDCOMClient TRUE 86 | # 3 RDCOMServer FALSE # ERROR: dependencies 'SWinRegistry', 'Ruuid' are not available for package 'RDCOMServer' 87 | # 4 RGoogleTrends TRUE 88 | # 5 RHTMLForms TRUE 89 | # 6 RTidyHTML FALSE # make: cc: No such file or directory; make: *** [: access.o] Error 127 90 | # 7 RUbigraph FALSE # undefined exports: runUbigraph 91 | # 8 Sxslt FALSE # Please define LIB_XSLT 92 | # 9 SpiderMonkey FALSE # ERROR: dependencies 'RAutoGenRunTime', 'Rffi' are not available for package 'SpiderMonkey' 93 | # 10 XMLSchema TRUE 94 | 95 | # === From omegahat site ======= 96 | # # A tibble: 20 × 2 97 | # package install_success 98 | # 99 | # 1 CGIwithR TRUE 100 | # 2 R2GoogleMaps TRUE 101 | # 3 RAmazonDBREST TRUE 102 | # 4 Rcompression FALSE # Please define LIB_ZLIB 103 | # 5 RDCOMClient TRUE 104 | # 6 RDCOMServer FALSE # package ‘RDCOMServer’ is not available for this version of R 105 | # 7 Rflickr TRUE 106 | # 8 RGoogleDocs TRUE 107 | # 9 RGoogleStorage TRUE 108 | # 10 RGoogleTrends TRUE 109 | # 11 RHTMLForms TRUE 110 | # 12 RTidyHTML FALSE # make: cc: No such file or directory; make: *** [: access.o] Error 127 111 | # 13 RUbigraph FALSE # undefined exports: runUbigraph 112 | # 14 SpiderMonkey FALSE # ERROR: dependencies 'RAutoGenRunTime', 'Rffi' are not available for package 'SpiderMonkey 113 | # 15 SSOAP TRUE 114 | # 16 SXalan FALSE 115 | # 17 Sxslt FALSE # Please define LIB_XSLT 116 | # 18 WADL TRUE 117 | # 19 XMLRPC TRUE 118 | # 20 XMLSchema TRUE 119 | -------------------------------------------------------------------------------- /scripts/spelling.csv: -------------------------------------------------------------------------------- 1 | AAD 2 | ACI 3 | ACR 4 | ADE 5 | Adwords 6 | ajv 7 | AKS 8 | analysing 9 | analytics 10 | Analytics 11 | api 12 | arxiv 13 | arXiv 14 | AzCopy 15 | AzureR 16 | AzureStor 17 | boilerpipe 18 | Brandwatch 19 | bzip 20 | catalogue 21 | Catalogue 22 | Clarifai 23 | cloudyr 24 | CMS 25 | Crossref's 26 | crul 27 | crul's 28 | CSW 29 | customizable 30 | DataCite 31 | DataONE 32 | Dataverse 33 | dbplyr 34 | de 35 | DeployR 36 | DNS 37 | DOI 38 | DOIs 39 | DOM 40 | dplyr 41 | DuckDuckGo 42 | Elsevier 43 | EMR 44 | ERDDAP 45 | Experigen 46 | Figshare 47 | Flickr 48 | formr 49 | Gandrud 50 | gd 51 | geocoding 52 | Geocoding 53 | geolocate 54 | geolocation 55 | Geolocation 56 | GeoNetwork 57 | geoRSS 58 | GeoServer 59 | gists 60 | github 61 | GitLab 62 | gl 63 | gmail 64 | http 65 | https 66 | httptest 67 | httpuv 68 | httr 69 | httr's 70 | IaaS 71 | Imgur 72 | io 73 | ip 74 | IPC 75 | IPv 76 | iPython 77 | isochrones 78 | Jeroen 79 | jq 80 | js 81 | json 82 | JSON 83 | jsonlite 84 | JsonWireProtocol 85 | JSTOR 86 | Karthik 87 | knitr 88 | Kubernetes 89 | Kusto 90 | Languagelayer 91 | Leeper 92 | libcurl 93 | libtidy 94 | LinkedIn 95 | logicless 96 | longlats 97 | longurl 98 | ltp 99 | ly 100 | magrittr's 101 | Mair 102 | Mathpix 103 | mbedTLS 104 | MediaWiki 105 | MockaRoo 106 | ndjson 107 | ngrams 108 | NNG 109 | Nominatim 110 | NoSQL 
111 | novo
112 | NPM
113 | OAI
114 | OAuth
115 | OGC
116 | OneDrive
117 | Ooms
118 | Openaddresses
119 | OpenCage
120 | OpenCPU
121 | OPeNDAP
122 | OpenML
123 | OpenRefine
124 | OpenStreetMap
125 | Operationalization
126 | Orcid
127 | Paas
128 | Pandoc
129 | parsers
130 | PDFTables
131 | PhantomJS
132 | php
133 | Pinterest
134 | PMH
135 | pre
136 | preprints
137 | programmatically
138 | PubMed
139 | Pushbullet
140 | Qualtrics
141 | RandomAPI
142 | RCurl
143 | REDCap
144 | reformats
145 | RESTful
146 | RHEL
147 | RPC
148 | RStudio
149 | SaaS
150 | scalable
151 | Scopus
152 | SDMX
153 | selectorgadget
154 | selectr
155 | SMTP
156 | SSL
157 | SSOAP
158 | SurveyMonkey
159 | templating
160 | tiki
161 | Tiki
162 | TLS
163 | ua
164 | Ubigraph
165 | un
166 | Unpaywall
167 | Usersnap
168 | vcr
169 | Vimeo
170 | VK
171 | VM
172 | WADL
173 | WebAnalytics
174 | webdriver
175 | webpage's
176 | webreader
177 | webservice
178 | WebSocket
179 | WFS
180 | Wikidata
181 | Wikimedia
182 | wikipedia
183 | Wufoo
184 | XPath
185 | xslt
186 | YAML
187 | Yandex
188 | Yhat
189 | Zenodo
190 | zlib
--------------------------------------------------------------------------------
/scripts/task-view-maintenance.R:
--------------------------------------------------------------------------------
 1 | # Keep current with the advice in
 2 | # https://github.com/cran-task-views/ctv/blob/main/Contributing.md
 3 | 
 4 | task_view_name <- "WebTechnologies"
 5 | path_md   <- sprintf("%s.md"  , task_view_name)
 6 | path_html <- sprintf("%s.html", task_view_name)
 7 | 
 8 | if (base::basename(getwd()) != "WebTechnologies") {
 9 |   stop("The working directory should be the root of the repository.")
10 | } else {
11 |   source("scripts/url-db-from-ctv-md.R")
12 | }
13 | 
14 | # Create the html and visually inspect the content and formatting.
15 | ctv::ctv2html(path_md)
16 | utils::browseURL(path_html)
17 | 
18 | # Check that the information text and the package list
19 | # are consistent and that all packages are available from CRAN:
20 | ctv::check_ctv_packages(path_md)
21 | 
22 | # Determine if any urls need to be updated
23 | # or if any services are defunct (and their entry should be removed).
24 | url_db_from_ctv_md(path_md)
25 | 
26 | # Check spelling
27 | path_spelling <- "scripts/spelling.csv"
28 | 
29 | # Run this block manually to regenerate the ignore list.
30 | if (FALSE) {
31 |   path_md |>
32 |     spelling::spell_check_files() |>
33 |     dplyr::pull("word") |>
34 |     readr::write_lines(path_spelling)
35 | }
36 | 
37 | words_ignore <- readr::read_lines(path_spelling)
38 | spelling::spell_check_files(path_md, words_ignore)
--------------------------------------------------------------------------------
/scripts/url-db-from-ctv-md.R:
--------------------------------------------------------------------------------
 1 | url_db_from_ctv_md <- function(path_md, verbose = TRUE, verbose_row_count = 10L) {
 2 |   if (!inherits(path_md, "character")) stop("`path_md` must be a character.")
 3 |   if (length(path_md) != 1L) stop("Only one `path_md` value is accepted.")
 4 |   if (!inherits(verbose, "logical")) stop("`verbose` must be a logical.")
 5 |   if (length(verbose) != 1L) stop("Only one `verbose` value is accepted.")
 6 |   if (!is.numeric(verbose_row_count)) stop("`verbose_row_count` must be numeric.")
 7 |   if (length(verbose_row_count) != 1L) stop("Only one `verbose_row_count` value is accepted.")
 8 | 
 9 |   # Capture each url, even those with parentheses. See https://stackoverflow.com/a/67942420/1082435
10 |   pattern <- "\\[(?<page_name>[^][]+)\\](\\((?<page_url>(?:[^()]+|(?2))+)\\))"
11 |   # pattern <- "\\[(?<page_name>.+?)\\]\\((?<page_url>.+?)\\)"
12 |   lines <- readr::read_file(path_md)
13 | 
14 |   matches <-
15 |     rematch2::re_match_all(lines, pattern, perl = TRUE)
16 | 
17 |   db <-
18 |     tibble::tibble(
19 |       label = unlist(matches$page_name),
20 |       URL   = unlist(matches$page_url)
21 |     ) |>
22 |     dplyr::distinct() |> # Avoid checking redundant entries
23 |     dplyr::mutate(
24 |       label =
25 |         sub( # strip opening & closing bold/italics
26 |           pattern     = "^([_\\*]{1,2})(.+?)\\1$",
27 |           replacement = "\\2",
28 |           x           = label
29 |           # perl = TRUE
30 |         )
31 |     )
32 | 
33 |   if (verbose) {
34 |     cat("---- URLs to check ----------\n")
35 |     print(db, n = verbose_row_count)
36 |   }
37 | 
38 |   db$Parent <- path_md
39 |   class(db) <- "url_db"
40 | 
41 |   output <- tools:::check_url_db(db, verbose = verbose, parallel = TRUE)
42 | 
43 |   if (nrow(output) == 0L) {
44 |     message("All links in the task view resolved successfully.")
45 |   } else {
46 |     warning("At least one link in the task view needs attention.")
47 |   }
48 | 
49 |   output
50 | }
51 | 
52 | # url_db_from_ctv_md("WebTechnologies.md", verbose_row_count = 20)
--------------------------------------------------------------------------------