├── man ├── figures │ └── test-url-table.png ├── pipe.Rd ├── print.webclient.Rd ├── print.browserinfo.Rd ├── wc_title.Rd ├── wc_go.Rd ├── wc_url.Rd ├── wc_status.Rd ├── wc_browser_info.Rd ├── wc_content_type.Rd ├── wc_load_time.Rd ├── wc_headers.Rd ├── wc_content_length.Rd ├── wc_fill_in.Rd ├── wc_click_on.Rd ├── wc_resize.Rd ├── wc_css.Rd ├── wc_geo.Rd ├── wc_html_nodes.Rd ├── wc_dnt.Rd ├── wc_img_dl.Rd ├── wc_wait.Rd ├── wc_inspect.Rd ├── wc_use_insecure_ssl.Rd ├── wc_timeout.Rd ├── web_client.Rd ├── wc_html_text.Rd ├── htmlunit.Rd ├── wc_render.Rd └── hu_read_html.Rd ├── .gitignore ├── java └── htmlunit │ ├── deps │ ├── dec-0.1.2.jar │ ├── xalan-2.7.2.jar │ ├── commons-io-2.6.jar │ ├── commons-io-2.7.jar │ ├── httpmime-4.5.8.jar │ ├── httpmime-4.5.9.jar │ ├── commons-lang3-3.9.jar │ ├── commons-net-3.6.jar │ ├── commons-net-3.7.jar │ ├── commons-text-1.6.jar │ ├── commons-text-1.7.jar │ ├── commons-text-1.8.jar │ ├── commons-text-1.9.jar │ ├── htmlunit-2.35.0.jar │ ├── htmlunit-2.36.0.jar │ ├── htmlunit-2.38.0.jar │ ├── htmlunit-2.40.0.jar │ ├── htmlunit-2.43.0.jar │ ├── httpclient-4.5.12.jar │ ├── httpclient-4.5.8.jar │ ├── httpclient-4.5.9.jar │ ├── httpcore-4.4.11.jar │ ├── httpcore-4.4.13.jar │ ├── httpmime-4.5.12.jar │ ├── salvation-2.7.1.jar │ ├── salvation-2.7.2.jar │ ├── serializer-2.7.2.jar │ ├── xercesImpl-2.12.0.jar │ ├── xml-apis-1.4.01.jar │ ├── commons-codec-1.11.jar │ ├── commons-lang3-3.10.jar │ ├── commons-lang3-3.11.jar │ ├── commons-logging-1.2.jar │ ├── neko-htmlunit-2.35.0.jar │ ├── neko-htmlunit-2.36.0.jar │ ├── neko-htmlunit-2.38.0.jar │ ├── neko-htmlunit-2.40.0.jar │ ├── neko-htmlunit-2.43.0.jar │ ├── htmlunit-core-js-2.35.0.jar │ ├── htmlunit-core-js-2.36.0.jar │ ├── htmlunit-core-js-2.38.0.jar │ ├── htmlunit-core-js-2.40.0.jar │ ├── htmlunit-core-js-2.43.0.jar │ ├── htmlunit-cssparser-1.4.0.jar │ ├── htmlunit-cssparser-1.5.0.jar │ ├── jetty-http-9.4.16.v20190411.jar │ ├── jetty-http-9.4.20.v20190813.jar │ ├── 
jetty-http-9.4.27.v20200227.jar │ ├── jetty-http-9.4.28.v20200408.jar │ ├── jetty-http-9.4.31.v20200723.jar │ ├── jetty-io-9.4.16.v20190411.jar │ ├── jetty-io-9.4.20.v20190813.jar │ ├── jetty-io-9.4.27.v20200227.jar │ ├── jetty-io-9.4.28.v20200408.jar │ ├── jetty-io-9.4.31.v20200723.jar │ ├── jetty-util-9.4.16.v20190411.jar │ ├── jetty-util-9.4.20.v20190813.jar │ ├── jetty-util-9.4.27.v20200227.jar │ ├── jetty-util-9.4.28.v20200408.jar │ ├── jetty-util-9.4.31.v20200723.jar │ ├── jetty-xml-9.4.16.v20190411.jar │ ├── jetty-xml-9.4.20.v20190813.jar │ ├── jetty-xml-9.4.27.v20200227.jar │ ├── jetty-xml-9.4.28.v20200408.jar │ ├── jetty-xml-9.4.31.v20200723.jar │ ├── jetty-client-9.4.16.v20190411.jar │ ├── jetty-client-9.4.20.v20190813.jar │ ├── jetty-client-9.4.27.v20200227.jar │ ├── jetty-client-9.4.28.v20200408.jar │ ├── jetty-client-9.4.31.v20200723.jar │ ├── websocket-api-9.4.16.v20190411.jar │ ├── websocket-api-9.4.20.v20190813.jar │ ├── websocket-api-9.4.27.v20200227.jar │ ├── websocket-api-9.4.28.v20200408.jar │ ├── websocket-api-9.4.31.v20200723.jar │ ├── websocket-client-9.4.16.v20190411.jar │ ├── websocket-client-9.4.20.v20190813.jar │ ├── websocket-client-9.4.27.v20200227.jar │ ├── websocket-client-9.4.28.v20200408.jar │ ├── websocket-client-9.4.31.v20200723.jar │ ├── websocket-common-9.4.16.v20190411.jar │ ├── websocket-common-9.4.20.v20190813.jar │ ├── websocket-common-9.4.27.v20200227.jar │ ├── websocket-common-9.4.28.v20200408.jar │ └── websocket-common-9.4.31.v20200723.jar │ ├── target │ ├── htmlunit-1.0-SNAPSHOT.jar │ ├── classes │ │ └── is │ │ │ └── rud │ │ │ └── htmlunit │ │ │ ├── Zapp.class │ │ │ ├── Zapp$1.class │ │ │ ├── RDefaultCssErrorHandler.class │ │ │ └── RIncorrectnessListener.class │ ├── maven-archiver │ │ └── pom.properties │ └── maven-status │ │ └── maven-compiler-plugin │ │ └── compile │ │ └── default-compile │ │ ├── createdFiles.lst │ │ └── inputFiles.lst │ ├── src │ └── main │ │ └── java │ │ └── is │ │ └── rud │ │ └── htmlunit │ │ ├── 
RIncorrectnessListener.java │ │ ├── RDefaultCssErrorHandler.java │ │ └── Zapp.java │ ├── pom.xml │ └── Makefile ├── inst ├── java │ └── htmlunit-1.0-SNAPSHOT.jar └── tinytest │ └── test_htmlunit.R ├── tests └── tinytest.R ├── R ├── utils-infix-helpers.R ├── utils-pipe.R ├── zzz.R ├── htmlunit-package.R ├── wc-forms.R ├── wc-click.R ├── wc-as.R ├── wc-inspect.R ├── wc-html-nodes.R ├── wc-status.R ├── hu-read-html.R ├── wc-options.R └── web-client.R ├── .Rbuildignore ├── NEWS.md ├── htmlunit.Rproj ├── NAMESPACE ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── README.Rmd ├── README.md └── LICENSE /man/figures/test-url-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/man/figures/test-url-table.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | .Rproj 6 | src/*.o 7 | src/*.so 8 | src/*.dll 9 | -------------------------------------------------------------------------------- /java/htmlunit/deps/dec-0.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/dec-0.1.2.jar -------------------------------------------------------------------------------- /inst/java/htmlunit-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/inst/java/htmlunit-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/xalan-2.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xalan-2.7.2.jar 
-------------------------------------------------------------------------------- /java/htmlunit/deps/commons-io-2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-io-2.6.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-io-2.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-io-2.7.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpmime-4.5.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.8.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpmime-4.5.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.9.jar -------------------------------------------------------------------------------- /tests/tinytest.R: -------------------------------------------------------------------------------- 1 | 2 | if ( requireNamespace("tinytest", quietly=TRUE) ){ 3 | tinytest::test_package("htmlunit") 4 | } 5 | 6 | -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-lang3-3.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.9.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-net-3.6.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-net-3.6.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-net-3.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-net-3.7.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-text-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.6.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-text-1.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.7.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-text-1.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.8.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-text-1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.9.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-2.35.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.35.0.jar -------------------------------------------------------------------------------- 
/java/htmlunit/deps/htmlunit-2.36.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.36.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-2.38.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.38.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-2.40.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.40.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-2.43.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.43.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpclient-4.5.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.12.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpclient-4.5.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.8.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpclient-4.5.9.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.9.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpcore-4.4.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpcore-4.4.11.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpcore-4.4.13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpcore-4.4.13.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/httpmime-4.5.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.12.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/salvation-2.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/salvation-2.7.1.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/salvation-2.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/salvation-2.7.2.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/serializer-2.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/serializer-2.7.2.jar -------------------------------------------------------------------------------- 
/java/htmlunit/deps/xercesImpl-2.12.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xercesImpl-2.12.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/xml-apis-1.4.01.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xml-apis-1.4.01.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-codec-1.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-codec-1.11.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-lang3-3.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.10.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-lang3-3.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.11.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/commons-logging-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-logging-1.2.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/neko-htmlunit-2.35.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.35.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/neko-htmlunit-2.36.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.36.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/neko-htmlunit-2.38.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.38.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/neko-htmlunit-2.40.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.40.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/neko-htmlunit-2.43.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.43.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-core-js-2.35.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.35.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-core-js-2.36.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.36.0.jar 
-------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-core-js-2.38.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.38.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-core-js-2.40.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.40.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-core-js-2.43.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/htmlunit-cssparser-1.5.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-cssparser-1.5.0.jar -------------------------------------------------------------------------------- /java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- 
/java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-http-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-http-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-io-9.4.20.v20190813.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-io-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-util-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-util-9.4.27.v20200227.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-xml-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-xml-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-client-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-client-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-api-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-api-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-client-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-client-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar -------------------------------------------------------------------------------- 
/java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-common-9.4.20.v20190813.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.20.v20190813.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-common-9.4.27.v20200227.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.27.v20200227.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar -------------------------------------------------------------------------------- /java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar -------------------------------------------------------------------------------- /java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class 
-------------------------------------------------------------------------------- /java/htmlunit/target/maven-archiver/pom.properties: -------------------------------------------------------------------------------- 1 | #Generated by Maven 2 | #Wed Aug 19 08:51:02 EDT 2020 3 | groupId=is.rud.htmlunit 4 | artifactId=htmlunit 5 | version=1.0-SNAPSHOT 6 | -------------------------------------------------------------------------------- /java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class -------------------------------------------------------------------------------- /java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class -------------------------------------------------------------------------------- /R/utils-infix-helpers.R: -------------------------------------------------------------------------------- 1 | `%l0%` <- function(x, y) if (length(x) == 0) y else x 2 | `%||%` <- function(x, y) if (is.null(x)) y else x 3 | `%@%` <- function(x, name) attr(x, name, exact = TRUE) 4 | `%nin%` <- function(x, table) match(x, table, nomatch = 0) == 0 5 | -------------------------------------------------------------------------------- /java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst: -------------------------------------------------------------------------------- 1 | is/rud/htmlunit/RDefaultCssErrorHandler.class 2 | is/rud/htmlunit/RIncorrectnessListener.class 3 | is/rud/htmlunit/Zapp.class 4 | is/rud/htmlunit/Zapp$1.class 5 | 
-------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.*Rmd$ 5 | ^README\.*html$ 6 | ^NOTES\.*Rmd$ 7 | ^NOTES\.*html$ 8 | ^\.codecov\.yml$ 9 | ^README_files$ 10 | ^java$ 11 | ^doc$ 12 | ^docs$ 13 | ^tmp$ 14 | ^notes$ 15 | ^\.gitlab-ci\.yml$ 16 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details. 
11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | stop_logging <- function() { 2 | rJava::J("java.util.logging.LogManager")$getLogManager()$reset() 3 | invisible(NULL) 4 | } 5 | 6 | .onLoad <- function(libname, pkgname) { 7 | rJava::.jpackage(pkgname, jars = "*", lib.loc = libname) 8 | rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE)) 9 | stop_logging() 10 | } 11 | 12 | 13 | -------------------------------------------------------------------------------- /java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst: -------------------------------------------------------------------------------- 1 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java 2 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java 3 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java 4 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 0.5.0 2 | * Updated for 2.43.0 jars 3 | * Added support for Microsoft Edge browser 4 | * Added `timeout`, `css`, and `images` parameters to `wc_inspect()` 5 | 6 | 0.4.0 7 | * Switched to {tinytest} 8 | * Updated for 2.40.0 jars 9 | 10 | 0.3.0 11 | * java 11 compile 12 | * tested against new htmlunit jar release 13 | 14 | 0.2.0 15 | * inspect 16 | 17 | 0.1.0 18 | * Initial release 19 | -------------------------------------------------------------------------------- /java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java: -------------------------------------------------------------------------------- 1 | package is.rud.htmlunit; 2 | 3 | public class 
RIncorrectnessListener implements com.gargoylesoftware.htmlunit.IncorrectnessListener, 4 | java.io.Serializable { 5 | 6 | /** 7 | * {@inheritDoc} 8 | */ 9 | @Override 10 | public void notify(final java.lang.String message, final java.lang.Object origin) { 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /man/print.webclient.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/web-client.R 3 | \name{print.webclient} 4 | \alias{print.webclient} 5 | \title{Print method for \code{webclient} objects} 6 | \usage{ 7 | \method{print}{webclient}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{\code{webclient} object} 11 | 12 | \item{...}{unused} 13 | } 14 | \value{ 15 | \code{x} 16 | } 17 | \description{ 18 | Print method for \code{webclient} objects 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /htmlunit.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | StripTrailingWhitespace: Yes 16 | 17 | BuildType: Package 18 | PackageUseDevtools: Yes 19 | PackageInstallArgs: --no-multiarch --with-keep.source 20 | PackageBuildArgs: --resave-data 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/print.browserinfo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/web-client.R 3 | \name{print.browserinfo} 4 | \alias{print.browserinfo} 5 | 
\title{Print method for \code{browserinfo} objects} 6 | \usage{ 7 | \method{print}{browserinfo}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{\code{browserinfo} object} 11 | 12 | \item{...}{unused} 13 | } 14 | \value{ 15 | \code{x} 16 | } 17 | \description{ 18 | Print method for \code{browserinfo} objects 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/wc_title.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_title} 4 | \alias{wc_title} 5 | \title{Return page title for current page} 6 | \usage{ 7 | wc_title(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | page title of the current page or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return page title for current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 21 | } 22 | -------------------------------------------------------------------------------- /man/wc_go.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/web-client.R 3 | \name{wc_go} 4 | \alias{wc_go} 5 | \title{Visit a URL} 6 | \usage{ 7 | wc_go(wc_obj, url) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{url}{URL to retrieve} 13 | } 14 | \value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Visit a URL 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this function to a 22 | variable as the browser state is managed internally by HtmlUnit.
23 | } 24 | \examples{ 25 | w <- web_client() 26 | wc_go(w, "https://httpbin.org/") 27 | } 28 | -------------------------------------------------------------------------------- /java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java: -------------------------------------------------------------------------------- 1 | package is.rud.htmlunit; 2 | 3 | public class RDefaultCssErrorHandler implements com.gargoylesoftware.css.parser.CSSErrorHandler, 4 | java.io.Serializable { 5 | @Override 6 | public void error(final com.gargoylesoftware.css.parser.CSSParseException exception) { 7 | } 8 | 9 | @Override 10 | public void fatalError(final com.gargoylesoftware.css.parser.CSSParseException exception) { 11 | } 12 | 13 | @Override 14 | public void warning(final com.gargoylesoftware.css.parser.CSSParseException exception) { 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /man/wc_url.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_url} 4 | \alias{wc_url} 5 | \title{Return the URL of the current page} 6 | \usage{ 7 | wc_url(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the URL of the current page or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return the URL of the current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | } 22 | -------------------------------------------------------------------------------- /man/wc_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_status} 4 | \alias{wc_status} 5 | \title{Return status code of web request for current page} 6 | \usage{ 7 | wc_status(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the HTTP status code and message of the web request or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return status code of web request for current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 21 | } 22 | -------------------------------------------------------------------------------- /man/wc_browser_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/web-client.R 3 | \name{wc_browser_info} 4 | \alias{wc_browser_info} 5 | \title{Retrieve information about the browser used to create the \code{webclient}} 6 | \usage{ 7 | wc_browser_info(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the browser version 14 | } 15 | \description{ 16 | Retrieve information about the browser used to create the \code{webclient} 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | } 22 | -------------------------------------------------------------------------------- /man/wc_content_type.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_content_type} 4 | \alias{wc_content_type} 5 | \title{Return content type of web request for current page} 6 | \usage{ 7 | wc_content_type(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the content type of the web request or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return content type of web request for current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 21 | } 22 | -------------------------------------------------------------------------------- /man/wc_load_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_load_time} 4 | \alias{wc_load_time} 5 | \title{Return load time of the last web request for current page} 6 | \usage{ 7 | wc_load_time(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the load time (in ms) of the web request or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return load time of the last web request for current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 
21 | } 22 | -------------------------------------------------------------------------------- /man/wc_headers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_headers} 4 | \alias{wc_headers} 5 | \title{Return response headers of the last web request for current page} 6 | \usage{ 7 | wc_headers(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the response headers of the web request as a data frame or \code{NULL} if 14 | no active page 15 | } 16 | \description{ 17 | Return response headers of the last web request for current page 18 | } 19 | \note{ 20 | This is an information retrieval function that does not return 21 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 22 | } 23 | -------------------------------------------------------------------------------- /man/wc_content_length.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-status.R 3 | \name{wc_content_length} 4 | \alias{wc_content_length} 5 | \title{Return content length of the last web request for current page} 6 | \usage{ 7 | wc_content_length(wc_obj) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | } 12 | \value{ 13 | the content length (in bytes) of the web request or \code{NULL} if no active page 14 | } 15 | \description{ 16 | Return content length of the last web request for current page 17 | } 18 | \note{ 19 | This is an information retrieval function that does not return 20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 
21 | } 22 | -------------------------------------------------------------------------------- /man/wc_fill_in.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-forms.R 3 | \name{wc_fill_in} 4 | \alias{wc_fill_in} 5 | \title{Fill in a input box in a form field} 6 | \usage{ 7 | wc_fill_in(wc_obj, value, css, xpath) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{value}{the value to fill in} 13 | 14 | \item{css, xpath}{Node to select for filling. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.} 15 | } 16 | \description{ 17 | Fill in a input box in a form field 18 | } 19 | \note{ 20 | The caller does not have to assign the output of this function to a 21 | variable as the browser state is managed internally by HtmlUnit. 22 | } 23 | -------------------------------------------------------------------------------- /man/wc_click_on.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-click.R 3 | \name{wc_click_on} 4 | \alias{wc_click_on} 5 | \title{Click on a DOM element in a webclient loaded page} 6 | \usage{ 7 | wc_click_on(wc_obj, css, xpath) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{css, xpath}{Node to click on. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.} 13 | } 14 | \description{ 15 | Click on a DOM element in a webclient loaded page 16 | } 17 | \note{ 18 | The caller does not have to assign the output of this function to a 19 | variable as the browser state is managed internally by HtmlUnit. 
20 | } 21 | \examples{ 22 | w <- web_client() 23 | wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html") 24 | wc_click_on(w, "table") 25 | } 26 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,browserinfo) 4 | S3method(print,webclient) 5 | export("%>%") 6 | export(hu_read_html) 7 | export(wc_browser_info) 8 | export(wc_click_on) 9 | export(wc_content_length) 10 | export(wc_content_type) 11 | export(wc_css) 12 | export(wc_dnt) 13 | export(wc_fill_in) 14 | export(wc_geo) 15 | export(wc_go) 16 | export(wc_headers) 17 | export(wc_html_attr) 18 | export(wc_html_name) 19 | export(wc_html_nodes) 20 | export(wc_html_text) 21 | export(wc_img_dl) 22 | export(wc_inspect) 23 | export(wc_load_time) 24 | export(wc_render) 25 | export(wc_resize) 26 | export(wc_status) 27 | export(wc_timeout) 28 | export(wc_title) 29 | export(wc_url) 30 | export(wc_use_insecure_ssl) 31 | export(wc_wait) 32 | export(web_client) 33 | export(webclient) 34 | import(htmlunitjars) 35 | import(rJava) 36 | import(rvest) 37 | import(xml2) 38 | importFrom(magrittr,"%>%") 39 | -------------------------------------------------------------------------------- /man/wc_resize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_resize} 4 | \alias{wc_resize} 5 | \title{Resize the virtual browser window} 6 | \usage{ 7 | wc_resize(wc_obj, h, w) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{h, w}{height and width (pixels)} 13 | } 14 | \value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Resize the virtual browser window 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this 
function to a 22 | variable as the browser state is managed internally by HtmlUnit. 23 | } 24 | \seealso{ 25 | Other wc_opts: 26 | \code{\link{wc_css}()}, 27 | \code{\link{wc_dnt}()}, 28 | \code{\link{wc_geo}()}, 29 | \code{\link{wc_img_dl}()}, 30 | \code{\link{wc_timeout}()}, 31 | \code{\link{wc_use_insecure_ssl}()}, 32 | \code{\link{wc_wait}()} 33 | } 34 | \concept{wc_opts} 35 | -------------------------------------------------------------------------------- /man/wc_css.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_css} 4 | \alias{wc_css} 5 | \title{Enable/Disable CSS support} 6 | \usage{ 7 | wc_css(wc_obj, enable) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{enable}{if \code{TRUE} enable CSS support (which is the HtmlUnit default)} 13 | } 14 | \value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Enable/Disable CSS support 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this function to a 22 | variable as the browser state is managed internally by HtmlUnit. 
23 | } 24 | \seealso{ 25 | Other wc_opts: 26 | \code{\link{wc_dnt}()}, 27 | \code{\link{wc_geo}()}, 28 | \code{\link{wc_img_dl}()}, 29 | \code{\link{wc_resize}()}, 30 | \code{\link{wc_timeout}()}, 31 | \code{\link{wc_use_insecure_ssl}()}, 32 | \code{\link{wc_wait}()} 33 | } 34 | \concept{wc_opts} 35 | -------------------------------------------------------------------------------- /man/wc_geo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_geo} 4 | \alias{wc_geo} 5 | \title{Enable/Disable Geolocation} 6 | \usage{ 7 | wc_geo(wc_obj, enable) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{enable}{if \code{TRUE} enable geolocation (which is the HtmlUnit default)} 13 | } 14 | \value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Enable/Disable Geolocation 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this function to a 22 | variable as the browser state is managed internally by HtmlUnit. 
23 | } 24 | \seealso{ 25 | Other wc_opts: 26 | \code{\link{wc_css}()}, 27 | \code{\link{wc_dnt}()}, 28 | \code{\link{wc_img_dl}()}, 29 | \code{\link{wc_resize}()}, 30 | \code{\link{wc_timeout}()}, 31 | \code{\link{wc_use_insecure_ssl}()}, 32 | \code{\link{wc_wait}()} 33 | } 34 | \concept{wc_opts} 35 | -------------------------------------------------------------------------------- /man/wc_html_nodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-html-nodes.R 3 | \name{wc_html_nodes} 4 | \alias{wc_html_nodes} 5 | \title{Select nodes from web client active page html content} 6 | \usage{ 7 | wc_html_nodes(wc_obj, css, xpath) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{css, xpath}{Nodes to select. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.} 13 | } 14 | \description{ 15 | Select nodes from web client active page html content 16 | } 17 | \examples{ 18 | \dontrun{ 19 | wc <- web_client() 20 | 21 | wc \%>\% wc_go("https://usa.gov/") 22 | 23 | wc \%>\% 24 | wc_html_nodes("a") \%>\% 25 | sapply(wc_html_text) 26 | 27 | wc \%>\% 28 | wc_html_nodes(xpath=".//a") \%>\% 29 | sapply(wc_html_text) 30 | 31 | wc \%>\% 32 | wc_html_nodes(xpath=".//a") \%>\% 33 | sapply(wc_html_attr, "href") 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/wc_dnt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_dnt} 4 | \alias{wc_dnt} 5 | \title{Enable/Disable Do-Not-Track} 6 | \usage{ 7 | wc_dnt(wc_obj, enable) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{enable}{if \code{TRUE} enable Do-Not-Track support (which is the HtmlUnit default)} 13 | } 14 | 
\value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Enable/Disable Do-Not-Track 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this function to a 22 | variable as the browser state is managed internally by HtmlUnit. 23 | } 24 | \seealso{ 25 | Other wc_opts: 26 | \code{\link{wc_css}()}, 27 | \code{\link{wc_geo}()}, 28 | \code{\link{wc_img_dl}()}, 29 | \code{\link{wc_resize}()}, 30 | \code{\link{wc_timeout}()}, 31 | \code{\link{wc_use_insecure_ssl}()}, 32 | \code{\link{wc_wait}()} 33 | } 34 | \concept{wc_opts} 35 | -------------------------------------------------------------------------------- /man/wc_img_dl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_img_dl} 4 | \alias{wc_img_dl} 5 | \title{Enable/Disable Image Downloading} 6 | \usage{ 7 | wc_img_dl(wc_obj, enable) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{enable}{if \code{TRUE} enable image downloading (the default is not to download images)} 13 | } 14 | \value{ 15 | the \code{webclient} object (invisibly) 16 | } 17 | \description{ 18 | Enable/Disable Image Downloading 19 | } 20 | \note{ 21 | The caller does not have to assign the output of this function to a 22 | variable as the browser state is managed internally by HtmlUnit. 
23 | } 24 | \seealso{ 25 | Other wc_opts: 26 | \code{\link{wc_css}()}, 27 | \code{\link{wc_dnt}()}, 28 | \code{\link{wc_geo}()}, 29 | \code{\link{wc_resize}()}, 30 | \code{\link{wc_timeout}()}, 31 | \code{\link{wc_use_insecure_ssl}()}, 32 | \code{\link{wc_wait}()} 33 | } 34 | \concept{wc_opts} 35 | -------------------------------------------------------------------------------- /man/wc_wait.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_wait} 4 | \alias{wc_wait} 5 | \title{Block HtmlUnit final rendering until all background JavaScript tasks have finished executing} 6 | \usage{ 7 | wc_wait(wc_obj, js_delay = 2000L) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{js_delay}{number of ms to wait/block} 13 | } 14 | \description{ 15 | Block HtmlUnit final rendering until all background JavaScript tasks have finished executing 16 | } 17 | \note{ 18 | The caller does not have to assign the output of this function to a 19 | variable as the browser state is managed internally by HtmlUnit.
20 | } 21 | \seealso{ 22 | Other wc_opts: 23 | \code{\link{wc_css}()}, 24 | \code{\link{wc_dnt}()}, 25 | \code{\link{wc_geo}()}, 26 | \code{\link{wc_img_dl}()}, 27 | \code{\link{wc_resize}()}, 28 | \code{\link{wc_timeout}()}, 29 | \code{\link{wc_use_insecure_ssl}()} 30 | } 31 | \concept{wc_opts} 32 | -------------------------------------------------------------------------------- /man/wc_inspect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-inspect.R 3 | \name{wc_inspect} 4 | \alias{wc_inspect} 5 | \title{Perform a "Developer Tools"-like Network Inspection of a URL} 6 | \usage{ 7 | wc_inspect( 8 | url, 9 | js_delay = 5000L, 10 | timeout = 30000L, 11 | css = FALSE, 12 | images = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{url}{URL to fetch} 17 | 18 | \item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)} 19 | 20 | \item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait. 21 | Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket 22 | connection, the second is for data retrieval. If the time is critical you must allow for twice 23 | the time specified here.} 24 | 25 | \item{css, images}{enable CSS/download images? 
(default \code{FALSE})} 26 | } 27 | \description{ 28 | Retrieves \emph{all} content loaded 29 | } 30 | -------------------------------------------------------------------------------- /man/wc_use_insecure_ssl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_use_insecure_ssl} 4 | \alias{wc_use_insecure_ssl} 5 | \title{Enable/Disable Ignoring SSL Validation Issues} 6 | \usage{ 7 | wc_use_insecure_ssl(wc_obj, enable) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{enable}{if \code{TRUE} the client will accept connections to any host, 13 | regardless of whether they have valid certificates or not} 14 | } 15 | \value{ 16 | the \code{webclient} object (invisibly) 17 | } 18 | \description{ 19 | Enable/Disable Ignoring SSL Validation Issues 20 | } 21 | \note{ 22 | The caller does not have to assign the output of this function to a 23 | variable as the browser state is managed internally by HtmlUnit. 24 | } 25 | \seealso{ 26 | Other wc_opts: 27 | \code{\link{wc_css}()}, 28 | \code{\link{wc_dnt}()}, 29 | \code{\link{wc_geo}()}, 30 | \code{\link{wc_img_dl}()}, 31 | \code{\link{wc_resize}()}, 32 | \code{\link{wc_timeout}()}, 33 | \code{\link{wc_wait}()} 34 | } 35 | \concept{wc_opts} 36 | -------------------------------------------------------------------------------- /man/wc_timeout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-options.R 3 | \name{wc_timeout} 4 | \alias{wc_timeout} 5 | \title{Change default request timeout} 6 | \usage{ 7 | wc_timeout(wc_obj, timeout) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{timeout}{timeout (ms); The timeout is used twice. 
The first is for making 13 | the socket connection, the second is for data retrieval. If the 14 | time is critical you must allow for twice the time specified here.} 15 | } 16 | \value{ 17 | the \code{webclient} object (invisibly) 18 | } 19 | \description{ 20 | Change default request timeout 21 | } 22 | \note{ 23 | The caller does not have to assign the output of this function to a 24 | variable as the browser state is managed internally by HtmlUnit. 25 | } 26 | \seealso{ 27 | Other wc_opts: 28 | \code{\link{wc_css}()}, 29 | \code{\link{wc_dnt}()}, 30 | \code{\link{wc_geo}()}, 31 | \code{\link{wc_img_dl}()}, 32 | \code{\link{wc_resize}()}, 33 | \code{\link{wc_use_insecure_ssl}()}, 34 | \code{\link{wc_wait}()} 35 | } 36 | \concept{wc_opts} 37 | -------------------------------------------------------------------------------- /man/web_client.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/web-client.R 3 | \name{web_client} 4 | \alias{web_client} 5 | \alias{webclient} 6 | \title{Create a new HtmlUnit WebClient instance} 7 | \usage{ 8 | web_client( 9 | emulate = c("best", "chrome", "firefox", "ie", "edge"), 10 | proxy_host = NULL, 11 | proxy_port = NULL 12 | ) 13 | 14 | webclient( 15 | emulate = c("best", "chrome", "firefox", "ie", "edge"), 16 | proxy_host = NULL, 17 | proxy_port = NULL 18 | ) 19 | } 20 | \arguments{ 21 | \item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}", "\code{edge}"} 22 | 23 | \item{proxy_host, proxy_port}{the server/port that will act as proxy (default 24 | \code{NULL} = no proxy)} 25 | } 26 | \value{ 27 | \code{webclient} object 28 | } 29 | \description{ 30 | A new HtmlUnit web client (virtual browser) will be created and a \code{webclient} 31 | object will be returned.
32 | } 33 | \details{ 34 | This is part of the \code{htmlunit} DSL interface. 35 | } 36 | \examples{ 37 | w <- web_client() 38 | wc_browser_info(w) 39 | } 40 | \concept{dsl} 41 | -------------------------------------------------------------------------------- /inst/tinytest/test_htmlunit.R: -------------------------------------------------------------------------------- 1 | 2 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html" 3 | 4 | w <- web_client() 5 | 6 | expect_equal(class(w), "webclient") 7 | expect_equal(class(wc_browser_info(w)), "browserinfo") 8 | 9 | expect_equal(class(wc_go(w, url = test_url)), "webclient") 10 | 11 | expect_equal(wc_url(w), test_url) 12 | expect_equal(wc_title(w), "") 13 | 14 | expect_true(inherits(wc_render(w, "parsed"), "xml_document")) 15 | expect_true(inherits(wc_render(w, "html"), "character")) 16 | expect_true(inherits(wc_render(w, "text"), "character")) 17 | 18 | expect_true(inherits(wc_click_on(w, "table"), "webclient")) 19 | 20 | expect_equal( 21 | wc_html_nodes(w, "title") %>% sapply(wc_html_text), 22 | "" 23 | ) 24 | 25 | expect_equal( 26 | wc_html_nodes(w, "title") %>% sapply(wc_html_name), 27 | "title" 28 | ) 29 | 30 | h <- wc_headers(w) 31 | expect_true(any(h$value == "GitHub.com")) 32 | 33 | expect_inherits( 34 | hu_read_html(url = test_url, ret = "html_document"), 35 | "xml_document" 36 | ) 37 | expect_true( 38 | inherits(hu_read_html(url = test_url, ret = "text"), 39 | "character" 40 | )) 41 | 42 | -------------------------------------------------------------------------------- /man/wc_html_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-html-nodes.R 3 | \name{wc_html_text} 4 | \alias{wc_html_text} 5 | \alias{wc_html_attr} 6 | \alias{wc_html_name} 7 | \title{Extract attributes, text and tag name from webclient page html content} 8 | \usage{ 9 | wc_html_text(dom_node, trim = 
FALSE) 10 | 11 | wc_html_attr(dom_node, attr) 12 | 13 | wc_html_name(dom_node) 14 | } 15 | \arguments{ 16 | \item{dom_node}{a webclient page DOM node (likely produced by \code{\link[=wc_html_nodes]{wc_html_nodes()}})} 17 | 18 | \item{trim}{if \code{TRUE} will trim leading/trailing white space} 19 | 20 | \item{attr}{name of attribute to retrieve} 21 | } 22 | \description{ 23 | Extract attributes, text and tag name from webclient page html content 24 | } 25 | \examples{ 26 | \dontrun{ 27 | wc <- web_client() 28 | 29 | wc \%>\% wc_go("https://usa.gov/") 30 | 31 | wc \%>\% 32 | wc_html_nodes("a") \%>\% 33 | sapply(wc_html_text) 34 | 35 | wc \%>\% 36 | wc_html_nodes(xpath=".//a") \%>\% 37 | sapply(wc_html_text) 38 | 39 | wc \%>\% 40 | wc_html_nodes(xpath=".//a") \%>\% 41 | sapply(wc_html_attr, "href") 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /java/htmlunit/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | is.rud.htmlunit 5 | htmlunit 6 | jar 7 | 1.0-SNAPSHOT 8 | htmlunit 9 | http://maven.apache.org 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 3.1 16 | 17 | 1.7 18 | 1.7 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | net.sourceforge.htmlunit 27 | htmlunit 28 | 2.43.0 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /R/htmlunit-package.R: -------------------------------------------------------------------------------- 1 | #' Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library 2 | #' 3 | #' `HtmlUnit` () is _a "'GUI'-Less 4 | #' browser for 'Java' programs". It models 'HTML' documents and provides an 'API' 5 | #' that allows one to invoke pages, fill out forms, click links and more just like 6 | #' one does in a "normal" browser. 
The library has fairly good and constantly 7 | #' improving 'JavaScript' support and is able to work even with quite complex 'AJAX' 8 | #' libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on 9 | #' the configuration used. It is typically used for testing purposes or to retrieve 10 | #' information from web sites._ 11 | #' 12 | #' Tools are provided to work with this library at a higher level than provided by 13 | #' the exposed 'Java' libraries in the [`htmlunitjars`](https://gitlab.com/hrbrmstr/htmlunitjars) 14 | #' package. 15 | #' 16 | #' - URL: 17 | #' - BugReports: 18 | #' 19 | #' @md 20 | #' @name htmlunit 21 | #' @docType package 22 | #' @author Bob Rudis (bob@@rud.is) 23 | #' @import rvest htmlunitjars rJava xml2 24 | NULL 25 | -------------------------------------------------------------------------------- /java/htmlunit/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean pkg deps run 2 | 3 | pkg: 4 | # JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn --quiet package 5 | JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn --quiet package 6 | cp target/htmlunit-1.0-SNAPSHOT.jar ../../inst/java 7 | 8 | clean: 9 | # JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn clean 10 | JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn clean 11 | 12 | deps: 13 | # JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps 14 | JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps 15 | 16 | new: 17 | # JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false 18 | 
JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false 19 | -------------------------------------------------------------------------------- /man/htmlunit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/htmlunit-package.R 3 | \docType{package} 4 | \name{htmlunit} 5 | \alias{htmlunit} 6 | \title{Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library} 7 | \description{ 8 | \code{HtmlUnit} (\url{http://htmlunit.sourceforge.net/}) is \emph{a "'GUI'-Less 9 | browser for 'Java' programs". It models 'HTML' documents and provides an 'API' 10 | that allows one to invoke pages, fill out forms, click links and more just like 11 | one does in a "normal" browser. The library has fairly good and constantly 12 | improving 'JavaScript' support and is able to work even with quite complex 'AJAX' 13 | libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on 14 | the configuration used. It is typically used for testing purposes or to retrieve 15 | information from web sites.} 16 | } 17 | \details{ 18 | Tools are provided to work with this library at a higher level than provided by 19 | the exposed 'Java' libraries in the \href{https://gitlab.com/hrbrmstr/htmlunitjars}{\code{htmlunitjars}} 20 | package. 
21 | \itemize{ 22 | \item URL: \url{https://gitlab.com/hrbrmstr/htmlunit} 23 | \item BugReports: \url{https://gitlab.com/hrbrmstr/htmlunit/issues} 24 | } 25 | } 26 | \author{ 27 | Bob Rudis (bob@rud.is) 28 | } 29 | -------------------------------------------------------------------------------- /R/wc-forms.R: --------------------------------------------------------------------------------
#' Fill in a input box in a form field
#'
#' Looks up a single element on the current page (by CSS selector or XPath)
#' and sets its value to `value`.
#'
#' @note The caller does not have to assign the output of this function to a
#'       variable as the browser state is managed internally by HtmlUnit.
#' @param wc_obj a `webclient` object
#' @param value the value to fill in
#' @param css,xpath Node to select for filling. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.
#' @return `wc_obj`, or `NULL` if there is no active page. A warning is raised
#'         (and nothing is filled in) when the selector matches nothing.
#' @export
wc_fill_in <- function(wc_obj, value, css, xpath) {

  pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()

  if (.jnull() == pg) return(NULL)

  if (missing(css) && missing(xpath))
    stop("Please supply one of css or xpath", call. = FALSE)

  if (!missing(css) && !missing(xpath))
    stop("Please supply css or xpath, not both", call. = FALSE)

  if (!missing(css)) {

    # must be exactly one character string (the old `&&` test let
    # character vectors of length > 1 slip through)
    if (!is.character(css) || length(css) != 1) stop("`css` must be a string")

    item <- pg$querySelector(css)

  } else {

    if (!is.character(xpath) || length(xpath) != 1)
      stop("`xpath` must be a string")

    # guard the `[[1]]`: an empty result used to fail with
    # "subscript out of bounds" instead of reaching the warning below
    matches <- as.list(pg$getByXPath(xpath))
    item <- if (length(matches) > 0) matches[[1]] else NULL

  }

  if (is.null(item) || is.jnull(item)) {
    warning("No item found with that selector.")
  } else if (item %instanceof% "com.gargoylesoftware.htmlunit.html.HtmlTextArea") {
    # NOTE(review): assumes HtmlTextArea exposes setText(String) -- confirm
    # against the bundled HtmlUnit version
    item$setText(as.character(value))
  } else {
    # NOTE(review): assumes the match is an HtmlInput exposing
    # setValueAttribute(String) -- confirm against the bundled HtmlUnit version
    item$setValueAttribute(as.character(value))
  }

  return(wc_obj)

}
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, we pledge to respect all people who 4 | contribute through reporting issues, posting feature requests, updating documentation, 5 | submitting pull requests or patches, and other activities. 6 | 7 | We are committed to making participation in this project a harassment-free experience for 8 | everyone, regardless of level of experience, gender, gender identity and expression, 9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 10 | 11 | Examples of unacceptable behavior by participants include the use of sexual language or 12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment, 13 | insults, or other unprofessional conduct. 14 | 15 | Project maintainers have the right and responsibility to remove, edit, or reject comments, 16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this 17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 18 | from the project team. 
19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 21 | opening an issue or contacting one or more of the project maintainers. 22 | 23 | This Code of Conduct is adapted from the Contributor Covenant 24 | (http://contributor-covenant.org), version 1.0.0, available at 25 | http://contributor-covenant.org/version/1/0/0/ 26 | -------------------------------------------------------------------------------- /R/wc-click.R: --------------------------------------------------------------------------------
#' Click on a DOM element in a webclient loaded page
#'
#' Looks up a single element on the current page (by CSS selector or XPath)
#' and calls its `click()` method.
#'
#' @note The caller does not have to assign the output of this function to a
#'       variable as the browser state is managed internally by HtmlUnit.
#' @param wc_obj a `webclient` object
#' @param css,xpath Node to click on. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.
#' @return `wc_obj`, or `NULL` if there is no active page. A warning is raised
#'         (and nothing is clicked) when the selector matches nothing.
#' @export
#' @examples
#' w <- web_client()
#' wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
#' wc_click_on(w, "table")
wc_click_on <- function(wc_obj, css, xpath) {

  pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()

  if (.jnull() == pg) return(NULL)

  if (missing(css) && missing(xpath))
    stop("Please supply one of css or xpath", call. = FALSE)

  if (!missing(css) && !missing(xpath))
    stop("Please supply css or xpath, not both", call. = FALSE)

  if (!missing(css)) {

    # must be exactly one character string (the old `&&` test let
    # character vectors of length > 1 slip through)
    if (!is.character(css) || length(css) != 1) stop("`css` must be a string")

    item <- pg$querySelector(css)

  } else {

    if (!is.character(xpath) || length(xpath) != 1)
      stop("`xpath` must be a string")

    # guard the `[[1]]`: an empty result used to fail with
    # "subscript out of bounds" instead of reaching the warning below
    matches <- as.list(pg$getByXPath(xpath))
    item <- if (length(matches) > 0) matches[[1]] else NULL

  }

  if (is.null(item) || is.jnull(item)) {
    warning("No item found with that selector.")
  } else {
    item$click()
  }

  return(wc_obj)

}
-------------------------------------------------------------------------------- /man/wc_render.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wc-as.R 3 | \name{wc_render} 4 | \alias{wc_render} 5 | \title{Retrieve current page contents} 6 | \usage{ 7 | wc_render(wc_obj, what = c("parsed", "html", "text")) 8 | } 9 | \arguments{ 10 | \item{wc_obj}{a \code{webclient} object} 11 | 12 | \item{what}{what to return (see Details); NOTE that if there is no active 13 | page this function returns \code{NULL}.} 14 | } 15 | \value{ 16 | if \code{what} is \code{parsed}, an \code{xml2} \code{html_document}; if \code{html}, 17 | the character HTML representation of the page; if \code{text} 18 | the rendered text of the document as viewed by a human. 19 | } 20 | \description{ 21 | If there is a page in the active browser context, return the contents of 22 | the page. 23 | } 24 | \details{ 25 | The page contents can be returned as one of: 26 | \itemize{ 27 | \item Parsed HTML (i.e. an \code{xml2} \code{html_document}) 28 | \item A string representation of the HTML document. NOTE: The charset used is the 29 | current page encoding. 30 | \item A textual representation of this page that represents what would be visible 31 | to the user if this page was shown in a web browser. 
This is useful for, 32 | say, text mining. 33 | } 34 | } 35 | \note{ 36 | This is an information retrieval function that does not return 37 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe. 38 | } 39 | \examples{ 40 | w <- web_client() 41 | wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html") 42 | wc_render(w, "parsed") 43 | wc_render(w, "html") 44 | wc_render(w, "text") 45 | } 46 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: htmlunit 2 | Type: Package 3 | Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library 4 | Version: 0.5.0 5 | Date: 2020-07-18 6 | Authors@R: c( 7 | person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 8 | comment = c(ORCID = "0000-0001-5670-2640")), 9 | person("Everet", "Rummel", email = "everet.rummel@gmail.com", role = "ctb") 10 | ) 11 | Maintainer: Bob Rudis 12 | Description: 'HtmlUnit' () is a "'GUI'-Less 13 | browser for 'Java' programs". It models 'HTML' documents and provides an 'API' 14 | that allows one to invoke pages, fill out forms, click links and more just like 15 | one does in a "normal" browser. The library has fairly good and constantly 16 | improving 'JavaScript' support and is able to work even with quite complex 'AJAX' 17 | libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on 18 | the configuration used. It is typically used for testing purposes or to retrieve 19 | information from web sites. Tools are provided to work with this library 20 | at a higher level than provided by the exposed 'Java' libraries in the 21 | 'htmlunitjars' package. 
22 | URL: https://github.com/hrbrmstr/htmlunit 23 | Encoding: UTF-8 24 | License: Apache License 2.0 | file LICENSE 25 | Imports: 26 | magrittr 27 | Suggests: 28 | covr, tinytest 29 | Depends: 30 | R (>= 3.6.0), 31 | rJava, 32 | htmlunitjars (>= 2.43.0), 33 | rvest, 34 | xml2 35 | Roxygen: list(markdown = TRUE) 36 | RoxygenNote: 7.1.1 37 | Remotes: github::hrbrmstr/htmlunitjars 38 | -------------------------------------------------------------------------------- /java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java: --------------------------------------------------------------------------------
package is.rud.htmlunit;

import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.util.*;

import java.util.*;
import java.lang.*;
import java.io.*;

/**
 * Helper that loads a URL in a Chrome-emulating {@code WebClient} and collects
 * every {@code WebResponse} that passes through the client's web connection
 * while the page loads.
 */
public class Zapp {

  // Shared handlers installed on every client created by getRequestsFor().
  private static com.gargoylesoftware.htmlunit.IncorrectnessListener incorrectnessListener_ = new RIncorrectnessListener();
  private static com.gargoylesoftware.css.parser.CSSErrorHandler cssErrorHandler_ = new RDefaultCssErrorHandler();

  /**
   * Fetches {@code url} and returns the responses recorded while it loaded.
   *
   * @param url     page to load
   * @param jsDelay value handed to {@code waitForBackgroundJavaScript} after the page is requested
   * @param timeout value handed to {@code WebClientOptions.setTimeout}
   * @param css     passed to {@code setCssEnabled}
   * @param images  passed to {@code setDownloadImages}
   * @return the list of recorded {@code WebResponse} objects, in the order they were received
   * @throws IOException propagated from {@code getPage} / the wrapped connection
   */
  // NOTE(review): the raw List return type and "new ArrayList<>()" suggest a
  // type parameter may be missing here -- confirm against the repository copy.
  public static List getRequestsFor(String url, long jsDelay, int timeout, Boolean css, Boolean images) throws IOException {

    final WebClient webClient = new WebClient(BrowserVersion.CHROME);

    webClient.setCssErrorHandler(cssErrorHandler_);
    webClient.setIncorrectnessListener(incorrectnessListener_);

    WebClientOptions wco = webClient.getOptions();

    // Script errors on the target page must not abort the capture.
    wco.setThrowExceptionOnScriptError(false);
    wco.setCssEnabled(css);
    wco.setDownloadImages(images);
    wco.setTimeout(timeout);

    final List list = new ArrayList<>();

    // The wrapper instance is discarded; presumably its constructor installs
    // it as webClient's connection so getResponse() below sees every request
    // -- confirm against the HtmlUnit WebConnectionWrapper docs.
    new WebConnectionWrapper(webClient) {
      @Override
      public WebResponse getResponse(final WebRequest request) throws IOException {
        final WebResponse response = super.getResponse(request);
        list.add(response);
        return response;
      }
    };

    // Must run after the wrapper is created, otherwise nothing is recorded.
    webClient.getPage(url);
    webClient.waitForBackgroundJavaScript(jsDelay);

    // NOTE(review): webClient is never closed. Callers read the returned
    // responses afterwards, so verify they stay readable after close()
    // before adding one.
    return(list);

  }

}

-------------------------------------------------------------------------------- /R/wc-as.R: -------------------------------------------------------------------------------- 1 | #' Retrieve current page contents 2 | #' 3 | #' If there is a page in the active browser context, return the contents of 4 | #' the page. 5 | #' 6 | #' The page contents can be returned as one of: 7 | #' 8 | #' - Parsed HTML (i.e. an `xml2` `html_document`) 9 | #' - A string representation of the HTML document. NOTE: The charset used is the 10 | #' current page encoding. 11 | #' - A textual representation of this page that represents what would be visible 12 | #' to the user if this page was shown in a web browser. This is useful for, 13 | #' say, text mining. 14 | #' 15 | #' @note This is an information retrieval function that does not return 16 | #' the `wc_obj` so must be the last function call in a `webclient` pipe. 17 | #' @param wc_obj a `webclient` object 18 | #' @param what what to return (see Details); NOTE that if there is no active 19 | #' page this function returns `NULL`. 20 | #' @return if `what` is `parsed`, an `xml2` `html_document`; if `html`, 21 | #' the character HTML representation of the page; if `text` 22 | #' the rendered text of the document as viewed by a human. 
#' @export
#' @examples
#' w <- web_client()
#' wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
#' wc_render(w, "parsed")
#' wc_render(w, "html")
#' wc_render(w, "text")
wc_render <- function(wc_obj, what = c("parsed", "html", "text")) {

  what <- match.arg(what, c("parsed", "html", "text"))

  pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()

  # no active page
  if (.jnull() == pg) return(NULL)

  # The raw web response is not needed here (it used to be fetched and
  # discarded); everything is rendered from the page object itself.
  switch(
    what,
    parsed = xml2::read_html(pg$asXml()),
    html = pg$asXml(),
    text = pg$asText()
  )

}
-------------------------------------------------------------------------------- /R/wc-inspect.R: --------------------------------------------------------------------------------
#' Perform a "Developer Tools"-like Network Inspection of a URL
#'
#' Retrieves _all_ content loaded
#'
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
#'        Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
#'        connection, the second is for data retrieval. If the time is critical you must allow for twice
#'        the time specified here.
#' @param css,images enable CSS/download images? (default `FALSE`)
#' @return a data frame (classed `tbl_df`) with one row per recorded response and
#'         columns `method`, `url`, `status_code`, `message`, `content` (list of
#'         raw vectors), `content_length`, `content_type`, `load_time` and
#'         `headers` (list of name/value data frames)
#' @export
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L, css = FALSE, images = FALSE) {

  app <- J("is.rud.htmlunit.Zapp")

  app$getRequestsFor(
    url,
    .jlong(js_delay),
    as.integer(timeout),
    .jnew("java/lang/Boolean", css),
    .jnew("java/lang/Boolean", images)
  ) -> res

  res <- as.list(res)

  # NOTE(review): `%||%` is not defined in this file; presumably it is the
  # package's "use the right-hand side when the left is NULL" helper.
  lapply(res, function(.x) {

    wr <- .x$getWebRequest()

    # one name/value row per response header
    # (`.h` rather than `.x` so the enclosing response is not shadowed)
    lapply(as.list(.x$getResponseHeaders()), function(.h) {
      data.frame(
        name = .h$getName() %||% NA_character_,
        value = .h$getValue() %||% NA_character_,
        stringsAsFactors = FALSE
      )
    }) -> hdrs

    hdrs <- do.call(rbind.data.frame, hdrs)
    class(hdrs) <- c("tbl_df", "tbl", "data.frame")

    data.frame(
      method = wr$getHttpMethod()$toString() %||% NA_character_,
      url = wr$getUrl()$toString() %||% NA_character_,
      status_code = .x$getStatusCode() %||% NA_integer_,
      message = .x$getStatusMessage() %||% NA_character_,
      # the NULL fallback has to sit inside charToRaw(): applied to the
      # I(list(...)) wrapper it could never trigger, and a NULL body would
      # make charToRaw() error
      content = I(list(charToRaw(.x$getContentAsString() %||% ""))),
      content_length = as.double(.x$getContentLength() %||% NA_real_),
      content_type = .x$getContentType() %||% NA_character_,
      load_time = as.double(.x$getLoadTime() %||% NA_real_),
      headers = I(list(hdrs)),
      stringsAsFactors = FALSE
    )

  }) -> out

  out <- do.call(rbind.data.frame, out)
  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}
-------------------------------------------------------------------------------- /R/wc-html-nodes.R: -------------------------------------------------------------------------------- 1 | #' Select nodes from web client active page html content 2 | #' 3 | #' @md 4 | #' @param wc_obj a `webclient` object 5 | #' @param css,xpath Nodes to select. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector. 
#' @export
#' @examples \dontrun{
#' wc <- web_client()
#'
#' wc %>% wc_go("https://usa.gov/")
#'
#' wc %>%
#'   wc_html_nodes("a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_attr, "href")
#' }
wc_html_nodes <- function(wc_obj, css, xpath) {

  pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()

  # no active page
  if (.jnull() == pg) return(NULL)

  if (missing(css) && missing(xpath))
    stop("Please supply one of css or xpath", call. = FALSE)

  if (!missing(css) && !missing(xpath))
    stop("Please supply css or xpath, not both", call. = FALSE)

  if (!missing(css)) {

    # must be exactly one character string (the old `&&` test let
    # character vectors of length > 1 slip through)
    if (!is.character(css) || length(css) != 1) stop("`css` must be a string")

    out <- pg$querySelectorAll(css)

  } else {

    if (!is.character(xpath) || length(xpath) != 1)
      stop("`xpath` must be a string")

    out <- pg$getByXPath(xpath)

  }

  out

}

#' Extract attributes, text and tag name from webclient page html content
#'
#' @md
#' @param dom_node a webclient page DOM node (likely produced by [wc_html_nodes()])
#' @param trim if `TRUE` will trim leading/trailing white space
#' @export
#' @examples \dontrun{
#' wc <- web_client()
#'
#' wc %>% wc_go("https://usa.gov/")
#'
#' wc %>%
#'   wc_html_nodes("a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_attr, "href")
#' }
wc_html_text <- function(dom_node, trim = FALSE) {
  x <- dom_node$getTextContent()
  if (trim) x <- trimws(x)
  x
}

#' @rdname wc_html_text
#' @export
#' @param attr name of attribute to retrieve
wc_html_attr <- function(dom_node, attr) {
  dom_node$getAttribute(attr)
}

#' @rdname wc_html_text
#' @export
wc_html_name <- function(dom_node) {
  dom_node$getNodeName()
}
-------------------------------------------------------------------------------- /man/hu_read_html.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hu-read-html.R 3 | \name{hu_read_html} 4 | \alias{hu_read_html} 5 | \title{Read HTML from a URL with Browser Emulation & in a JavaScript Context} 6 | \usage{ 7 | hu_read_html( 8 | url, 9 | emulate = c("best", "chrome", "firefox", "ie", "edge"), 10 | ret = c("html_document", "text"), 11 | js_delay = 2000L, 12 | timeout = 30000L, 13 | ignore_ssl_errors = TRUE, 14 | enable_dnt = FALSE, 15 | download_images = FALSE, 16 | options = c("RECOVER", "NOERROR", "NOBLANKS") 17 | ) 18 | } 19 | \arguments{ 20 | \item{url}{URL to retrieve} 21 | 22 | \item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"} 23 | 24 | \item{ret}{what to return; if \code{html_document} (the default) then the HTML created 25 | by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}} 26 | and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes 27 | further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what 28 | \code{HtmlUnit} generated. 
If you want the HTML code (text) without any further 29 | processing then use \code{text} as the value.} 30 | 31 | \item{js_delay}{time (ms) to let loaded javascript to execute; default is 2 seconds (2000 ms)} 32 | 33 | \item{timeout}{overall timeout (ms); \code{0} == infinite wait (not recommended); note: the 34 | timeout is used twice: first in making the socket connection, 35 | second for data retrieval. If the time is critical you must 36 | allow for twice the time specified here. Default 30s (30000 ms)} 37 | 38 | \item{ignore_ssl_errors}{Should SSL/TLS errors be ignored. The default (\code{TRUE}) is 39 | a current hack due to how \code{HtmlUnit} seems to handle virtual hosted sites 40 | with multiple vhosts and multiple certificates. You can try it with \code{FALSE} 41 | initially and revert back to \code{TRUE} if you encounter issues.} 42 | 43 | \item{enable_dnt}{Enable the "Do Not Track" header. Default: \code{FALSE}.} 44 | 45 | \item{download_images}{Download images as the page is loaded? Since this 46 | function is a high-level wrapper designed to do a read of HTML, 47 | it is recommended that you leave this the default \code{FALSE} to save 48 | time/bandwidth.} 49 | 50 | \item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.} 51 | } 52 | \value{ 53 | an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else 54 | the HTML document text generated by \code{HtmlUnit}. 55 | } 56 | \description{ 57 | Use a JavaScript-enabled browser context to read and render HTML from a URL. 58 | } 59 | \details{ 60 | For the code in the examples, this is the site that is being scraped: 61 | 62 | \if{html}{ 63 | \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"} 64 | } 65 | 66 | \if{latex}{ 67 | \figure{test-url-table.png}{options: width=10cm} 68 | } 69 | 70 | Note that it has a table of values but it is rendered via JavaScript. 
71 | } 72 | \examples{ 73 | \dontrun{ 74 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html" 75 | hu_read_html(test_url) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /R/wc-status.R: --------------------------------------------------------------------------------
#' Return status code of web request for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return a list with the HTTP `status_code` and `message` of the web request,
#'         or `NULL` if no active page
#' @export
wc_status <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  resp <- page$getWebResponse()

  list(
    status_code = resp$getStatusCode(),
    message = resp$getStatusMessage()
  )

}

#' Return content type of web request for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return the content type of the web request or `NULL` if no active page
#' @export
wc_content_type <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  page$getWebResponse()$getContentType()

}

#' Return content length of the last web request for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return the content length (in bytes) of the web request or `NULL` if no active page
#' @export
wc_content_length <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  page$getWebResponse()$getContentLength()

}

#' Return load time of the last web request for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return the load time (in ms) of the web request or `NULL` if no active page
#' @export
wc_load_time <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  page$getWebResponse()$getLoadTime()

}

#' Return the URL of the current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return the URL of the current page as a string or `NULL` if no active page
#' @export
wc_url <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  page$getUrl()$toString()

}

#' Return page title for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return page title of the current page or `NULL` if no active page
#' @export
wc_title <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  page$getTitleText()

}

#' Return response headers of the last web request for current page
#'
#' @note This is an information retrieval function that does not return
#'       the `wc_obj` so must be the last function call in a `webclient` pipe.
#' @param wc_obj a `webclient` object
#' @return the response headers of the web request as a data frame (columns
#'         `name` and `value`) or `NULL` if no active page
#' @export
wc_headers <- function(wc_obj) {

  page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
  if (.jnull() == page) return(NULL)

  # one list(name, value) per header, then row-bind them into a data frame
  header_rows <- lapply(
    as.list(page$getWebResponse()$getResponseHeaders()),
    function(hdr) list(name = hdr$getName(), value = hdr$getValue())
  )

  out <- do.call(rbind.data.frame, c(header_rows, stringsAsFactors=FALSE))

  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}

-------------------------------------------------------------------------------- /R/hu-read-html.R: -------------------------------------------------------------------------------- 1 | #' Read HTML from a URL with Browser Emulation & in a JavaScript Context 2 | #' 3 | #' Use a JavaScript-enabled browser context to read and render HTML from a URL. 
4 | #' 5 | #' For the code in the examples, this is the site that is being scraped: 6 | #' 7 | #' \if{html}{ 8 | #' \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"} 9 | #' } 10 | #' 11 | #' \if{latex}{ 12 | #' \figure{test-url-table.png}{options: width=10cm} 13 | #' } 14 | #' 15 | #' Note that it has a table of values but it is rendered via JavaScript. 16 | #' 17 | #' @param url URL to retrieve 18 | #' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`", "`edge`" 19 | #' @param ret what to return; if `html_document` (the default) then the HTML created 20 | #' by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()] 21 | #' and an `xml2` `html_document`/`xml_document` is returned. Note that this causes 22 | #' further HTML processing by `xml2`/`libxml2` so is not _exactly_ what 23 | #' `HtmlUnit` generated. If you want the HTML code (text) without any further 24 | #' processing then use `text` as the value. 25 | #' @param js_delay time (ms) to allow loaded JavaScript to execute; default is 2 seconds (2000 ms) 26 | #' @param timeout overall timeout (ms); `0` == infinite wait (not recommended); note: the 27 | #' timeout is used twice: first in making the socket connection, 28 | #' second for data retrieval. If the time is critical you must 29 | #' allow for twice the time specified here. Default 30s (30000 ms) 30 | #' @param ignore_ssl_errors Should SSL/TLS errors be ignored? The default (`TRUE`) is 31 | #' a current hack due to how `HtmlUnit` seems to handle virtual hosted sites 32 | #' with multiple vhosts and multiple certificates. You can try it with `FALSE` 33 | #' initially and revert back to `TRUE` if you encounter issues. 34 | #' @param enable_dnt Enable the "Do Not Track" header. Default: `FALSE`. 35 | #' @param download_images Download images as the page is loaded? 
Since this 36 | #' function is a high-level wrapper designed to do a read of HTML, 37 | #' it is recommended that you leave this the default `FALSE` to save 38 | #' time/bandwidth. 39 | #' @param options options to pass to [xml2::read_html()] if `ret` == `html_document`. 40 | #' @return an `xml2` `html_document`/`xml_document` if `ret` == `html_document` else 41 | #' the HTML document text generated by `HtmlUnit`. 42 | #' @export 43 | #' @examples \dontrun{ 44 | #' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html" 45 | #' hu_read_html(test_url) 46 | #' } 47 | hu_read_html <- function(url, 48 | emulate = c("best", "chrome", "firefox", "ie", "edge"), 49 | ret = c("html_document", "text"), 50 | js_delay = 2000L, 51 | timeout = 30000L, 52 | ignore_ssl_errors = TRUE, 53 | enable_dnt = FALSE, 54 | download_images = FALSE, 55 | options = c("RECOVER", "NOERROR", "NOBLANKS")) { 56 | 57 | emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge")) 58 | ret <- match.arg(ret, c("html_document", "text")) 59 | 60 | available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion") 61 | 62 | switch( 63 | emulate, 64 | best = available_browsers$BEST_SUPPORTED, 65 | chrome = available_browsers$CHROME, 66 | firefox = available_browsers$FIREFOX, 67 | edge = available_browsers$EDGE, 68 | ie = available_browsers$INTERNET_EXPLORER 69 | ) -> use_browser 70 | 71 | wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser) 72 | on.exit(wc$close(), add = TRUE) # this client is private to the call: always release its windows and background JavaScript threads 73 | cssErrorHandler <- .jnew("is.rud.htmlunit.RDefaultCssErrorHandler") 74 | wc$setCssErrorHandler(cssErrorHandler) 75 | 76 | incorrectListenerHandler <- .jnew("is.rud.htmlunit.RIncorrectnessListener") 77 | wc$setIncorrectnessListener(incorrectListenerHandler) 78 | 79 | # NOTE: the background-JavaScript wait is issued after getPage() below; no page (and so no script) exists yet at this point 80 | 81 | wc_opts <- wc$getOptions() 82 | wc_opts$setThrowExceptionOnFailingStatusCode(FALSE) 83 | wc_opts$setThrowExceptionOnScriptError(FALSE) 84 | 
wc_opts$setTimeout(as.integer(timeout)) 85 | 86 | if (ignore_ssl_errors) wc_opts$setUseInsecureSSL(TRUE) 87 | if (enable_dnt) wc_opts$setDoNotTrackEnabled(TRUE) 88 | if (download_images) wc_opts$setDownloadImages(TRUE) 89 | 90 | pg <- wc$getPage(url) 91 | wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay))) # let the loaded page's background JavaScript run (this is what `js_delay` documents) 92 | # response <- pg$getWebResponse() 93 | # content <- response$getContentAsString() 94 | 95 | if (ret == "html_document") return(xml2::read_html(pg$asXml(), options = options)) 96 | # NOTE(review): `asText()` appears to give the page's text rendering rather than HTML source, unlike what the docs above promise; confirm which is intended 97 | return(pg$asText()) 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /R/wc-options.R: -------------------------------------------------------------------------------- 1 | #' Block HtmlUnit final rendering until all background JavaScript tasks have finished executing 2 | #' 3 | #' @note The caller does not have to assign the output of this function to a 4 | #' variable as the browser state is managed internally by HtmlUnit. 5 | #' @param wc_obj a `webclient` object 6 | #' @param js_delay number of ms to wait/block 7 | #' @family wc_opts 8 | #' @export 9 | wc_wait <- function(wc_obj, js_delay = 2000L) { 10 | 11 | res <- wc_obj$wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay))) 12 | 13 | invisible(wc_obj) 14 | 15 | } 16 | 17 | #' Enable/Disable CSS support 18 | #' 19 | #' @note The caller does not have to assign the output of this function to a 20 | #' variable as the browser state is managed internally by HtmlUnit. 
21 | #' @param wc_obj a `webclient` object 22 | #' @param enable if `TRUE` enable CSS support (which is the HtmlUnit default) 23 | #' @return the `webclient` object (invisibly) 24 | #' @family wc_opts 25 | #' @export 26 | wc_css <- function(wc_obj, enable) { 27 | 28 | wc_obj$wc_opts$setCssEnabled(enable) 29 | 30 | invisible(wc_obj) 31 | 32 | } 33 | 34 | #' Enable/Disable Do-Not-Track 35 | #' 36 | #' @note The caller does not have to assign the output of this function to a 37 | #' variable as the browser state is managed internally by HtmlUnit. 38 | #' @param wc_obj a `webclient` object 39 | #' @param enable if `TRUE` enable the Do-Not-Track header (`web_client()` creates clients with it disabled) 40 | #' @return the `webclient` object (invisibly) 41 | #' @family wc_opts 42 | #' @export 43 | wc_dnt <- function(wc_obj, enable) { 44 | 45 | wc_obj$wc_opts$setDoNotTrackEnabled(enable) 46 | 47 | invisible(wc_obj) 48 | 49 | } 50 | 51 | #' Enable/Disable Image Downloading 52 | #' 53 | #' @note The caller does not have to assign the output of this function to a 54 | #' variable as the browser state is managed internally by HtmlUnit. 55 | #' @param wc_obj a `webclient` object 56 | #' @param enable if `TRUE` enable image downloading (the default is not to download images) 57 | #' @return the `webclient` object (invisibly) 58 | #' @family wc_opts 59 | #' @export 60 | wc_img_dl <- function(wc_obj, enable) { 61 | 62 | wc_obj$wc_opts$setDownloadImages(enable) 63 | 64 | invisible(wc_obj) 65 | 66 | } 67 | 68 | #' Enable/Disable Geolocation 69 | #' 70 | #' @note The caller does not have to assign the output of this function to a 71 | #' variable as the browser state is managed internally by HtmlUnit. 
72 | #' @param wc_obj a `webclient` object 73 | #' @param enable if `TRUE` enable geolocation (`web_client()` creates clients with it enabled) 74 | #' @return the `webclient` object (invisibly) 75 | #' @family wc_opts 76 | #' @export 77 | wc_geo <- function(wc_obj, enable) { 78 | 79 | wc_obj$wc_opts$setGeolocationEnabled(enable) 80 | 81 | invisible(wc_obj) 82 | 83 | } 84 | 85 | #' Change default request timeout 86 | #' 87 | #' @note The caller does not have to assign the output of this function to a 88 | #' variable as the browser state is managed internally by HtmlUnit. 89 | #' @param wc_obj a `webclient` object 90 | #' @param timeout timeout (ms); coerced to integer. The timeout is used twice. The first is for making 91 | #' the socket connection, the second is for data retrieval. If the 92 | #' time is critical you must allow for twice the time specified here. 93 | #' @return the `webclient` object (invisibly) 94 | #' @family wc_opts 95 | #' @export 96 | wc_timeout <- function(wc_obj, timeout) { 97 | # coerce to integer (as `hu_read_html()` does) so a plain numeric such as `5000` maps onto the Java `int` parameter 98 | wc_obj$wc_opts$setTimeout(as.integer(timeout)) 99 | 100 | invisible(wc_obj) 101 | 102 | } 103 | 104 | #' Resize the virtual browser window 105 | #' 106 | #' @note The caller does not have to assign the output of this function to a 107 | #' variable as the browser state is managed internally by HtmlUnit. 108 | #' @param wc_obj a `webclient` object 109 | #' @param h,w height and width (pixels); coerced to integer 110 | #' @return the `webclient` object (invisibly) 111 | #' @family wc_opts 112 | #' @export 113 | wc_resize <- function(wc_obj, h, w) { 114 | # coerce to integer so plain numerics such as `768` map onto the Java `int` parameters 115 | wc_obj$wc_opts$setScreenHeight(as.integer(h)) 116 | wc_obj$wc_opts$setScreenWidth(as.integer(w)) 117 | 118 | invisible(wc_obj) 119 | 120 | } 121 | 122 | #' Enable/Disable Ignoring SSL Validation Issues 123 | #' 124 | #' @note The caller does not have to assign the output of this function to a 125 | #' variable as the browser state is managed internally by HtmlUnit. 
126 | #' @param wc_obj a `webclient` object 127 | #' @param enable if `TRUE` the client will accept connections to any host, 128 | #' regardless of whether they have valid certificates or not 129 | #' @return the `webclient` object (invisibly) 130 | #' @family wc_opts 131 | #' @export 132 | wc_use_insecure_ssl <- function(wc_obj, enable) { 133 | 134 | wc_obj$wc_opts$setUseInsecureSSL(enable) 135 | 136 | invisible(wc_obj) 137 | 138 | } 139 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | rmarkdown::github_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | ```{r pkg-knitr-opts, include=FALSE} 8 | hrbrpkghelpr::global_opts() 9 | ``` 10 | 11 | ```{r badges, results='asis', echo=FALSE, cache=FALSE} 12 | hrbrpkghelpr::stinking_badges() 13 | ``` 14 | 15 | ```{r description, results='asis', echo=FALSE, cache=FALSE} 16 | hrbrpkghelpr::yank_title_and_description() 17 | ``` 18 | 19 | ## What's Inside The Tin 20 | 21 | The following functions are implemented: 22 | 23 | ### DSL 24 | 25 | - `web_client`/`webclient`: Create a new HtmlUnit WebClient instance

26 | 27 | - `wc_go`: Visit a URL
28 | 29 | - `wc_html_nodes`: Select nodes from web client active page html content 30 | - `wc_html_text`: Extract attributes, text and tag name from webclient page html content

31 | - `wc_html_attr`: Extract attributes, text and tag name from webclient page html content 32 | - `wc_html_name`: Extract attributes, text and tag name from webclient page html content 33 | 34 | - `wc_headers`: Return response headers of the last web request for current page 35 | - `wc_browser_info`: Retrieve information about the browser used to create the 'webclient' 36 | - `wc_content_length`: Return content length of the last web request for current page 37 | - `wc_content_type`: Return content type of web request for current page

38 | 39 | - `wc_render`: Retrieve current page contents

40 | 41 | - `wc_css`: Enable/Disable CSS support 42 | - `wc_dnt`: Enable/Disable Do-Not-Track 43 | - `wc_geo`: Enable/Disable Geolocation 44 | - `wc_img_dl`: Enable/Disable Image Downloading 45 | - `wc_load_time`: Return load time of the last web request for current page 46 | - `wc_resize`: Resize the virtual browser window 47 | - `wc_status`: Return status code of web request for current page 48 | - `wc_timeout`: Change default request timeout 49 | - `wc_title`: Return page title for current page 50 | - `wc_url`: Return the URL of the current page 51 | - `wc_use_insecure_ssl`: Enable/Disable Ignoring SSL Validation Issues 52 | - `wc_wait`: Block HtmlUnit final rendering until all background JavaScript tasks have finished executing 53 | 54 | ### Just the Content (pls) 55 | 56 | - `hu_read_html`: Read HTML from a URL with Browser Emulation & in a JavaScript Context 57 | 58 | ### Content++ 59 | 60 | - `wc_inspect`: Perform a "Developer Tools"-like Network Inspection of a URL 61 | 62 | ## Installation 63 | 64 | ```{r install-ex, results='asis', echo=FALSE, cache=FALSE} 65 | hrbrpkghelpr::install_block() 66 | ``` 67 | 68 | ## Usage 69 | 70 | ```{r cache=FALSE} 71 | library(htmlunit) 72 | library(tidyverse) # for some data ops; not req'd for pkg 73 | 74 | # current version 75 | packageVersion("htmlunit") 76 | 77 | ``` 78 | 79 | Something `xml2::read_html()` cannot do, read the table from <https://hrbrmstr.github.io/htmlunitjars/index.html>: 80 | 81 | ![](man/figures/test-url-table.png) 82 | 83 | ```{r ex1} 84 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html" 85 | 86 | pg <- xml2::read_html(test_url) 87 | 88 | html_table(pg) 89 | ``` 90 | 91 | ☹️ 92 | 93 | But, `hu_read_html()` can! 94 | 95 | ```{r ex2} 96 | pg <- hu_read_html(test_url) 97 | 98 | html_table(pg) 99 | ``` 100 | 101 | All without needing a separate Selenium or Splash server instance. 
102 | 103 | ### Content++ 104 | 105 | We can also get a HAR-like content + metadata dump: 106 | 107 | ```{r ex3} 108 | xdf <- wc_inspect("https://rstudio.com") 109 | 110 | colnames(xdf) 111 | 112 | select(xdf, method, url, status_code, content_length, load_time) 113 | 114 | group_by(xdf, content_type) %>% 115 | summarise( 116 | total_size = sum(content_length), 117 | total_load_time = sum(load_time)/1000 118 | ) 119 | ``` 120 | 121 | ### DSL 122 | 123 | ```{r ex4} 124 | wc <- web_client(emulate = "chrome") 125 | 126 | wc %>% wc_browser_info() 127 | 128 | wc <- web_client() 129 | 130 | wc %>% wc_go("https://usa.gov/") 131 | 132 | # if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list() 133 | 134 | wc %>% 135 | wc_html_nodes("a") %>% 136 | sapply(wc_html_text, trim = TRUE) %>% 137 | head(10) 138 | 139 | wc %>% 140 | wc_html_nodes(xpath=".//a") %>% 141 | sapply(wc_html_text, trim = TRUE) %>% 142 | head(10) 143 | 144 | wc %>% 145 | wc_html_nodes(xpath=".//a") %>% 146 | sapply(wc_html_attr, "href") %>% 147 | head(10) 148 | ``` 149 | 150 | Handy function to get rendered plain text for text mining: 151 | 152 | ```{r ex5} 153 | wc %>% 154 | wc_render("text") %>% 155 | substr(1, 300) %>% 156 | cat() 157 | ``` 158 | 159 | ### htmlunit Metrics 160 | 161 | ```{r echo=FALSE} 162 | cloc::cloc_pkg_md() 163 | ``` 164 | 165 | ## Code of Conduct 166 | 167 | Please note that this project is released with a Contributor Code of Conduct. 168 | By participating in this project you agree to abide by its terms. 169 | -------------------------------------------------------------------------------- /R/web-client.R: -------------------------------------------------------------------------------- 1 | #' Create a new HtmlUnit WebClient instance 2 | #' 3 | #' A new HtmlUnit web client (virtual browser) will be created and a `webclient` 4 | #' object will be returned. 
5 | #' 6 | #' This is part of the `htmlunit` DSL interface. 7 | #' 8 | #' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`", "`edge`" 9 | #' @param proxy_host,proxy_port the server/port that will act as proxy (default 10 | #' `NULL` = no proxy); both must be supplied together to use a proxy 11 | #' @return `webclient` object 12 | #' @family dsl 13 | #' @export 14 | #' @examples 15 | #' w <- web_client() 16 | #' wc_browser_info(w) 17 | web_client <- function(emulate = c("best", "chrome", "firefox", "ie", "edge"), 18 | proxy_host = NULL, proxy_port = NULL) { 19 | if (xor(is.null(proxy_host), is.null(proxy_port))) stop("`proxy_host` and `proxy_port` must be supplied together", call. = FALSE) # a proxy needs both a host and a port 20 | emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge")) 21 | available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion") 22 | 23 | switch( 24 | emulate, 25 | best = available_browsers$BEST_SUPPORTED, 26 | chrome = available_browsers$CHROME, 27 | firefox = available_browsers$FIREFOX, 28 | edge = available_browsers$EDGE, 29 | ie = available_browsers$INTERNET_EXPLORER 30 | ) -> use_browser 31 | # use the proxy-aware constructor only when a proxy was requested (these arguments were previously accepted but ignored) 32 | wc <- if (is.null(proxy_host)) new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser) else new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser, as.character(proxy_host), as.integer(proxy_port)) 33 | 34 | wc$getOptions()$setThrowExceptionOnFailingStatusCode(FALSE) 35 | wc$getOptions()$setThrowExceptionOnScriptError(FALSE) 36 | wc$getOptions()$setDownloadImages(FALSE) 37 | wc$getOptions()$setJavaScriptEnabled(TRUE) 38 | wc$getOptions()$setCssEnabled(TRUE) 39 | wc$getOptions()$setDoNotTrackEnabled(FALSE) 40 | wc$getOptions()$setGeolocationEnabled(TRUE) 41 | wc$getOptions()$setPopupBlockerEnabled(FALSE) 42 | wc$getOptions()$setPrintContentOnFailingStatusCode(TRUE) 43 | wc$getOptions()$setRedirectEnabled(TRUE) 44 | 45 | list( 46 | wc = wc, 47 | wc_opts = wc$getOptions() 48 | ) -> wc_obj 49 | 50 | class(wc_obj) <- c("webclient") 51 | 52 | invisible(wc_obj) 53 | 54 | } 55 | 56 | #' @rdname web_client 57 | #' @export 58 | webclient <- web_client 59 | 60 | #' Visit a URL 61 | #' 62 | #' @note The caller does not have to assign the output of this function to a 63 | #' variable as the browser state is managed internally by HtmlUnit. 
64 | #' @param wc_obj a `webclient` object 65 | #' @param url URL to retrieve 66 | #' @return the `webclient` object (invisibly) 67 | #' @export 68 | #' @examples 69 | #' w <- web_client() 70 | #' wc_go(w, "https://httpbin.org/") 71 | wc_go <- function(wc_obj, url) { 72 | 73 | wc_obj$wc$getPage(url) 74 | 75 | invisible(wc_obj) 76 | 77 | } 78 | 79 | #' Retrieve information about the browser used to create the `webclient` 80 | #' 81 | #' @note This is an information retrieval function that does not return 82 | #' the `wc_obj` so must be the last function call in a `webclient` pipe. 83 | #' @param wc_obj a `webclient` object 84 | #' @return a `browserinfo` object (a list with `name`, `version` and `language`) 85 | #' @export 86 | wc_browser_info <- function(wc_obj) { 87 | 88 | bv <- wc_obj$wc$getBrowserVersion() 89 | 90 | list( 91 | name = bv$getApplicationName(), 92 | version = bv$getApplicationVersion(), 93 | language = bv$getBrowserLanguage() 94 | ) -> bv_lst 95 | 96 | class(bv_lst) <- "browserinfo" 97 | 98 | bv_lst 99 | 100 | } 101 | 102 | #' Print method for `browserinfo` objects 103 | #' @keywords internal 104 | #' @param x `browserinfo` object 105 | #' @param ... unused 106 | #' @return `x` 107 | #' @export 108 | print.browserinfo <- function(x, ...) { 109 | 110 | cat( 111 | sprintf("< %s / %s / %s >\n", x$name, x$version, x$language) 112 | ) 113 | 114 | invisible(x) 115 | 116 | } 117 | 118 | 119 | # Closes all virtual browser opened windows & stop all background JavaScript processing 120 | # 121 | # @param wc_obj a `webclient` object 122 | # @return the `webclient` object (invisibly) 123 | # @export 124 | # wc_go <- function(wc_obj, url) { 125 | # 126 | # wc_obj$wc$getPage(url) 127 | # 128 | # invisible(wc_obj) 129 | # 130 | # } 131 | 132 | 133 | #' Print method for `webclient` objects 134 | #' @keywords internal 135 | #' @param x `webclient` object 136 | #' @param ... unused 137 | #' @return `x` 138 | #' @export 139 | print.webclient <- function(x, ...) 
{ 140 | 141 | bv <- x$wc$getBrowserVersion() 142 | # header line: emulated browser name / version / language (the format previously had no placeholders, so these values were never shown) 143 | cat( 144 | sprintf( 145 | "<webclient: %s / %s / %s>\n", 146 | bv$getApplicationName(), 147 | bv$getApplicationVersion(), 148 | bv$getBrowserLanguage()) 149 | ) 150 | 151 | pg <- x$wc$getCurrentWindow()$getEnclosedPage() 152 | # page details are printed only once a page has been loaded 153 | if (!(.jnull() == pg)) { 154 | 155 | cat(sprintf(" Current URL: <%s>\n", pg$getUrl()$toString())) 156 | 157 | if (pg$getTitleText() != "") cat(sprintf(" Page Title: <%s>\n", pg$getTitleText())) 158 | 159 | res <- pg$getWebResponse() 160 | 161 | cat(sprintf(" Status Code: %s\n", res$getStatusCode())) 162 | cat(sprintf(" Content Type: %s\n", res$getContentType())) 163 | cat(sprintf(" Content Length: %s bytes\n", prettyNum(res$getContentLength(), big.mark=","))) 164 | cat(sprintf(" Load Time: %s ms\n", prettyNum(res$getLoadTime(), big.mark=","))) 165 | 166 | } 167 | 168 | invisible(x) 169 | 170 | } 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Project Status: Active – The project has reached a stable, usable 3 | state and is being actively 4 | developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) 5 | [![Signed 6 | by](https://img.shields.io/badge/Keybase-Verified-brightgreen.svg)](https://keybase.io/hrbrmstr) 7 | ![Signed commit 8 | %](https://img.shields.io/badge/Signed_Commits-100%25-lightgrey.svg) 9 | [![Linux build 10 | Status](https://travis-ci.org/hrbrmstr/htmlunit.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmlunit) 11 | [![Coverage 12 | Status](https://codecov.io/gh/hrbrmstr/htmlunit/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/htmlunit) 13 | ![Minimal R 14 | Version](https://img.shields.io/badge/R%3E%3D-3.6.0-blue.svg) 15 | ![License](https://img.shields.io/badge/License-Apache-blue.svg) 16 | 17 | # htmlunit 18 | 19 | Tools to Scrape Dynamic Web Content via the ‘HtmlUnit’ Java Library 
20 | 21 | ## Description 22 | 23 | ‘HtmlUnit’ () is a “‘GUI’-Less 24 | browser for ‘Java’ programs”. It models ‘HTML’ documents and provides an 25 | ‘API’ that allows one to invoke pages, fill out forms, click links and 26 | more just like one does in a “normal” browser. The library has fairly 27 | good and constantly improving ‘JavaScript’ support and is able to work 28 | even with quite complex ‘AJAX’ libraries, simulating ‘Chrome’, ‘Firefox’ 29 | or ‘Internet Explorer’ depending on the configuration used. It is 30 | typically used for testing purposes or to retrieve information from web 31 | sites. Tools are provided to work with this library at a higher level 32 | than provided by the exposed ‘Java’ libraries in the ‘htmlunitjars’ 33 | package. 34 | 35 | ## What’s Inside The Tin 36 | 37 | The following functions are implemented: 38 | 39 | ### DSL 40 | 41 | - `web_client`/`webclient`: Create a new HtmlUnit WebClient 42 | instance

43 | 44 | - `wc_go`: Visit a URL
45 | 46 | - `wc_html_nodes`: Select nodes from web client active page html 47 | content 48 | 49 | - `wc_html_text`: Extract attributes, text and tag name from webclient 50 | page html content

51 | 52 | - `wc_html_attr`: Extract attributes, text and tag name from webclient 53 | page html content 54 | 55 | - `wc_html_name`: Extract attributes, text and tag name from webclient 56 | page html content 57 | 58 | - `wc_headers`: Return response headers of the last web request for 59 | current page 60 | 61 | - `wc_browser_info`: Retrieve information about the browser used to 62 | create the ‘webclient’ 63 | 64 | - `wc_content_length`: Return content length of the last web request 65 | for current page 66 | 67 | - `wc_content_type`: Return content type of web request for current 68 | page

69 | 70 | - `wc_render`: Retrieve current page contents

71 | 72 | - `wc_css`: Enable/Disable CSS support 73 | 74 | - `wc_dnt`: Enable/Disable Do-Not-Track 75 | 76 | - `wc_geo`: Enable/Disable Geolocation 77 | 78 | - `wc_img_dl`: Enable/Disable Image Downloading 79 | 80 | - `wc_load_time`: Return load time of the last web request for current 81 | page 82 | 83 | - `wc_resize`: Resize the virtual browser window 84 | 85 | - `wc_status`: Return status code of web request for current page 86 | 87 | - `wc_timeout`: Change default request timeout 88 | 89 | - `wc_title`: Return page title for current page 90 | 91 | - `wc_url`: Return the URL of the current page 92 | 93 | - `wc_use_insecure_ssl`: Enable/Disable Ignoring SSL Validation Issues 94 | 95 | - `wc_wait`: Block HtmlUnit final rendering until all background 96 | JavaScript tasks have finished executing 97 | 98 | ### Just the Content (pls) 99 | 100 | - `hu_read_html`: Read HTML from a URL with Browser Emulation & in a 101 | JavaScript Context 102 | 103 | ### Content++ 104 | 105 | - `wc_inspect`: Perform a “Developer Tools”-like Network Inspection of 106 | a URL 107 | 108 | ## Installation 109 | 110 | ``` r 111 | install.packages("htmlunit", repos = c("https://cinc.rud.is", "https://cloud.r-project.org/")) 112 | # or 113 | remotes::install_git("https://git.rud.is/hrbrmstr/htmlunit.git") 114 | # or 115 | remotes::install_git("https://git.sr.ht/~hrbrmstr/htmlunit") 116 | # or 117 | remotes::install_gitlab("hrbrmstr/htmlunit") 118 | # or 119 | remotes::install_bitbucket("hrbrmstr/htmlunit") 120 | # or 121 | remotes::install_github("hrbrmstr/htmlunit") 122 | ``` 123 | 124 | NOTE: To use the ‘remotes’ install options you will need to have the 125 | [{remotes} package](https://github.com/r-lib/remotes) installed. 
126 | 127 | ## Usage 128 | 129 | ``` r 130 | library(htmlunit) 131 | library(tidyverse) # for some data ops; not req'd for pkg 132 | 133 | # current verison 134 | packageVersion("htmlunit") 135 | ## [1] '0.5.0' 136 | ``` 137 | 138 | Something `xml2::read_html()` cannot do, read the table from 139 | : 140 | 141 | ![](man/figures/test-url-table.png) 142 | 143 | ``` r 144 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html" 145 | 146 | pg <- xml2::read_html(test_url) 147 | 148 | html_table(pg) 149 | ## list() 150 | ``` 151 | 152 | ☹️ 153 | 154 | But, `hu_read_html()` can\! 155 | 156 | ``` r 157 | pg <- hu_read_html(test_url) 158 | 159 | html_table(pg) 160 | ## [[1]] 161 | ## X1 X2 162 | ## 1 One Two 163 | ## 2 Three Four 164 | ## 3 Five Six 165 | ``` 166 | 167 | All without needing a separate Selenium or Splash server instance. 168 | 169 | ### Content++ 170 | 171 | We can also get a HAR-like content + metadata dump: 172 | 173 | ``` r 174 | xdf <- wc_inspect("https://rstudio.com") 175 | 176 | colnames(xdf) 177 | ## [1] "method" "url" "status_code" "message" "content" "content_length" 178 | ## [7] "content_type" "load_time" "headers" 179 | 180 | select(xdf, method, url, status_code, content_length, load_time) 181 | ## # A tibble: 36 x 5 182 | ## method url status_code content_length load_time 183 | ## 184 | ## 1 GET https://rstudio.com/ 200 14621 495 185 | ## 2 GET https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js 200 3576 221 186 | ## 3 GET https://snap.licdn.com/li.lms-analytics/insight.min.js 200 1576 162 187 | ## 4 GET https://connect.facebook.net/en_US/fbevents.js 200 34269 138 188 | ## 5 GET https://connect.facebook.net/signals/config/151855192184380?v=2.9.23&r=s… 200 134841 66 189 | ## 6 GET https://munchkin.marketo.net/munchkin-beta.js 200 752 230 190 | ## 7 GET https://munchkin.marketo.net/159/munchkin.js 200 4810 27 191 | ## 8 GET https://x.clearbitjs.com/v1/pk_60c5aa2221e3c03eca10fb6876aa6df7/clearbit… 200 86568 483 192 | 
## 9 GET https://cdn.segment.com/analytics.js/v1/gO0uTGfCkO4DQpfkRim9mBsjdKrehtnu… 200 62860 243 193 | ## 10 GET https://static.hotjar.com/c/hotjar-1446157.js?sv=6 200 1708 212 194 | ## # … with 26 more rows 195 | 196 | group_by(xdf, content_type) %>% 197 | summarise( 198 | total_size = sum(content_length), 199 | total_load_time = sum(load_time)/1000 200 | ) 201 | ## # A tibble: 7 x 3 202 | ## content_type total_size total_load_time 203 | ## 204 | ## 1 application/javascript 431338 2.58 205 | ## 2 application/json 4118 1.37 206 | ## 3 application/x-javascript 176248 0.623 207 | ## 4 image/gif 35 0.232 208 | ## 5 text/html 16640 1.36 209 | ## 6 text/javascript 254971 0.996 210 | ## 7 text/plain 28 0.189 211 | ``` 212 | 213 | ### DSL 214 | 215 | ``` r 216 | wc <- web_client(emulate = "chrome") 217 | 218 | wc %>% wc_browser_info() 219 | ## < Netscape / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 / en-US > 220 | 221 | wc <- web_client() 222 | 223 | wc %>% wc_go("https://usa.gov/") 224 | 225 | # if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list() 226 | 227 | wc %>% 228 | wc_html_nodes("a") %>% 229 | sapply(wc_html_text, trim = TRUE) %>% 230 | head(10) 231 | ## [1] "Skip to main content" "" "Español" 232 | ## [4] "1-844-USA-GOV1" "All Topics and Services" "About the U.S." 233 | ## [7] "American Flag" "Branches of the U.S. Government" "Budget of the U.S. Government" 234 | ## [10] "Data and Statistics about the U.S." 235 | 236 | wc %>% 237 | wc_html_nodes(xpath=".//a") %>% 238 | sapply(wc_html_text, trim = TRUE) %>% 239 | head(10) 240 | ## [1] "Skip to main content" "" "Español" 241 | ## [4] "1-844-USA-GOV1" "All Topics and Services" "About the U.S." 242 | ## [7] "American Flag" "Branches of the U.S. Government" "Budget of the U.S. Government" 243 | ## [10] "Data and Statistics about the U.S." 
244 | 245 | wc %>% 246 | wc_html_nodes(xpath=".//a") %>% 247 | sapply(wc_html_attr, "href") %>% 248 | head(10) 249 | ## [1] "#content" "/" "/espanol/" "/phone" 250 | ## [5] "/#tpcs" "#" "/flag" "/branches-of-government" 251 | ## [9] "/budget" "/statistics" 252 | ``` 253 | 254 | Handy function to get rendered plain text for text mining: 255 | 256 | ``` r 257 | wc %>% 258 | wc_render("text") %>% 259 | substr(1, 300) %>% 260 | cat() 261 | ## Official Guide to Government Information and Services | USAGov 262 | ## Skip to main content 263 | ## An official website of the United States government Here's how you know 264 | ## 265 | ## 266 | ## Main Navigation 267 | ## Search 268 | ## Search 269 | ## Search 270 | ## 1-844-USA-GOV1 271 | ## All Topics and Services 272 | ## Benefits, Grants, Loans 273 | ## Government Agencies and Elected Officials 274 | ``` 275 | 276 | ### htmlunit Metrics 277 | 278 | | Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | 279 | | :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: | 280 | | R | 14 | 0.70 | 341 | 0.72 | 188 | 0.70 | 377 | 0.82 | 281 | | Java | 3 | 0.15 | 52 | 0.11 | 23 | 0.09 | 3 | 0.01 | 282 | | Rmd | 1 | 0.05 | 41 | 0.09 | 52 | 0.19 | 75 | 0.16 | 283 | | Maven | 1 | 0.05 | 30 | 0.06 | 0 | 0.00 | 1 | 0.00 | 284 | | make | 1 | 0.05 | 10 | 0.02 | 4 | 0.01 | 4 | 0.01 | 285 | 286 | clock Package Metrics for htmlunit 287 | 288 | ## Code of Conduct 289 | 290 | Please note that this project is released with a Contributor Code of 291 | Conduct. By participating in this project you agree to abide by its 292 | terms. 293 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------