├── man
├── figures
│ └── test-url-table.png
├── pipe.Rd
├── print.webclient.Rd
├── print.browserinfo.Rd
├── wc_title.Rd
├── wc_go.Rd
├── wc_url.Rd
├── wc_status.Rd
├── wc_browser_info.Rd
├── wc_content_type.Rd
├── wc_load_time.Rd
├── wc_headers.Rd
├── wc_content_length.Rd
├── wc_fill_in.Rd
├── wc_click_on.Rd
├── wc_resize.Rd
├── wc_css.Rd
├── wc_geo.Rd
├── wc_html_nodes.Rd
├── wc_dnt.Rd
├── wc_img_dl.Rd
├── wc_wait.Rd
├── wc_inspect.Rd
├── wc_use_insecure_ssl.Rd
├── wc_timeout.Rd
├── web_client.Rd
├── wc_html_text.Rd
├── htmlunit.Rd
├── wc_render.Rd
└── hu_read_html.Rd
├── .gitignore
├── java
└── htmlunit
│ ├── deps
│ ├── dec-0.1.2.jar
│ ├── xalan-2.7.2.jar
│ ├── commons-io-2.6.jar
│ ├── commons-io-2.7.jar
│ ├── httpmime-4.5.8.jar
│ ├── httpmime-4.5.9.jar
│ ├── commons-lang3-3.9.jar
│ ├── commons-net-3.6.jar
│ ├── commons-net-3.7.jar
│ ├── commons-text-1.6.jar
│ ├── commons-text-1.7.jar
│ ├── commons-text-1.8.jar
│ ├── commons-text-1.9.jar
│ ├── htmlunit-2.35.0.jar
│ ├── htmlunit-2.36.0.jar
│ ├── htmlunit-2.38.0.jar
│ ├── htmlunit-2.40.0.jar
│ ├── htmlunit-2.43.0.jar
│ ├── httpclient-4.5.12.jar
│ ├── httpclient-4.5.8.jar
│ ├── httpclient-4.5.9.jar
│ ├── httpcore-4.4.11.jar
│ ├── httpcore-4.4.13.jar
│ ├── httpmime-4.5.12.jar
│ ├── salvation-2.7.1.jar
│ ├── salvation-2.7.2.jar
│ ├── serializer-2.7.2.jar
│ ├── xercesImpl-2.12.0.jar
│ ├── xml-apis-1.4.01.jar
│ ├── commons-codec-1.11.jar
│ ├── commons-lang3-3.10.jar
│ ├── commons-lang3-3.11.jar
│ ├── commons-logging-1.2.jar
│ ├── neko-htmlunit-2.35.0.jar
│ ├── neko-htmlunit-2.36.0.jar
│ ├── neko-htmlunit-2.38.0.jar
│ ├── neko-htmlunit-2.40.0.jar
│ ├── neko-htmlunit-2.43.0.jar
│ ├── htmlunit-core-js-2.35.0.jar
│ ├── htmlunit-core-js-2.36.0.jar
│ ├── htmlunit-core-js-2.38.0.jar
│ ├── htmlunit-core-js-2.40.0.jar
│ ├── htmlunit-core-js-2.43.0.jar
│ ├── htmlunit-cssparser-1.4.0.jar
│ ├── htmlunit-cssparser-1.5.0.jar
│ ├── jetty-http-9.4.16.v20190411.jar
│ ├── jetty-http-9.4.20.v20190813.jar
│ ├── jetty-http-9.4.27.v20200227.jar
│ ├── jetty-http-9.4.28.v20200408.jar
│ ├── jetty-http-9.4.31.v20200723.jar
│ ├── jetty-io-9.4.16.v20190411.jar
│ ├── jetty-io-9.4.20.v20190813.jar
│ ├── jetty-io-9.4.27.v20200227.jar
│ ├── jetty-io-9.4.28.v20200408.jar
│ ├── jetty-io-9.4.31.v20200723.jar
│ ├── jetty-util-9.4.16.v20190411.jar
│ ├── jetty-util-9.4.20.v20190813.jar
│ ├── jetty-util-9.4.27.v20200227.jar
│ ├── jetty-util-9.4.28.v20200408.jar
│ ├── jetty-util-9.4.31.v20200723.jar
│ ├── jetty-xml-9.4.16.v20190411.jar
│ ├── jetty-xml-9.4.20.v20190813.jar
│ ├── jetty-xml-9.4.27.v20200227.jar
│ ├── jetty-xml-9.4.28.v20200408.jar
│ ├── jetty-xml-9.4.31.v20200723.jar
│ ├── jetty-client-9.4.16.v20190411.jar
│ ├── jetty-client-9.4.20.v20190813.jar
│ ├── jetty-client-9.4.27.v20200227.jar
│ ├── jetty-client-9.4.28.v20200408.jar
│ ├── jetty-client-9.4.31.v20200723.jar
│ ├── websocket-api-9.4.16.v20190411.jar
│ ├── websocket-api-9.4.20.v20190813.jar
│ ├── websocket-api-9.4.27.v20200227.jar
│ ├── websocket-api-9.4.28.v20200408.jar
│ ├── websocket-api-9.4.31.v20200723.jar
│ ├── websocket-client-9.4.16.v20190411.jar
│ ├── websocket-client-9.4.20.v20190813.jar
│ ├── websocket-client-9.4.27.v20200227.jar
│ ├── websocket-client-9.4.28.v20200408.jar
│ ├── websocket-client-9.4.31.v20200723.jar
│ ├── websocket-common-9.4.16.v20190411.jar
│ ├── websocket-common-9.4.20.v20190813.jar
│ ├── websocket-common-9.4.27.v20200227.jar
│ ├── websocket-common-9.4.28.v20200408.jar
│ └── websocket-common-9.4.31.v20200723.jar
│ ├── target
│ ├── htmlunit-1.0-SNAPSHOT.jar
│ ├── classes
│ │ └── is
│ │ │ └── rud
│ │ │ └── htmlunit
│ │ │ ├── Zapp.class
│ │ │ ├── Zapp$1.class
│ │ │ ├── RDefaultCssErrorHandler.class
│ │ │ └── RIncorrectnessListener.class
│ ├── maven-archiver
│ │ └── pom.properties
│ └── maven-status
│ │ └── maven-compiler-plugin
│ │ └── compile
│ │ └── default-compile
│ │ ├── createdFiles.lst
│ │ └── inputFiles.lst
│ ├── src
│ └── main
│ │ └── java
│ │ └── is
│ │ └── rud
│ │ └── htmlunit
│ │ ├── RIncorrectnessListener.java
│ │ ├── RDefaultCssErrorHandler.java
│ │ └── Zapp.java
│ ├── pom.xml
│ └── Makefile
├── inst
├── java
│ └── htmlunit-1.0-SNAPSHOT.jar
└── tinytest
│ └── test_htmlunit.R
├── tests
└── tinytest.R
├── R
├── utils-infix-helpers.R
├── utils-pipe.R
├── zzz.R
├── htmlunit-package.R
├── wc-forms.R
├── wc-click.R
├── wc-as.R
├── wc-inspect.R
├── wc-html-nodes.R
├── wc-status.R
├── hu-read-html.R
├── wc-options.R
└── web-client.R
├── .Rbuildignore
├── NEWS.md
├── htmlunit.Rproj
├── NAMESPACE
├── CODE_OF_CONDUCT.md
├── DESCRIPTION
├── README.Rmd
├── README.md
└── LICENSE
/man/figures/test-url-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/man/figures/test-url-table.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Rproj
6 | src/*.o
7 | src/*.so
8 | src/*.dll
9 |
--------------------------------------------------------------------------------
/java/htmlunit/deps/dec-0.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/dec-0.1.2.jar
--------------------------------------------------------------------------------
/inst/java/htmlunit-1.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/inst/java/htmlunit-1.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/xalan-2.7.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xalan-2.7.2.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-io-2.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-io-2.6.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-io-2.7.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-io-2.7.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpmime-4.5.8.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.8.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpmime-4.5.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.9.jar
--------------------------------------------------------------------------------
/tests/tinytest.R:
--------------------------------------------------------------------------------
1 |
2 | if ( requireNamespace("tinytest", quietly=TRUE) ){
3 | tinytest::test_package("htmlunit")
4 | }
5 |
6 |
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-lang3-3.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.9.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-net-3.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-net-3.6.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-net-3.7.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-net-3.7.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-text-1.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.6.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-text-1.7.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.7.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-text-1.8.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.8.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-text-1.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-text-1.9.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-2.35.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.35.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-2.36.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.36.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-2.38.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.38.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-2.40.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.40.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-2.43.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-2.43.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpclient-4.5.12.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.12.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpclient-4.5.8.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.8.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpclient-4.5.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpclient-4.5.9.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpcore-4.4.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpcore-4.4.11.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpcore-4.4.13.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpcore-4.4.13.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/httpmime-4.5.12.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/httpmime-4.5.12.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/salvation-2.7.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/salvation-2.7.1.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/salvation-2.7.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/salvation-2.7.2.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/serializer-2.7.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/serializer-2.7.2.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/xercesImpl-2.12.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xercesImpl-2.12.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/xml-apis-1.4.01.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/xml-apis-1.4.01.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-codec-1.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-codec-1.11.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-lang3-3.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.10.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-lang3-3.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-lang3-3.11.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/commons-logging-1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/commons-logging-1.2.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/neko-htmlunit-2.35.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.35.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/neko-htmlunit-2.36.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.36.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/neko-htmlunit-2.38.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.38.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/neko-htmlunit-2.40.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.40.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/neko-htmlunit-2.43.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/neko-htmlunit-2.43.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-core-js-2.35.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.35.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-core-js-2.36.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.36.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-core-js-2.38.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.38.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-core-js-2.40.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.40.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/htmlunit-cssparser-1.5.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/htmlunit-cssparser-1.5.0.jar
--------------------------------------------------------------------------------
/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-http-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-http-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-io-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-io-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-util-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-util-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-xml-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-xml-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-client-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-client-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-api-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-api-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-client-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-client-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-common-9.4.20.v20190813.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.20.v20190813.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-common-9.4.27.v20200227.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.27.v20200227.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.28.v20200408.jar
--------------------------------------------------------------------------------
/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar
--------------------------------------------------------------------------------
/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class
--------------------------------------------------------------------------------
/java/htmlunit/target/maven-archiver/pom.properties:
--------------------------------------------------------------------------------
1 | #Generated by Maven
2 | #Wed Aug 19 08:51:02 EDT 2020
3 | groupId=is.rud.htmlunit
4 | artifactId=htmlunit
5 | version=1.0-SNAPSHOT
6 |
--------------------------------------------------------------------------------
/java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class
--------------------------------------------------------------------------------
/java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hrbrmstr/htmlunit/HEAD/java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class
--------------------------------------------------------------------------------
/R/utils-infix-helpers.R:
--------------------------------------------------------------------------------
1 | `%l0%` <- function(x, y) if (length(x) == 0) y else x # fallback: y when x has length zero, otherwise x
2 | `%||%` <- function(x, y) if (is.null(x)) y else x # fallback: y when x is NULL, otherwise x
3 | `%@%` <- function(x, name) attr(x, name, exact = TRUE) # attribute lookup on x using exact (non-partial) name matching
4 | `%nin%` <- function(x, table) match(x, table, nomatch = 0) == 0 # "not in": TRUE for each element of x with no match in table
5 |
--------------------------------------------------------------------------------
/java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst:
--------------------------------------------------------------------------------
1 | is/rud/htmlunit/RDefaultCssErrorHandler.class
2 | is/rud/htmlunit/RIncorrectnessListener.class
3 | is/rud/htmlunit/Zapp.class
4 | is/rud/htmlunit/Zapp$1.class
5 |
--------------------------------------------------------------------------------
/R/utils-pipe.R:
--------------------------------------------------------------------------------
1 | #' Pipe operator
2 | #'
3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
4 | #'
5 | #' @name %>%
6 | #' @rdname pipe
7 | #' @keywords internal
8 | #' @export
9 | #' @importFrom magrittr %>%
10 | #' @usage lhs \%>\% rhs
11 | NULL
12 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.travis\.yml$
4 | ^README\.*Rmd$
5 | ^README\.*html$
6 | ^NOTES\.*Rmd$
7 | ^NOTES\.*html$
8 | ^\.codecov\.yml$
9 | ^README_files$
10 | ^java$
11 | ^doc$
12 | ^docs$
13 | ^tmp$
14 | ^notes$
15 | ^\.gitlab-ci\.yml$
16 |
--------------------------------------------------------------------------------
/man/pipe.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils-pipe.R
3 | \name{\%>\%}
4 | \alias{\%>\%}
5 | \title{Pipe operator}
6 | \usage{
7 | lhs \%>\% rhs
8 | }
9 | \description{
10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details.
11 | }
12 | \keyword{internal}
13 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | stop_logging <- function() { # resets the java.util.logging LogManager; invoked from .onLoad
2 | rJava::J("java.util.logging.LogManager")$getLogManager()$reset() # LogManager.getLogManager().reset() via rJava -- presumably to quiet HtmlUnit's log output (confirm)
3 | invisible(NULL) # no meaningful return value
4 | }
5 |
6 | .onLoad <- function(libname, pkgname) { # package load hook: sets up the Java side of the package
7 | rJava::.jpackage(pkgname, jars = "*", lib.loc = libname) # initialise rJava for this package, requesting all of its jars ("*")
8 | rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE)) # NOTE(review): also adds files under "inst/java" relative to the *working directory* -- looks like a development-time convenience; confirm it is wanted for installed use
9 | stop_logging() # reset Java logging configuration once the classpath is in place
10 | }
11 |
12 |
13 |
--------------------------------------------------------------------------------
/java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst:
--------------------------------------------------------------------------------
1 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java
2 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java
3 | /Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java
4 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | 0.5.0
2 | * Updated for 2.43.0 jars
3 | * Added support for Microsoft Edge browser
4 | * Added `timeout`, `css`, and `images` parameters to `wc_inspect()`
5 |
6 | 0.4.0
7 | * Switched to {tinytest}
8 | * Updated for 2.40.0 jars
9 |
10 | 0.3.0
11 | * java 11 compile
12 | * tested against new htmlunit jar release
13 |
14 | 0.2.0
15 | * inspect
16 |
17 | 0.1.0
18 | * Initial release
19 |
--------------------------------------------------------------------------------
/java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java:
--------------------------------------------------------------------------------
1 | package is.rud.htmlunit;
2 |
3 | public class RIncorrectnessListener implements com.gargoylesoftware.htmlunit.IncorrectnessListener,
4 | java.io.Serializable { // IncorrectnessListener implementation whose callback has an empty body
5 |
6 | /**
7 | * {@inheritDoc}
8 | */
9 | @Override
10 | public void notify(final java.lang.String message, final java.lang.Object origin) { // empty body: message and origin are ignored
11 | }
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/man/print.webclient.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/web-client.R
3 | \name{print.webclient}
4 | \alias{print.webclient}
5 | \title{Print method for \code{webclient} objects}
6 | \usage{
7 | \method{print}{webclient}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{\code{webclient} object}
11 |
12 | \item{...}{unused}
13 | }
14 | \value{
15 | \code{x}
16 | }
17 | \description{
18 | Print method for \code{webclient} objects
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
/htmlunit.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | StripTrailingWhitespace: Yes
16 |
17 | BuildType: Package
18 | PackageUseDevtools: Yes
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageBuildArgs: --resave-data
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/man/print.browserinfo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/web-client.R
3 | \name{print.browserinfo}
4 | \alias{print.browserinfo}
5 | \title{Print method for \code{browserinfo} objects}
6 | \usage{
7 | \method{print}{browserinfo}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{\code{browserinfo} object}
11 |
12 | \item{...}{unused}
13 | }
14 | \value{
15 | \code{x}
16 | }
17 | \description{
18 | Print method for \code{browserinfo} objects
19 | }
20 | \keyword{internal}
21 |
--------------------------------------------------------------------------------
/man/wc_title.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_title}
4 | \alias{wc_title}
5 | \title{Return page title for current page}
6 | \usage{
7 | wc_title(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | page title of the current page or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return page title for current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_go.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/web-client.R
3 | \name{wc_go}
4 | \alias{wc_go}
5 | \title{Visit a URL}
6 | \usage{
7 | wc_go(wc_obj, url)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{url}{URL to retrieve}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Visit a URL
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \examples{
25 | w <- web_client()
26 | wc_go(w, "https://httpbin.org/")
27 | }
28 |
--------------------------------------------------------------------------------
/java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java:
--------------------------------------------------------------------------------
1 | package is.rud.htmlunit;
2 |
3 | public class RDefaultCssErrorHandler implements com.gargoylesoftware.css.parser.CSSErrorHandler,
4 | java.io.Serializable { // CSSErrorHandler implementation in which every callback has an empty body
5 | @Override
6 | public void error(final com.gargoylesoftware.css.parser.CSSParseException exception) { // empty body: the exception is ignored
7 | }
8 |
9 | @Override
10 | public void fatalError(final com.gargoylesoftware.css.parser.CSSParseException exception) { // empty body: the exception is ignored (not rethrown)
11 | }
12 |
13 | @Override
14 | public void warning(final com.gargoylesoftware.css.parser.CSSParseException exception) { // empty body: the exception is ignored
15 | }
16 |
17 | }
--------------------------------------------------------------------------------
/man/wc_url.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_url}
4 | \alias{wc_url}
5 | \title{Return the URL of the current page}
6 | \usage{
7 | wc_url(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the URL of the current page or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return the URL of the current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_status.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_status}
4 | \alias{wc_status}
5 | \title{Return status code of web request for current page}
6 | \usage{
7 | wc_status(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the HTTP status code and message of the web request or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return status code of web request for current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_browser_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/web-client.R
3 | \name{wc_browser_info}
4 | \alias{wc_browser_info}
5 | \title{Retrieve information about the browser used to create the \code{webclient}}
6 | \usage{
7 | wc_browser_info(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the browser version
14 | }
15 | \description{
16 | Retrieve information about the browser used to create the \code{webclient}
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_content_type.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_content_type}
4 | \alias{wc_content_type}
5 | \title{Return content type of web request for current page}
6 | \usage{
7 | wc_content_type(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the content type of the web request or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return content type of web request for current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_load_time.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_load_time}
4 | \alias{wc_load_time}
5 | \title{Return load time of the last web request for current page}
6 | \usage{
7 | wc_load_time(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the load time (in ms) of the web request or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return load time of the last web request for current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_headers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_headers}
4 | \alias{wc_headers}
5 | \title{Return response headers of the last web request for current page}
6 | \usage{
7 | wc_headers(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the response headers of the web request as a data frame or \code{NULL} if
14 | no active page
15 | }
16 | \description{
17 | Return response headers of the last web request for current page
18 | }
19 | \note{
20 | This is an information retrieval function that does not return
21 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
22 | }
23 |
--------------------------------------------------------------------------------
/man/wc_content_length.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-status.R
3 | \name{wc_content_length}
4 | \alias{wc_content_length}
5 | \title{Return content length of the last web request for current page}
6 | \usage{
7 | wc_content_length(wc_obj)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 | }
12 | \value{
13 | the content length (in bytes) of the web request or \code{NULL} if no active page
14 | }
15 | \description{
16 | Return content length of the last web request for current page
17 | }
18 | \note{
19 | This is an information retrieval function that does not return
20 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
21 | }
22 |
--------------------------------------------------------------------------------
/man/wc_fill_in.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-forms.R
3 | \name{wc_fill_in}
4 | \alias{wc_fill_in}
5 | \title{Fill in an input box in a form field}
6 | \usage{
7 | wc_fill_in(wc_obj, value, css, xpath)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{value}{the value to fill in}
13 |
14 | \item{css, xpath}{Node to select for filling. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.}
15 | }
16 | \description{
17 | Fill in an input box in a form field
18 | }
19 | \note{
20 | The caller does not have to assign the output of this function to a
21 | variable as the browser state is managed internally by HtmlUnit.
22 | }
23 |
--------------------------------------------------------------------------------
/man/wc_click_on.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-click.R
3 | \name{wc_click_on}
4 | \alias{wc_click_on}
5 | \title{Click on a DOM element in a webclient loaded page}
6 | \usage{
7 | wc_click_on(wc_obj, css, xpath)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{css, xpath}{Node to click on. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.}
13 | }
14 | \description{
15 | Click on a DOM element in a webclient loaded page
16 | }
17 | \note{
18 | The caller does not have to assign the output of this function to a
19 | variable as the browser state is managed internally by HtmlUnit.
20 | }
21 | \examples{
22 | w <- web_client()
23 | wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
24 | wc_click_on(w, "table")
25 | }
26 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(print,browserinfo)
4 | S3method(print,webclient)
5 | export("%>%")
6 | export(hu_read_html)
7 | export(wc_browser_info)
8 | export(wc_click_on)
9 | export(wc_content_length)
10 | export(wc_content_type)
11 | export(wc_css)
12 | export(wc_dnt)
13 | export(wc_fill_in)
14 | export(wc_geo)
15 | export(wc_go)
16 | export(wc_headers)
17 | export(wc_html_attr)
18 | export(wc_html_name)
19 | export(wc_html_nodes)
20 | export(wc_html_text)
21 | export(wc_img_dl)
22 | export(wc_inspect)
23 | export(wc_load_time)
24 | export(wc_render)
25 | export(wc_resize)
26 | export(wc_status)
27 | export(wc_timeout)
28 | export(wc_title)
29 | export(wc_url)
30 | export(wc_use_insecure_ssl)
31 | export(wc_wait)
32 | export(web_client)
33 | export(webclient)
34 | import(htmlunitjars)
35 | import(rJava)
36 | import(rvest)
37 | import(xml2)
38 | importFrom(magrittr,"%>%")
39 |
--------------------------------------------------------------------------------
/man/wc_resize.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_resize}
4 | \alias{wc_resize}
5 | \title{Resize the virtual browser window}
6 | \usage{
7 | wc_resize(wc_obj, h, w)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{h, w}{height and width (pixels)}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Resize the virtual browser window
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \seealso{
25 | Other wc_opts:
26 | \code{\link{wc_css}()},
27 | \code{\link{wc_dnt}()},
28 | \code{\link{wc_geo}()},
29 | \code{\link{wc_img_dl}()},
30 | \code{\link{wc_timeout}()},
31 | \code{\link{wc_use_insecure_ssl}()},
32 | \code{\link{wc_wait}()}
33 | }
34 | \concept{wc_opts}
35 |
--------------------------------------------------------------------------------
/man/wc_css.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_css}
4 | \alias{wc_css}
5 | \title{Enable/Disable CSS support}
6 | \usage{
7 | wc_css(wc_obj, enable)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{enable}{if \code{TRUE} enable CSS support (which is the HtmlUnit default)}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Enable/Disable CSS support
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \seealso{
25 | Other wc_opts:
26 | \code{\link{wc_dnt}()},
27 | \code{\link{wc_geo}()},
28 | \code{\link{wc_img_dl}()},
29 | \code{\link{wc_resize}()},
30 | \code{\link{wc_timeout}()},
31 | \code{\link{wc_use_insecure_ssl}()},
32 | \code{\link{wc_wait}()}
33 | }
34 | \concept{wc_opts}
35 |
--------------------------------------------------------------------------------
/man/wc_geo.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_geo}
4 | \alias{wc_geo}
5 | \title{Enable/Disable Geolocation}
6 | \usage{
7 | wc_geo(wc_obj, enable)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{enable}{if \code{TRUE} enable geolocation (which is the HtmlUnit default)}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Enable/Disable Geolocation
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \seealso{
25 | Other wc_opts:
26 | \code{\link{wc_css}()},
27 | \code{\link{wc_dnt}()},
28 | \code{\link{wc_img_dl}()},
29 | \code{\link{wc_resize}()},
30 | \code{\link{wc_timeout}()},
31 | \code{\link{wc_use_insecure_ssl}()},
32 | \code{\link{wc_wait}()}
33 | }
34 | \concept{wc_opts}
35 |
--------------------------------------------------------------------------------
/man/wc_html_nodes.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-html-nodes.R
3 | \name{wc_html_nodes}
4 | \alias{wc_html_nodes}
5 | \title{Select nodes from web client active page html content}
6 | \usage{
7 | wc_html_nodes(wc_obj, css, xpath)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{css, xpath}{Nodes to select. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.}
13 | }
14 | \description{
15 | Select nodes from web client active page html content
16 | }
17 | \examples{
18 | \dontrun{
19 | wc <- web_client()
20 |
21 | wc \%>\% wc_go("https://usa.gov/")
22 |
23 | wc \%>\%
24 | wc_html_nodes("a") \%>\%
25 | sapply(wc_html_text)
26 |
27 | wc \%>\%
28 | wc_html_nodes(xpath=".//a") \%>\%
29 | sapply(wc_html_text)
30 |
31 | wc \%>\%
32 | wc_html_nodes(xpath=".//a") \%>\%
33 | sapply(wc_html_attr, "href")
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/man/wc_dnt.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_dnt}
4 | \alias{wc_dnt}
5 | \title{Enable/Disable Do-Not-Track}
6 | \usage{
7 | wc_dnt(wc_obj, enable)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{enable}{if \code{TRUE} enable Do-Not-Track support (which is the HtmlUnit default)}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Enable/Disable Do-Not-Track
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \seealso{
25 | Other wc_opts:
26 | \code{\link{wc_css}()},
27 | \code{\link{wc_geo}()},
28 | \code{\link{wc_img_dl}()},
29 | \code{\link{wc_resize}()},
30 | \code{\link{wc_timeout}()},
31 | \code{\link{wc_use_insecure_ssl}()},
32 | \code{\link{wc_wait}()}
33 | }
34 | \concept{wc_opts}
35 |
--------------------------------------------------------------------------------
/man/wc_img_dl.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_img_dl}
4 | \alias{wc_img_dl}
5 | \title{Enable/Disable Image Downloading}
6 | \usage{
7 | wc_img_dl(wc_obj, enable)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{enable}{if \code{TRUE} enable image downloading (the default is not to download images)}
13 | }
14 | \value{
15 | the \code{webclient} object (invisibly)
16 | }
17 | \description{
18 | Enable/Disable Image Downloading
19 | }
20 | \note{
21 | The caller does not have to assign the output of this function to a
22 | variable as the browser state is managed internally by HtmlUnit.
23 | }
24 | \seealso{
25 | Other wc_opts:
26 | \code{\link{wc_css}()},
27 | \code{\link{wc_dnt}()},
28 | \code{\link{wc_geo}()},
29 | \code{\link{wc_resize}()},
30 | \code{\link{wc_timeout}()},
31 | \code{\link{wc_use_insecure_ssl}()},
32 | \code{\link{wc_wait}()}
33 | }
34 | \concept{wc_opts}
35 |
--------------------------------------------------------------------------------
/man/wc_wait.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_wait}
4 | \alias{wc_wait}
5 | \title{Block HtmlUnit final rendering until all background JavaScript tasks have finished executing}
6 | \usage{
7 | wc_wait(wc_obj, js_delay = 2000L)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{js_delay}{number of ms to wait/block}
13 | }
14 | \description{
15 | Block HtmlUnit final rendering until all background JavaScript tasks have finished executing
16 | }
17 | \note{
18 | The caller does not have to assign the output of this function to a
19 | variable as the browser state is managed internally by HtmlUnit.
20 | }
21 | \seealso{
22 | Other wc_opts:
23 | \code{\link{wc_css}()},
24 | \code{\link{wc_dnt}()},
25 | \code{\link{wc_geo}()},
26 | \code{\link{wc_img_dl}()},
27 | \code{\link{wc_resize}()},
28 | \code{\link{wc_timeout}()},
29 | \code{\link{wc_use_insecure_ssl}()}
30 | }
31 | \concept{wc_opts}
32 |
--------------------------------------------------------------------------------
/man/wc_inspect.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-inspect.R
3 | \name{wc_inspect}
4 | \alias{wc_inspect}
5 | \title{Perform a "Developer Tools"-like Network Inspection of a URL}
6 | \usage{
7 | wc_inspect(
8 | url,
9 | js_delay = 5000L,
10 | timeout = 30000L,
11 | css = FALSE,
12 | images = FALSE
13 | )
14 | }
15 | \arguments{
16 | \item{url}{URL to fetch}
17 |
18 | \item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}
19 |
20 | \item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
21 | Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket
22 | connection, the second is for data retrieval. If the time is critical you must allow for twice
23 | the time specified here.}
24 |
25 | \item{css, images}{enable CSS/download images? (default \code{FALSE})}
26 | }
27 | \description{
28 | Retrieves \emph{all} content loaded
29 | }
30 |
--------------------------------------------------------------------------------
/man/wc_use_insecure_ssl.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_use_insecure_ssl}
4 | \alias{wc_use_insecure_ssl}
5 | \title{Enable/Disable Ignoring SSL Validation Issues}
6 | \usage{
7 | wc_use_insecure_ssl(wc_obj, enable)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{enable}{if \code{TRUE} the client will accept connections to any host,
13 | regardless of whether they have valid certificates or not}
14 | }
15 | \value{
16 | the \code{webclient} object (invisibly)
17 | }
18 | \description{
19 | Enable/Disable Ignoring SSL Validation Issues
20 | }
21 | \note{
22 | The caller does not have to assign the output of this function to a
23 | variable as the browser state is managed internally by HtmlUnit.
24 | }
25 | \seealso{
26 | Other wc_opts:
27 | \code{\link{wc_css}()},
28 | \code{\link{wc_dnt}()},
29 | \code{\link{wc_geo}()},
30 | \code{\link{wc_img_dl}()},
31 | \code{\link{wc_resize}()},
32 | \code{\link{wc_timeout}()},
33 | \code{\link{wc_wait}()}
34 | }
35 | \concept{wc_opts}
36 |
--------------------------------------------------------------------------------
/man/wc_timeout.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-options.R
3 | \name{wc_timeout}
4 | \alias{wc_timeout}
5 | \title{Change default request timeout}
6 | \usage{
7 | wc_timeout(wc_obj, timeout)
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{timeout}{timeout (ms); The timeout is used twice. The first is for making
13 | the socket connection, the second is for data retrieval. If the
14 | time is critical you must allow for twice the time specified here.}
15 | }
16 | \value{
17 | the \code{webclient} object (invisibly)
18 | }
19 | \description{
20 | Change default request timeout
21 | }
22 | \note{
23 | The caller does not have to assign the output of this function to a
24 | variable as the browser state is managed internally by HtmlUnit.
25 | }
26 | \seealso{
27 | Other wc_opts:
28 | \code{\link{wc_css}()},
29 | \code{\link{wc_dnt}()},
30 | \code{\link{wc_geo}()},
31 | \code{\link{wc_img_dl}()},
32 | \code{\link{wc_resize}()},
33 | \code{\link{wc_use_insecure_ssl}()},
34 | \code{\link{wc_wait}()}
35 | }
36 | \concept{wc_opts}
37 |
--------------------------------------------------------------------------------
/man/web_client.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/web-client.R
3 | \name{web_client}
4 | \alias{web_client}
5 | \alias{webclient}
6 | \title{Create a new HtmlUnit WebClient instance}
7 | \usage{
8 | web_client(
9 | emulate = c("best", "chrome", "firefox", "ie", "edge"),
10 | proxy_host = NULL,
11 | proxy_port = NULL
12 | )
13 |
14 | webclient(
15 | emulate = c("best", "chrome", "firefox", "ie", "edge"),
16 | proxy_host = NULL,
17 | proxy_port = NULL
18 | )
19 | }
20 | \arguments{
21 | \item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}", "\code{edge}"}
22 |
23 | \item{proxy_host, proxy_port}{the server/port that will act as proxy (default
24 | \code{NULL} = no proxy)}
25 | }
26 | \value{
27 | \code{webclient} object
28 | }
29 | \description{
30 | A new HtmlUnit web client (virtual browser) will be created and a \code{webclient}
31 | object will be returned.
32 | }
33 | \details{
34 | This is part of the \code{htmlunit} DSL interface.
35 | }
36 | \examples{
37 | w <- web_client()
38 | wc_browser_info(w)
39 | }
40 | \concept{dsl}
41 |
--------------------------------------------------------------------------------
/inst/tinytest/test_htmlunit.R:
--------------------------------------------------------------------------------
1 |
2 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
3 |
4 | # -- client creation -----------------------------------------------------------
5 | w <- web_client()
6 |
7 | expect_equal(class(w), "webclient")
8 | expect_equal(class(wc_browser_info(w)), "browserinfo")
9 |
10 | # -- navigation and page metadata ----------------------------------------------
11 | expect_equal(class(wc_go(w, url = test_url)), "webclient")
12 | expect_equal(wc_url(w), test_url)
13 | expect_equal(wc_title(w), "")
14 |
15 | # -- rendering in each supported form -------------------------------------------
16 | expect_inherits(wc_render(w, "parsed"), "xml_document")
17 | expect_inherits(wc_render(w, "html"), "character")
18 | expect_inherits(wc_render(w, "text"), "character")
19 |
20 | # -- interaction -----------------------------------------------------------------
21 | expect_inherits(wc_click_on(w, "table"), "webclient")
22 |
23 | # -- node extraction -------------------------------------------------------------
24 | expect_equal(sapply(wc_html_nodes(w, "title"), wc_html_text), "")
25 | expect_equal(sapply(wc_html_nodes(w, "title"), wc_html_name), "title")
26 |
27 | # -- response headers ------------------------------------------------------------
28 | h <- wc_headers(w)
29 | expect_true(any(h$value == "GitHub.com"))
30 |
31 | # -- one-shot reader -------------------------------------------------------------
32 | expect_inherits(
33 |   hu_read_html(url = test_url, ret = "html_document"),
34 |   "xml_document"
35 | )
36 | expect_inherits(
37 |   hu_read_html(url = test_url, ret = "text"),
38 |   "character"
39 | )
40 |
41 |
42 |
--------------------------------------------------------------------------------
/man/wc_html_text.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-html-nodes.R
3 | \name{wc_html_text}
4 | \alias{wc_html_text}
5 | \alias{wc_html_attr}
6 | \alias{wc_html_name}
7 | \title{Extract attributes, text and tag name from webclient page html content}
8 | \usage{
9 | wc_html_text(dom_node, trim = FALSE)
10 |
11 | wc_html_attr(dom_node, attr)
12 |
13 | wc_html_name(dom_node)
14 | }
15 | \arguments{
16 | \item{dom_node}{a webclient page DOM node (likely produced by \code{\link[=wc_html_nodes]{wc_html_nodes()}})}
17 |
18 | \item{trim}{if \code{TRUE} will trim leading/trailing white space}
19 |
20 | \item{attr}{name of attribute to retrieve}
21 | }
22 | \description{
23 | Extract attributes, text and tag name from webclient page html content
24 | }
25 | \examples{
26 | \dontrun{
27 | wc <- web_client()
28 |
29 | wc \%>\% wc_go("https://usa.gov/")
30 |
31 | wc \%>\%
32 | wc_html_nodes("a") \%>\%
33 | sapply(wc_html_text)
34 |
35 | wc \%>\%
36 | wc_html_nodes(xpath=".//a") \%>\%
37 | sapply(wc_html_text)
38 |
39 | wc \%>\%
40 | wc_html_nodes(xpath=".//a") \%>\%
41 | sapply(wc_html_attr, "href")
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/htmlunit/pom.xml:
--------------------------------------------------------------------------------
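1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3 | <modelVersion>4.0.0</modelVersion>
4 | <groupId>is.rud.htmlunit</groupId>
5 | <artifactId>htmlunit</artifactId>
6 | <packaging>jar</packaging>
7 | <version>1.0-SNAPSHOT</version>
8 | <name>htmlunit</name>
9 | <url>http://maven.apache.org</url>
10 | <build>
11 | <plugins>
12 | <plugin>
13 | <groupId>org.apache.maven.plugins</groupId>
14 | <artifactId>maven-compiler-plugin</artifactId>
15 | <version>3.1</version>
16 | <configuration>
17 | <source>1.7</source>
18 | <target>1.7</target>
19 | </configuration>
20 | </plugin>
21 | </plugins>
22 | </build>
23 |
24 | <dependencies>
25 | <dependency>
26 | <groupId>net.sourceforge.htmlunit</groupId>
27 | <artifactId>htmlunit</artifactId>
28 | <version>2.43.0</version>
29 | </dependency>
30 | </dependencies>
31 | </project>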
32 |
--------------------------------------------------------------------------------
/R/htmlunit-package.R:
--------------------------------------------------------------------------------
1 | #' Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
2 | #'
3 | #' `HtmlUnit` (<http://htmlunit.sourceforge.net/>) is _a "'GUI'-Less
4 | #' browser for 'Java' programs". It models 'HTML' documents and provides an 'API'
5 | #' that allows one to invoke pages, fill out forms, click links and more just like
6 | #' one does in a "normal" browser. The library has fairly good and constantly
7 | #' improving 'JavaScript' support and is able to work even with quite complex 'AJAX'
8 | #' libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on
9 | #' the configuration used. It is typically used for testing purposes or to retrieve
10 | #' information from web sites._
11 | #'
12 | #' Tools are provided to work with this library at a higher level than provided by
13 | #' the exposed 'Java' libraries in the [`htmlunitjars`](https://gitlab.com/hrbrmstr/htmlunitjars)
14 | #' package.
15 | #'
16 | #' - URL: <https://gitlab.com/hrbrmstr/htmlunit>
17 | #' - BugReports: <https://gitlab.com/hrbrmstr/htmlunit/issues>
18 | #'
19 | #' @md
20 | #' @name htmlunit
21 | #' @docType package
22 | #' @author Bob Rudis (bob@@rud.is)
23 | #' @import rvest htmlunitjars rJava xml2
24 | NULL
25 |
--------------------------------------------------------------------------------
/java/htmlunit/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean pkg deps new
2 |
3 | # JDK used for every Maven call below; override with `make JDK_HOME=/path/to/jdk <target>`.
4 | JDK_HOME ?= /Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home
5 |
6 | pkg:
7 | 	JAVA_HOME=$(JDK_HOME) mvn --quiet package
8 | 	cp target/htmlunit-1.0-SNAPSHOT.jar ../../inst/java
9 |
10 | clean:
11 | 	JAVA_HOME=$(JDK_HOME) mvn clean
12 |
13 | deps:
14 | 	JAVA_HOME=$(JDK_HOME) mvn dependency:copy-dependencies -DoutputDirectory=deps
15 |
16 | new:
17 | 	JAVA_HOME=$(JDK_HOME) mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
18 |
19 |
--------------------------------------------------------------------------------
/man/htmlunit.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/htmlunit-package.R
3 | \docType{package}
4 | \name{htmlunit}
5 | \alias{htmlunit}
6 | \title{Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library}
7 | \description{
8 | \code{HtmlUnit} (\url{http://htmlunit.sourceforge.net/}) is \emph{a "'GUI'-Less
9 | browser for 'Java' programs". It models 'HTML' documents and provides an 'API'
10 | that allows one to invoke pages, fill out forms, click links and more just like
11 | one does in a "normal" browser. The library has fairly good and constantly
12 | improving 'JavaScript' support and is able to work even with quite complex 'AJAX'
13 | libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on
14 | the configuration used. It is typically used for testing purposes or to retrieve
15 | information from web sites.}
16 | }
17 | \details{
18 | Tools are provided to work with this library at a higher level than provided by
19 | the exposed 'Java' libraries in the \href{https://gitlab.com/hrbrmstr/htmlunitjars}{\code{htmlunitjars}}
20 | package.
21 | \itemize{
22 | \item URL: \url{https://gitlab.com/hrbrmstr/htmlunit}
23 | \item BugReports: \url{https://gitlab.com/hrbrmstr/htmlunit/issues}
24 | }
25 | }
26 | \author{
27 | Bob Rudis (bob@rud.is)
28 | }
29 |
--------------------------------------------------------------------------------
/R/wc-forms.R:
--------------------------------------------------------------------------------
1 | #' Fill in an input box in a form field
2 | #'
3 | #' The first element matching the selector has its value set to `value`. A
4 | #' warning is issued when the selector matches nothing or more than one element.
5 | #'
6 | #' @note The caller does not have to assign the output of this function to a
7 | #' variable as the browser state is managed internally by HtmlUnit.
8 | #' @param wc_obj a `webclient` object
9 | #' @param value the value to fill in
10 | #' @param css,xpath Node to select for filling. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.
11 | #' @return the `webclient` object, or `NULL` if there is no active page
12 | #' @export
13 | wc_fill_in <- function(wc_obj, value, css, xpath) {
14 |
15 |   pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
16 |   if (.jnull() == pg) return(NULL)
17 |
18 |   if (missing(css) && missing(xpath))
19 |     stop("Please supply one of css or xpath", call. = FALSE)
20 |   if (!missing(css) && !missing(xpath))
21 |     stop("Please supply css or xpath, not both", call. = FALSE)
22 |
23 |   # Gather every match so an empty result is reported instead of failing on `[[1]]`.
24 |   if (!missing(css)) {
25 |     if (!is.character(css) || length(css) != 1) stop("`css` must be a string")
26 |     items <- as.list(pg$querySelectorAll(css))
27 |   } else {
28 |     if (!is.character(xpath) || length(xpath) != 1)
29 |       stop("`xpath` must be a string")
30 |     items <- as.list(pg$getByXPath(xpath))
31 |   }
32 |
33 |   if (length(items) == 0) {
34 |     warning("No item found with that selector.")
35 |   } else {
36 |     if (length(items) > 1) warning("More than one item found with that selector.")
37 |     # This branch used to be empty, so `value` was never written to the page.
38 |     # NOTE(review): setValueAttribute() is HtmlUnit's setter for <input> nodes; confirm other element types are out of scope.
39 |     items[[1]]$setValueAttribute(as.character(value))
40 |   }
41 |
42 |   return(wc_obj)
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Code of Conduct
2 |
3 | As contributors and maintainers of this project, we pledge to respect all people who
4 | contribute through reporting issues, posting feature requests, updating documentation,
5 | submitting pull requests or patches, and other activities.
6 |
7 | We are committed to making participation in this project a harassment-free experience for
8 | everyone, regardless of level of experience, gender, gender identity and expression,
9 | sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
10 |
11 | Examples of unacceptable behavior by participants include the use of sexual language or
12 | imagery, derogatory comments or personal attacks, trolling, public or private harassment,
13 | insults, or other unprofessional conduct.
14 |
15 | Project maintainers have the right and responsibility to remove, edit, or reject comments,
16 | commits, code, wiki edits, issues, and other contributions that are not aligned to this
17 | Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
18 | from the project team.
19 |
20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
21 | opening an issue or contacting one or more of the project maintainers.
22 |
23 | This Code of Conduct is adapted from the Contributor Covenant
24 | (http://contributor-covenant.org), version 1.0.0, available at
25 | http://contributor-covenant.org/version/1/0/0/
26 |
--------------------------------------------------------------------------------
/R/wc-click.R:
--------------------------------------------------------------------------------
1 | #' Click on a DOM element in a webclient loaded page
2 | #'
3 | #' The first element matching the selector is clicked. A warning is issued
4 | #' (and nothing is clicked) when the selector matches nothing; a warning is
5 | #' also issued when it matches more than one element.
6 | #'
7 | #' @note The caller does not have to assign the output of this function to a
8 | #' variable as the browser state is managed internally by HtmlUnit.
9 | #' @param wc_obj a `webclient` object
10 | #' @param css,xpath Node to click on. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.
11 | #' @return the `webclient` object, or `NULL` if there is no active page
12 | #' @export
13 | #' @examples
14 | #' w <- web_client()
15 | #' wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
16 | #' wc_click_on(w, "table")
17 | wc_click_on <- function(wc_obj, css, xpath) {
18 |
19 |   pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
20 |   if (.jnull() == pg) return(NULL)
21 |
22 |   if (missing(css) && missing(xpath))
23 |     stop("Please supply one of css or xpath", call. = FALSE)
24 |   if (!missing(css) && !missing(xpath))
25 |     stop("Please supply css or xpath, not both", call. = FALSE)
26 |
27 |   # Gather every match so an empty result can be reported instead of failing
28 |   # on `[[1]]` (xpath) or on a missing node (css).
29 |   if (!missing(css)) {
30 |     if (!is.character(css) || length(css) != 1) stop("`css` must be a string")
31 |     items <- as.list(pg$querySelectorAll(css))
32 |   } else {
33 |     if (!is.character(xpath) || length(xpath) != 1)
34 |       stop("`xpath` must be a string")
35 |     items <- as.list(pg$getByXPath(xpath))
36 |   }
37 |
38 |   if (length(items) == 0) {
39 |     warning("No item found with that selector.")
40 |   } else {
41 |     if (length(items) > 1) warning("More than one item found with that selector.")
42 |     items[[1]]$click() # as before, the first match is the one acted upon
43 |   }
44 |
45 |   return(wc_obj)
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/man/wc_render.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/wc-as.R
3 | \name{wc_render}
4 | \alias{wc_render}
5 | \title{Retrieve current page contents}
6 | \usage{
7 | wc_render(wc_obj, what = c("parsed", "html", "text"))
8 | }
9 | \arguments{
10 | \item{wc_obj}{a \code{webclient} object}
11 |
12 | \item{what}{what to return (see Details); NOTE that if there is no active
13 | page this function returns \code{NULL}.}
14 | }
15 | \value{
16 | if \code{what} is \code{parsed}, an \code{xml2} \code{html_document}; if \code{html},
17 | the character HTML representation of the page; if \code{text}
18 | the rendered text of the document as viewed by a human.
19 | }
20 | \description{
21 | If there is a page in the active browser context, return the contents of
22 | the page.
23 | }
24 | \details{
25 | The page contents can be returned as one of:
26 | \itemize{
27 | \item Parsed HTML (i.e. an \code{xml2} \code{html_document})
28 | \item A string representation of the HTML document. NOTE: The charset used is the
29 | current page encoding.
30 | \item A textual representation of this page that represents what would be visible
31 | to the user if this page was shown in a web browser. This is useful for,
32 | say, text mining.
33 | }
34 | }
35 | \note{
36 | This is an information retrieval function that does not return
37 | the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
38 | }
39 | \examples{
40 | w <- web_client()
41 | wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
42 | wc_render(w, "parsed")
43 | wc_render(w, "html")
44 | wc_render(w, "text")
45 | }
46 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: htmlunit
2 | Type: Package
3 | Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
4 | Version: 0.5.0
5 | Date: 2020-07-18
6 | Authors@R: c(
7 | person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
8 | comment = c(ORCID = "0000-0001-5670-2640")),
9 | person("Everet", "Rummel", email = "everet.rummel@gmail.com", role = "ctb")
10 | )
11 | Maintainer: Bob Rudis <bob@rud.is>
12 | Description: 'HtmlUnit' (<http://htmlunit.sourceforge.net/>) is a "'GUI'-Less
13 | browser for 'Java' programs". It models 'HTML' documents and provides an 'API'
14 | that allows one to invoke pages, fill out forms, click links and more just like
15 | one does in a "normal" browser. The library has fairly good and constantly
16 | improving 'JavaScript' support and is able to work even with quite complex 'AJAX'
17 | libraries, simulating 'Chrome', 'Firefox' or 'Internet Explorer' depending on
18 | the configuration used. It is typically used for testing purposes or to retrieve
19 | information from web sites. Tools are provided to work with this library
20 | at a higher level than provided by the exposed 'Java' libraries in the
21 | 'htmlunitjars' package.
22 | URL: https://github.com/hrbrmstr/htmlunit
23 | Encoding: UTF-8
24 | License: Apache License 2.0 | file LICENSE
25 | Imports:
26 | magrittr
27 | Suggests:
28 | covr, tinytest
29 | Depends:
30 | R (>= 3.6.0),
31 | rJava,
32 | htmlunitjars (>= 2.43.0),
33 | rvest,
34 | xml2
35 | Roxygen: list(markdown = TRUE)
36 | RoxygenNote: 7.1.1
37 | Remotes: github::hrbrmstr/htmlunitjars
38 |
--------------------------------------------------------------------------------
/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java:
--------------------------------------------------------------------------------
1 | package is.rud.htmlunit;
2 |
3 | import com.gargoylesoftware.htmlunit.*;
4 | import com.gargoylesoftware.htmlunit.util.*;
5 |
6 | import java.util.*;
7 | import java.lang.*;
8 | import java.io.*;
9 |
10 | public class Zapp {
11 |
12 |   private static com.gargoylesoftware.htmlunit.IncorrectnessListener incorrectnessListener_ = new RIncorrectnessListener();
13 |   private static com.gargoylesoftware.css.parser.CSSErrorHandler cssErrorHandler_ = new RDefaultCssErrorHandler();
14 |
15 |   /**
16 |    * Loads {@code url} with a Chrome-emulating WebClient and returns every WebResponse that passed
17 |    * through the client's connection, in the order received. {@code css}/{@code images} toggle CSS
18 |    * and image download, {@code timeout} goes to setTimeout, {@code jsDelay} to waitForBackgroundJavaScript.
19 |    * NOTE(review): the WebClient is never closed; confirm responses stay readable before adding close().
20 |    */
21 |   public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout, Boolean css, Boolean images) throws IOException {
22 |     final WebClient webClient = new WebClient(BrowserVersion.CHROME);
23 |     webClient.setCssErrorHandler(cssErrorHandler_);
24 |     webClient.setIncorrectnessListener(incorrectnessListener_);
25 |
26 |     WebClientOptions wco = webClient.getOptions();
27 |     wco.setThrowExceptionOnScriptError(false);
28 |     wco.setCssEnabled(css);
29 |     wco.setDownloadImages(images);
30 |     wco.setTimeout(timeout);
31 |
32 |     final List<WebResponse> list = new ArrayList<>();
33 |
34 |     // The wrapper is not kept in a variable: this relies on its constructor hooking it into webClient.
35 |     new WebConnectionWrapper(webClient) {
36 |       @Override
37 |       public WebResponse getResponse(final WebRequest request) throws IOException {
38 |         final WebResponse response = super.getResponse(request);
39 |         list.add(response);
40 |         return response;
41 |       }
42 |     };
43 |     webClient.getPage(url);
44 |     webClient.waitForBackgroundJavaScript(jsDelay);
45 |     return list;
46 |   }
47 | }
48 |
49 |
--------------------------------------------------------------------------------
/R/wc-as.R:
--------------------------------------------------------------------------------
1 | #' Retrieve current page contents
2 | #'
3 | #' If there is a page in the active browser context, return the contents of
4 | #' the page.
5 | #'
6 | #' The page contents can be returned as one of:
7 | #'
8 | #' - Parsed HTML (i.e. an `xml2` `html_document`)
9 | #' - A string representation of the HTML document. NOTE: The charset used is the
10 | #' current page encoding.
11 | #' - A textual representation of this page that represents what would be visible
12 | #' to the user if this page was shown in a web browser. This is useful for,
13 | #' say, text mining.
14 | #'
15 | #' @note This is an information retrieval function that does not return
16 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
17 | #' @param wc_obj a `webclient` object
18 | #' @param what what to return (see Details); NOTE that if there is no active
19 | #' page this function returns `NULL`.
20 | #' @return if `what` is `parsed`, an `xml2` `html_document`; if `html`,
21 | #' the character HTML representation of the page; if `text`
22 | #' the rendered text of the document as viewed by a human.
23 | #' @export
24 | #' @examples
25 | #' w <- web_client()
26 | #' wc_go(w, url = "https://hrbrmstr.github.io/htmlunitjars/index.html")
27 | #' wc_render(w, "parsed")
28 | #' wc_render(w, "html")
29 | #' wc_render(w, "text")
30 | wc_render <- function(wc_obj, what = c("parsed", "html", "text")) {
31 |
32 |   what <- match.arg(what, c("parsed", "html", "text"))
33 |
34 |   pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
35 |
36 |   # nothing is loaded in the current window, so there is nothing to render
37 |   if (.jnull() == pg) return(NULL)
38 |
39 |   # The web response body used to be fetched into locals here but was never
40 |   # read; every branch below works from the page object itself.
41 |   switch(
42 |     what,
43 |     parsed = xml2::read_html(pg$asXml()),
44 |     html = pg$asXml(),
45 |     text = pg$asText()
46 |   )
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/R/wc-inspect.R:
--------------------------------------------------------------------------------
1 | #' Perform a "Developer Tools"-like Network Inspection of a URL
2 | #'
3 | #' Retrieves _all_ content loaded
4 | #'
5 | #' @md
6 | #' @param url URL to fetch
7 | #' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
8 | #' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
9 | #' Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
10 | #' connection, the second is for data retrieval. If the time is critical you must allow for twice
11 | #' the time specified here.
12 | #' @param css,images enable CSS/download images? (default `FALSE`)
13 | #' @export
14 | wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L, css = FALSE, images = FALSE) {
15 |
16 |   app <- J("is.rud.htmlunit.Zapp")
17 |
18 |   app$getRequestsFor(
19 |     url,
20 |     .jlong(js_delay),
21 |     as.integer(timeout),
22 |     .jnew("java/lang/Boolean", css),
23 |     .jnew("java/lang/Boolean", images)
24 |   ) -> res
25 |
26 |   res <- as.list(res)
27 |
28 |   lapply(res, function(.x) {
29 |
30 |     wr <- .x$getWebRequest()
31 |     hdrs <- as.list(.x$getResponseHeaders())
32 |
33 |     lapply(hdrs, function(.x) {
34 |       data.frame(
35 |         name = .x$getName() %||% NA_character_,
36 |         value = .x$getValue() %||% NA_character_,
37 |         stringsAsFactors = FALSE
38 |       )
39 |     }) -> hdrs
40 |
41 |     hdrs <- do.call(rbind.data.frame, hdrs)
42 |     class(hdrs) <- c("tbl_df", "tbl", "data.frame")
43 |
44 |     data.frame(
45 |       method = wr$getHttpMethod()$toString() %||% NA_character_,
46 |       url = wr$getUrl()$toString() %||% NA_character_,
47 |       status_code = .x$getStatusCode() %||% NA_integer_,
48 |       message = .x$getStatusMessage() %||% NA_character_,
49 |       # Default a missing body *before* charToRaw(): the old trailing `%||%`
50 |       # ran too late, because charToRaw(NULL) is an error.
51 |       content = I(list(charToRaw(.x$getContentAsString() %||% ""))),
52 |       content_length = as.double(.x$getContentLength() %||% NA_real_),
53 |       content_type = .x$getContentType() %||% NA_character_,
54 |       load_time = as.double(.x$getLoadTime() %||% NA_real_),
55 |       headers = I(list(hdrs)),
56 |       stringsAsFactors = FALSE
57 |     )
58 |   }) -> out
59 |
60 |   out <- do.call(rbind.data.frame, out)
61 |   class(out) <- c("tbl_df", "tbl", "data.frame")
62 |   out
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/R/wc-html-nodes.R:
--------------------------------------------------------------------------------
1 | #' Select nodes from web client active page html content
2 | #'
3 | #' @md
4 | #' @param wc_obj a `webclient` object
5 | #' @param css,xpath Nodes to select. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.
6 | #' @export
7 | #' @examples \dontrun{
8 | #' wc <- web_client()
9 | #'
10 | #' wc %>% wc_go("https://usa.gov/")
11 | #'
12 | #' wc %>%
13 | #' wc_html_nodes("a") %>%
14 | #' sapply(wc_html_text)
15 | #'
16 | #' wc %>%
17 | #' wc_html_nodes(xpath=".//a") %>%
18 | #' sapply(wc_html_text)
19 | #'
20 | #' wc %>%
21 | #' wc_html_nodes(xpath=".//a") %>%
22 | #' sapply(wc_html_attr, "href")
23 | #' }
24 | wc_html_nodes <- function(wc_obj, css, xpath) {
25 |
26 |   pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
27 |
28 |   # no page has been loaded in the current window yet
29 |   if (.jnull() == pg) return(NULL)
30 |
31 |   if (missing(css) && missing(xpath))
32 |     stop("Please supply one of css or xpath", call. = FALSE)
33 |
34 |   if (!missing(css) && !missing(xpath))
35 |     stop("Please supply css or xpath, not both", call. = FALSE)
36 |
37 |   if (!missing(css)) {
38 |     # Reject anything that is not exactly one character string; the old test
39 |     # (`!is.character(css) && length(css) == 1`) let vectors through.
40 |     if (!is.character(css) || length(css) != 1) stop("`css` must be a string")
41 |     out <- pg$querySelectorAll(css)
42 |   } else {
43 |     if (!is.character(xpath) || length(xpath) != 1)
44 |       stop("`xpath` must be a string")
45 |     out <- pg$getByXPath(xpath)
46 |   }
47 |
48 |   # returned as-is: whatever HtmlUnit produced for the selector
49 |   out
50 |
51 | }
52 |
53 | #' Extract attributes, text and tag name from webclient page html content
54 | #'
55 | #' @md
56 | #' @param dom_node a webclient page DOM node (likely produced by [wc_html_nodes()])
57 | #' @param trim if `TRUE` will trim leading/trailing white space
58 | #' @export
59 | #' @examples \dontrun{
60 | #' wc <- web_client()
61 | #'
62 | #' wc %>% wc_go("https://usa.gov/")
63 | #'
64 | #' wc %>%
65 | #' wc_html_nodes("a") %>%
66 | #' sapply(wc_html_text)
67 | #'
68 | #' wc %>%
69 | #' wc_html_nodes(xpath=".//a") %>%
70 | #' sapply(wc_html_text)
71 | #'
72 | #' wc %>%
73 | #' wc_html_nodes(xpath=".//a") %>%
74 | #' sapply(wc_html_attr, "href")
75 | #' }
76 | wc_html_text <- function(dom_node, trim = FALSE) {
77 |   txt <- dom_node$getTextContent()
78 |   if (!trim) return(txt) # untrimmed text requested
79 |   trimws(txt)
80 | }
81 |
82 | #' @rdname wc_html_text
83 | #' @export
84 | #' @param attr name of attribute to retrieve
85 | wc_html_attr <- function(dom_node, attr) {
86 |   return(dom_node$getAttribute(attr)) # value of the named attribute on this node
87 | }
88 |
89 | #' @rdname wc_html_text
90 | #' @export
91 | wc_html_name <- function(dom_node) {
92 |   return(dom_node$getNodeName()) # node (tag) name as reported by the DOM node
93 | }
94 |
--------------------------------------------------------------------------------
/man/hu_read_html.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/hu-read-html.R
3 | \name{hu_read_html}
4 | \alias{hu_read_html}
5 | \title{Read HTML from a URL with Browser Emulation & in a JavaScript Context}
6 | \usage{
7 | hu_read_html(
8 | url,
9 | emulate = c("best", "chrome", "firefox", "ie", "edge"),
10 | ret = c("html_document", "text"),
11 | js_delay = 2000L,
12 | timeout = 30000L,
13 | ignore_ssl_errors = TRUE,
14 | enable_dnt = FALSE,
15 | download_images = FALSE,
16 | options = c("RECOVER", "NOERROR", "NOBLANKS")
17 | )
18 | }
19 | \arguments{
20 | \item{url}{URL to retrieve}
21 |
22 | \item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}", "\code{edge}"}
23 |
24 | \item{ret}{what to return; if \code{html_document} (the default) then the HTML created
25 | by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}}
26 | and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes
27 | further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what
28 | \code{HtmlUnit} generated. If you want the HTML code (text) without any further
29 | processing then use \code{text} as the value.}
30 |
31 | \item{js_delay}{time (ms) to allow loaded JavaScript to execute; default is 2 seconds (2000 ms)}
32 |
33 | \item{timeout}{overall timeout (ms); \code{0} == infinite wait (not recommended); note: the
34 | timeout is used twice: first in making the socket connection,
35 | second for data retrieval. If the time is critical you must
36 | allow for twice the time specified here. Default 30s (30000 ms)}
37 |
38 | \item{ignore_ssl_errors}{Should SSL/TLS errors be ignored. The default (\code{TRUE}) is
39 | a current hack due to how \code{HtmlUnit} seems to handle virtual hosted sites
40 | with multiple vhosts and multiple certificates. You can try it with \code{FALSE}
41 | initially and revert back to \code{TRUE} if you encounter issues.}
42 |
43 | \item{enable_dnt}{Enable the "Do Not Track" header. Default: \code{FALSE}.}
44 |
45 | \item{download_images}{Download images as the page is loaded? Since this
46 | function is a high-level wrapper designed to do a read of HTML,
47 | it is recommended that you leave this the default \code{FALSE} to save
48 | time/bandwidth.}
49 |
50 | \item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
51 | }
52 | \value{
53 | an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else
54 | the HTML document text generated by \code{HtmlUnit}.
55 | }
56 | \description{
57 | Use a JavaScript-enabled browser context to read and render HTML from a URL.
58 | }
59 | \details{
60 | For the code in the examples, this is the site that is being scraped:
61 |
62 | \if{html}{
63 | \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"}
64 | }
65 |
66 | \if{latex}{
67 | \figure{test-url-table.png}{options: width=10cm}
68 | }
69 |
70 | Note that it has a table of values but it is rendered via JavaScript.
71 | }
72 | \examples{
73 | \dontrun{
74 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
75 | hu_read_html(test_url)
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/R/wc-status.R:
--------------------------------------------------------------------------------
1 | #' Return status code of web request for current page
2 | #'
3 | #' @note This is an information retrieval function that does not return
4 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
5 | #' @param wc_obj a `webclient` object
6 | #' @return the HTTP status code and message of the web request or `NULL` if no active page
7 | #' @export
8 | wc_status <- function(wc_obj) {
9 |
10 |   page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
11 |
12 |   # bail out when the current window holds no page
13 |   if (.jnull() == page) return(NULL)
14 |
15 |   resp <- page$getWebResponse()
16 |
17 |   code <- resp$getStatusCode()
18 |   msg <- resp$getStatusMessage()
19 |   list(status_code = code, message = msg)
20 |
21 | }
22 |
23 | #' Return content type of web request for current page
24 | #'
25 | #' @note This is an information retrieval function that does not return
26 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
27 | #' @param wc_obj a `webclient` object
28 | #' @return the content type of the web request or `NULL` if no active page
29 | #' @export
30 | wc_content_type <- function(wc_obj) {
31 |
32 |   page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
33 |
34 |   # no active page: nothing to report
35 |   if (.jnull() == page) return(NULL)
36 |
37 |   # content type as reported by the page's web response
38 |   page$getWebResponse()$getContentType()
39 |
40 | }
41 |
42 | #' Return content length of the last web request for current page
43 | #'
44 | #' @note This is an information retrieval function that does not return
45 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
46 | #' @param wc_obj a `webclient` object
47 | #' @return the content length (in bytes) of the web request or `NULL` if no active page
48 | #' @export
49 | wc_content_length <- function(wc_obj) {
50 |
51 |   page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
52 |
53 |   # no active page: nothing to report
54 |   if (.jnull() == page) return(NULL)
55 |
56 |   # content length as reported by the page's web response
57 |   page$getWebResponse()$getContentLength()
58 |
59 | }
60 |
61 | #' Return load time of the last web request for current page
62 | #'
63 | #' @note This is an information retrieval function that does not return
64 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
65 | #' @param wc_obj a `webclient` object
66 | #' @return the load time (in ms) of the web request or `NULL` if no active page
67 | #' @export
68 | wc_load_time <- function(wc_obj) {
69 |   # Page enclosed by the current window; Java null when none is active.
70 |   current_page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
71 |   no_page <- .jnull() == current_page
72 |
73 |   if (no_page) return(NULL)
74 |
75 |   web_response <- current_page$getWebResponse()
76 |   elapsed <- web_response$getLoadTime()
77 |   elapsed
78 | }
79 |
80 | #' Return the URL of the current page
81 | #'
82 | #' @note This is an information retrieval function that does not return
83 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
84 | #' @param wc_obj a `webclient` object
85 | #' @return the URL (as a string) of the current page or `NULL` if no active page
86 | #' @export
87 | wc_url <- function(wc_obj) {
88 |   # Fetch the page held by the active window.
89 |   current_page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
90 |
91 |   if (.jnull() == current_page) return(NULL)
92 |
93 |   page_url <- current_page$getUrl()
94 |   page_url$toString()
95 | }
96 |
97 | #' Return page title for current page
98 | #'
99 | #' @note This is an information retrieval function that does not return
100 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
101 | #' @param wc_obj a `webclient` object
102 | #' @return page title of the current page or `NULL` if no active page
103 | #' @export
104 | wc_title <- function(wc_obj) {
105 |
106 |   current_page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
107 |
108 |   # Nothing to report when the window has no enclosed page.
109 |   if (.jnull() == current_page) return(NULL)
110 |
111 |   current_page$getTitleText()
112 | }
113 |
114 | #' Return response headers of the last web request for current page
115 | #'
116 | #' @note This is an information retrieval function that does not return
117 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
118 | #' @param wc_obj a `webclient` object
119 | #' @return the response headers of the web request as a data frame or `NULL` if
120 | #' no active page
121 | #' @export
122 | wc_headers <- function(wc_obj) {
123 |
124 |   current_page <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()
125 |
126 |   if (.jnull() == current_page) return(NULL)
127 |
128 |   # Java collection of header objects from the page's web response.
129 |   java_headers <- current_page$getWebResponse()$getResponseHeaders()
130 |
131 |   # One list(name, value) per header.
132 |   header_rows <- lapply(
133 |     as.list(java_headers),
134 |     function(hdr) list(name = hdr$getName(), value = hdr$getValue())
135 |   )
136 |
137 |   # Row-bind the per-header lists, keeping strings as character columns.
138 |   rbind_args <- c(header_rows, stringsAsFactors = FALSE)
139 |   headers_df <- do.call(rbind.data.frame, rbind_args)
140 |
141 |   # Tag the result with the tibble class vector.
142 |   class(headers_df) <- c("tbl_df", "tbl", "data.frame")
143 |
144 |   headers_df
145 | }
146 |
147 |
--------------------------------------------------------------------------------
/R/hu-read-html.R:
--------------------------------------------------------------------------------
1 | #' Read HTML from a URL with Browser Emulation & in a JavaScript Context
2 | #'
3 | #' Use a JavaScript-enabled browser context to read and render HTML from a URL.
4 | #'
5 | #' For the code in the examples, this is the site that is being scraped:
6 | #'
7 | #' \if{html}{
8 | #' \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"}
9 | #' }
10 | #'
11 | #' \if{latex}{
12 | #' \figure{test-url-table.png}{options: width=10cm}
13 | #' }
14 | #'
15 | #' Note that it has a table of values but it is rendered via JavaScript.
16 | #'
17 | #' @param url URL to retrieve
18 | #' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`", "`edge`"
19 | #' @param ret what to return; if `html_document` (the default) then the HTML created
20 | #' by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()]
21 | #' and an `xml2` `html_document`/`xml_document` is returned. Note that this causes
22 | #' further HTML processing by `xml2`/`libxml2` so is not _exactly_ what
23 | #' `HtmlUnit` generated. If you want the HTML code (text) without any further
24 | #' processing then use `text` as the value.
25 | #' @param js_delay time (ms) to allow loaded JavaScript to execute; default is 2 seconds (2000 ms)
26 | #' @param timeout overall timeout (ms); `0` == infinite wait (not recommended); note: the
27 | #' timeout is used twice: first in making the socket connection,
28 | #' second for data retrieval. If the time is critical you must
29 | #' allow for twice the time specified here. Default 30s (30000 ms)
30 | #' @param ignore_ssl_errors Should SSL/TLS errors be ignored. The default (`TRUE`) is
31 | #' a current hack due to how `HtmlUnit` seems to handle virtual hosted sites
32 | #' with multiple vhosts and multiple certificates. You can try it with `FALSE`
33 | #' initially and revert back to `TRUE` if you encounter issues.
34 | #' @param enable_dnt Enable the "Do Not Track" header. Default: `FALSE`.
35 | #' @param download_images Download images as the page is loaded? Since this
36 | #' function is a high-level wrapper designed to do a read of HTML,
37 | #' it is recommended that you leave this the default `FALSE` to save
38 | #' time/bandwidth.
39 | #' @param options options to pass to [xml2::read_html()] if `ret` == `html_document`.
40 | #' @return an `xml2` `html_document`/`xml_document` if `ret` == `html_document` else
41 | #' the HTML document text generated by `HtmlUnit`.
42 | #' @export
43 | #' @examples \dontrun{
44 | #' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
45 | #' hu_read_html(test_url)
46 | #' }
47 | hu_read_html <- function(url,
48 |                          emulate = c("best", "chrome", "firefox", "ie", "edge"),
49 |                          ret = c("html_document", "text"),
50 |                          js_delay = 2000L,
51 |                          timeout = 30000L,
52 |                          ignore_ssl_errors = TRUE,
53 |                          enable_dnt = FALSE,
54 |                          download_images = FALSE,
55 |                          options = c("RECOVER", "NOERROR", "NOBLANKS")) {
56 |
57 |   emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
58 |   ret <- match.arg(ret, c("html_document", "text"))
59 |
60 |   available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")
61 |
62 |   # Map the requested emulation name to the HtmlUnit BrowserVersion constant.
63 |   switch(
64 |     emulate,
65 |     best = available_browsers$BEST_SUPPORTED,
66 |     chrome = available_browsers$CHROME,
67 |     firefox = available_browsers$FIREFOX,
68 |     edge = available_browsers$EDGE,
69 |     ie = available_browsers$INTERNET_EXPLORER
70 |   ) -> use_browser
71 |
72 |   wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)
73 |   # The client is private to this call, so close it when the function exits.
74 |   on.exit(wc$close(), add = TRUE)
75 |
76 |   # Install the custom CSS-error and incorrectness handler classes.
77 |   wc$setCssErrorHandler(.jnew("is.rud.htmlunit.RDefaultCssErrorHandler"))
78 |   wc$setIncorrectnessListener(.jnew("is.rud.htmlunit.RIncorrectnessListener"))
79 |
80 |   wc_opts <- wc$getOptions()
81 |   wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
82 |   wc_opts$setThrowExceptionOnScriptError(FALSE)
83 |   wc_opts$setTimeout(as.integer(timeout))
84 |
85 |   if (ignore_ssl_errors) wc_opts$setUseInsecureSSL(TRUE)
86 |   if (enable_dnt) wc_opts$setDoNotTrackEnabled(TRUE)
87 |   if (download_images) wc_opts$setDownloadImages(TRUE)
88 |
89 |   pg <- wc$getPage(url)
90 |
91 |   # Wait only AFTER the page is loaded: before getPage() there are no
92 |   # background JavaScript jobs to wait for, so `js_delay` would have no effect.
93 |   wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))
94 |
95 |   # The page is serialized here, before the on.exit() handler closes the client.
96 |   if (ret == "html_document") return(xml2::read_html(pg$asXml(), options = options))
97 |
98 |   return(pg$asText())
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/R/wc-options.R:
--------------------------------------------------------------------------------
1 | #' Block HtmlUnit final rendering until all background JavaScript tasks have finished executing
2 | #'
3 | #' @note The caller does not have to assign the output of this function to a
4 | #' variable as the browser state is managed internally by HtmlUnit.
5 | #' @param wc_obj a `webclient` object
6 | #' @param js_delay number of ms to wait/block
7 | #' @family wc_opts
8 | #' @export
9 | wc_wait <- function(wc_obj, js_delay = 2000L) {
10 |   # The delay is passed as a Java long; the call's return value is not used.
11 |   delay_ms <- .jlong(as.integer(js_delay))
12 |   wc_obj$wc$waitForBackgroundJavaScriptStartingBefore(delay_ms)
13 |
14 |   invisible(wc_obj)
15 | }
16 |
17 | #' Enable/Disable CSS support
18 | #'
19 | #' @note The caller does not have to assign the output of this function to a
20 | #' variable as the browser state is managed internally by HtmlUnit.
21 | #' @param wc_obj a `webclient` object
22 | #' @param enable if `TRUE` enable CSS support (which is the HtmlUnit default)
23 | #' @return the `webclient` object (invisibly)
24 | #' @family wc_opts
25 | #' @export
26 | wc_css <- function(wc_obj, enable) {
27 |   # Forward the flag to the CSS setter on the options stored in `wc_obj`.
28 |   client_options <- wc_obj$wc_opts
29 |   client_options$setCssEnabled(enable)
30 |
31 |   invisible(wc_obj)
32 | }
33 |
34 | #' Enable/Disable Do-Not-Track
35 | #'
36 | #' @note The caller does not have to assign the output of this function to a
37 | #' variable as the browser state is managed internally by HtmlUnit.
38 | #' @param wc_obj a `webclient` object
39 | #' @param enable if `TRUE` enable Do-Not-Track support (`web_client()` disables it by default)
40 | #' @return the `webclient` object (invisibly)
41 | #' @family wc_opts
42 | #' @export
43 | wc_dnt <- function(wc_obj, enable) {
44 |   # Forward the flag to the Do-Not-Track setter on the stored options.
45 |   client_options <- wc_obj$wc_opts
46 |   client_options$setDoNotTrackEnabled(enable)
47 |
48 |   invisible(wc_obj)
49 | }
50 |
51 | #' Enable/Disable Image Downloading
52 | #'
53 | #' @note The caller does not have to assign the output of this function to a
54 | #' variable as the browser state is managed internally by HtmlUnit.
55 | #' @param wc_obj a `webclient` object
56 | #' @param enable if `TRUE` enable image downloading (the default is not to download images)
57 | #' @return the `webclient` object (invisibly)
58 | #' @family wc_opts
59 | #' @export
60 | wc_img_dl <- function(wc_obj, enable) {
61 |   # Forward the flag to the image-download setter on the stored options.
62 |   client_options <- wc_obj$wc_opts
63 |   client_options$setDownloadImages(enable)
64 |
65 |   invisible(wc_obj)
66 | }
67 |
68 | #' Enable/Disable Geolocation
69 | #'
70 | #' @note The caller does not have to assign the output of this function to a
71 | #' variable as the browser state is managed internally by HtmlUnit.
72 | #' @param wc_obj a `webclient` object
73 | #' @param enable if `TRUE` enable geolocation (which is the HtmlUnit default)
74 | #' @return the `webclient` object (invisibly)
75 | #' @family wc_opts
76 | #' @export
77 | wc_geo <- function(wc_obj, enable) {
78 |   # Forward the flag to the geolocation setter on the stored options.
79 |   client_options <- wc_obj$wc_opts
80 |   client_options$setGeolocationEnabled(enable)
81 |
82 |   invisible(wc_obj)
83 | }
84 |
85 | #' Change default request timeout
86 | #'
87 | #' @note The caller does not have to assign the output of this function to a
88 | #' variable as the browser state is managed internally by HtmlUnit.
89 | #' @param wc_obj a `webclient` object
90 | #' @param timeout timeout (ms); The timeout is used twice. The first is for making
91 | #' the socket connection, the second is for data retrieval. If the
92 | #' time is critical you must allow for twice the time specified here.
93 | #' @return the `webclient` object (invisibly)
94 | #' @family wc_opts
95 | #' @export
96 | wc_timeout <- function(wc_obj, timeout) {
97 |   # setTimeout() takes a Java int; an R double such as `30000` would not
98 |   # match that signature, so coerce it (hu_read_html() does the same).
99 |   wc_obj$wc_opts$setTimeout(as.integer(timeout))
100 |
101 |   invisible(wc_obj)
102 | }
103 |
104 | #' Resize the virtual browser window
105 | #'
106 | #' @note The caller does not have to assign the output of this function to a
107 | #' variable as the browser state is managed internally by HtmlUnit.
108 | #' @param wc_obj a `webclient` object
109 | #' @param h,w height and width (pixels)
110 | #' @return the `webclient` object (invisibly)
111 | #' @family wc_opts
112 | #' @export
113 | wc_resize <- function(wc_obj, h, w) {
114 |   # The screen-size setters take Java ints, so coerce R doubles
115 |   # (e.g. `768`) before the calls.
116 |   wc_obj$wc_opts$setScreenHeight(as.integer(h))
117 |   wc_obj$wc_opts$setScreenWidth(as.integer(w))
118 |
119 |   invisible(wc_obj)
120 | }
121 |
122 | #' Enable/Disable Ignoring SSL Validation Issues
123 | #'
124 | #' @note The caller does not have to assign the output of this function to a
125 | #' variable as the browser state is managed internally by HtmlUnit.
126 | #' @param wc_obj a `webclient` object
127 | #' @param enable if `TRUE` the client will accept connections to any host,
128 | #' regardless of whether they have valid certificates or not
129 | #' @return the `webclient` object (invisibly)
130 | #' @family wc_opts
131 | #' @export
132 | wc_use_insecure_ssl <- function(wc_obj, enable) {
133 |   # Forward the flag to the insecure-SSL setter on the stored options.
134 |   client_options <- wc_obj$wc_opts
135 |   client_options$setUseInsecureSSL(enable)
136 |
137 |   invisible(wc_obj)
138 | }
139 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | rmarkdown::github_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 | ```{r pkg-knitr-opts, include=FALSE}
8 | hrbrpkghelpr::global_opts()
9 | ```
10 |
11 | ```{r badges, results='asis', echo=FALSE, cache=FALSE}
12 | hrbrpkghelpr::stinking_badges()
13 | ```
14 |
15 | ```{r description, results='asis', echo=FALSE, cache=FALSE}
16 | hrbrpkghelpr::yank_title_and_description()
17 | ```
18 |
19 | ## What's Inside The Tin
20 |
21 | The following functions are implemented:
22 |
23 | ### DSL
24 |
25 | - `web_client`/`webclient`: Create a new HtmlUnit WebClient instance
26 |
27 | - `wc_go`: Visit a URL
28 |
29 | - `wc_html_nodes`: Select nodes from web client active page html content
30 | - `wc_html_text`: Extract attributes, text and tag name from webclient page html content
31 | - `wc_html_attr`: Extract attributes, text and tag name from webclient page html content
32 | - `wc_html_name`: Extract attributes, text and tag name from webclient page html content
33 |
34 | - `wc_headers`: Return response headers of the last web request for current page
35 | - `wc_browser_info`: Retrieve information about the browser used to create the 'webclient'
36 | - `wc_content_length`: Return content length of the last web request for current page
37 | - `wc_content_type`: Return content type of web request for current page
38 |
39 | - `wc_render`: Retrieve current page contents
40 |
41 | - `wc_css`: Enable/Disable CSS support
42 | - `wc_dnt`: Enable/Disable Do-Not-Track
43 | - `wc_geo`: Enable/Disable Geolocation
44 | - `wc_img_dl`: Enable/Disable Image Downloading
45 | - `wc_load_time`: Return load time of the last web request for current page
46 | - `wc_resize`: Resize the virtual browser window
47 | - `wc_status`: Return status code of web request for current page
48 | - `wc_timeout`: Change default request timeout
49 | - `wc_title`: Return page title for current page
50 | - `wc_url`: Return the URL of the current page
51 | - `wc_use_insecure_ssl`: Enable/Disable Ignoring SSL Validation Issues
52 | - `wc_wait`: Block HtmlUnit final rendering until all background JavaScript tasks have finished executing
53 |
54 | ### Just the Content (pls)
55 |
56 | - `hu_read_html`: Read HTML from a URL with Browser Emulation & in a JavaScript Context
57 |
58 | ### Content++
59 |
60 | - `wc_inspect`: Perform a "Developer Tools"-like Network Inspection of a URL
61 |
62 | ## Installation
63 |
64 | ```{r install-ex, results='asis', echo=FALSE, cache=FALSE}
65 | hrbrpkghelpr::install_block()
66 | ```
67 |
68 | ## Usage
69 |
70 | ```{r cache=FALSE}
71 | library(htmlunit)
72 | library(tidyverse) # for some data ops; not req'd for pkg
73 |
74 | # current version
75 | packageVersion("htmlunit")
76 |
77 | ```
78 |
79 | Something `xml2::read_html()` cannot do: read the table from <https://hrbrmstr.github.io/htmlunitjars/index.html>:
80 |
81 | 
82 |
83 | ```{r ex1}
84 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
85 |
86 | pg <- xml2::read_html(test_url)
87 |
88 | html_table(pg)
89 | ```
90 |
91 | ☹️
92 |
93 | But, `hu_read_html()` can!
94 |
95 | ```{r ex2}
96 | pg <- hu_read_html(test_url)
97 |
98 | html_table(pg)
99 | ```
100 |
101 | All without needing a separate Selenium or Splash server instance.
102 |
103 | ### Content++
104 |
105 | We can also get a HAR-like content + metadata dump:
106 |
107 | ```{r ex3}
108 | xdf <- wc_inspect("https://rstudio.com")
109 |
110 | colnames(xdf)
111 |
112 | select(xdf, method, url, status_code, content_length, load_time)
113 |
114 | group_by(xdf, content_type) %>%
115 | summarise(
116 | total_size = sum(content_length),
117 | total_load_time = sum(load_time)/1000
118 | )
119 | ```
120 |
121 | ### DSL
122 |
123 | ```{r ex4}
124 | wc <- web_client(emulate = "chrome")
125 |
126 | wc %>% wc_browser_info()
127 |
128 | wc <- web_client()
129 |
130 | wc %>% wc_go("https://usa.gov/")
131 |
132 | # if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list()
133 |
134 | wc %>%
135 | wc_html_nodes("a") %>%
136 | sapply(wc_html_text, trim = TRUE) %>%
137 | head(10)
138 |
139 | wc %>%
140 | wc_html_nodes(xpath=".//a") %>%
141 | sapply(wc_html_text, trim = TRUE) %>%
142 | head(10)
143 |
144 | wc %>%
145 | wc_html_nodes(xpath=".//a") %>%
146 | sapply(wc_html_attr, "href") %>%
147 | head(10)
148 | ```
149 |
150 | Handy function to get rendered plain text for text mining:
151 |
152 | ```{r ex5}
153 | wc %>%
154 | wc_render("text") %>%
155 | substr(1, 300) %>%
156 | cat()
157 | ```
158 |
159 | ### htmlunit Metrics
160 |
161 | ```{r echo=FALSE}
162 | cloc::cloc_pkg_md()
163 | ```
164 |
165 | ## Code of Conduct
166 |
167 | Please note that this project is released with a Contributor Code of Conduct.
168 | By participating in this project you agree to abide by its terms.
169 |
--------------------------------------------------------------------------------
/R/web-client.R:
--------------------------------------------------------------------------------
1 | #' Create a new HtmlUnit WebClient instance
2 | #'
3 | #' A new HtmlUnit web client (virtual browser) will be created and a `webclient`
4 | #' object will be returned.
5 | #'
6 | #' This is part of the `htmlunit` DSL interface.
7 | #'
8 | #' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`", "`edge`"
9 | #' @param proxy_host,proxy_port the server/port that will act as proxy (default
10 | #' `NULL` = no proxy)
11 | #' @return `webclient` object
12 | #' @family dsl
13 | #' @export
14 | #' @examples
15 | #' w <- web_client()
16 | #' wc_browser_info(w)
17 | web_client <- function(emulate = c("best", "chrome", "firefox", "ie", "edge"),
18 |                        proxy_host = NULL, proxy_port = NULL) {
19 |
20 |   emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
21 |   available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")
22 |   use_browser <- switch(
23 |     emulate,
24 |     best = available_browsers$BEST_SUPPORTED,
25 |     chrome = available_browsers$CHROME,
26 |     firefox = available_browsers$FIREFOX,
27 |     edge = available_browsers$EDGE,
28 |     ie = available_browsers$INTERNET_EXPLORER
29 |   )
30 |   # A proxy needs both pieces; half a configuration is reported, not ignored.
31 |   if (xor(is.null(proxy_host), is.null(proxy_port))) {
32 |     stop("`proxy_host` and `proxy_port` must be supplied together", call. = FALSE)
33 |   }
34 |   client_class <- J("com.gargoylesoftware.htmlunit.WebClient")
35 |   wc <- if (is.null(proxy_host)) {
36 |     new(client_class, use_browser)
37 |   } else {
38 |     new(client_class, use_browser, as.character(proxy_host), as.integer(proxy_port))
39 |   }
40 |   # Package defaults, applied to the client's options object.
41 |   wc_opts <- wc$getOptions()
42 |   wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
43 |   wc_opts$setThrowExceptionOnScriptError(FALSE)
44 |   wc_opts$setDownloadImages(FALSE)
45 |   wc_opts$setJavaScriptEnabled(TRUE)
46 |   wc_opts$setCssEnabled(TRUE)
47 |   wc_opts$setDoNotTrackEnabled(FALSE)
48 |   wc_opts$setGeolocationEnabled(TRUE)
49 |   wc_opts$setPopupBlockerEnabled(FALSE)
50 |   wc_opts$setPrintContentOnFailingStatusCode(TRUE)
51 |   wc_opts$setRedirectEnabled(TRUE)
52 |   wc_obj <- structure(list(wc = wc, wc_opts = wc_opts), class = "webclient")
53 |   invisible(wc_obj)
54 | }
55 |
56 | #' @rdname web_client
57 | #' @export
58 | webclient <- web_client # alias: `webclient` is bound to the same function as `web_client`
59 |
60 | #' Visit a URL
61 | #'
62 | #' @note The caller does not have to assign the output of this function to a
63 | #' variable as the browser state is managed internally by HtmlUnit.
64 | #' @param wc_obj a `webclient` object
65 | #' @param url URL to retrieve
66 | #' @return the `webclient` object (invisibly)
67 | #' @export
68 | #' @examples
69 | #' w <- web_client()
70 | #' wc_go(w, "https://httpbin.org/")
71 | wc_go <- function(wc_obj, url) {
72 |   # Load `url` in the client; the returned page object is not kept here.
73 |   client <- wc_obj$wc
74 |   client$getPage(url)
75 |
76 |   invisible(wc_obj)
77 | }
78 |
79 | #' Retrieve information about the browser used to create the `webclient`
80 | #'
81 | #' @note This is an information retrieval function that does not return
82 | #' the `wc_obj` so must be the last function call in a `webclient` pipe.
83 | #' @param wc_obj a `webclient` object
84 | #' @return the browser version
85 | #' @export
86 | wc_browser_info <- function(wc_obj) {
87 |
88 |   browser_version <- wc_obj$wc$getBrowserVersion()
89 |
90 |   # Collect the three descriptive fields and class them as `browserinfo`.
91 |   structure(
92 |     list(
93 |       name = browser_version$getApplicationName(),
94 |       version = browser_version$getApplicationVersion(),
95 |       language = browser_version$getBrowserLanguage()
96 |     ),
97 |     class = "browserinfo"
98 |   )
99 |
100 | }
101 |
102 | #' Print method for `browserinfo` objects
103 | #' @keywords internal
104 | #' @param x `browserinfo` object
105 | #' @param ... unused
106 | #' @return `x`
107 | #' @export
108 | print.browserinfo <- function(x, ...) {
109 |
110 |   # One-line summary: name / version / language inside angle brackets.
111 |   summary_line <- sprintf("< %s / %s / %s >\n", x$name, x$version, x$language)
112 |   cat(summary_line)
113 |
114 |   invisible(x)
115 |
116 | }
117 |
118 |
119 | # Closes all virtual browser opened windows & stop all background JavaScript processing
120 | #
121 | # @param wc_obj a `webclient` object
122 | # @return the `webclient` object (invisibly)
123 | # @export
124 | # wc_go <- function(wc_obj, url) {
125 | #
126 | # wc_obj$wc$getPage(url)
127 | #
128 | # invisible(wc_obj)
129 | #
130 | # }
131 |
132 |
133 | #' Print method for `webclient` objects
134 | #' @keywords internal
135 | #' @param x `webclient` object
136 | #' @param ... unused
137 | #' @return `x`
138 | #' @export
139 | print.webclient <- function(x, ...) {
140 |
141 |   bv <- x$wc$getBrowserVersion()
142 |
143 |   # Header line: every browser field needs a matching %s in the format.
144 |   cat(
145 |     sprintf(
146 |       "<webclient - %s / %s / %s>\n",
147 |       bv$getApplicationName(),
148 |       bv$getApplicationVersion(),
149 |       bv$getBrowserLanguage())
150 |   )
151 |
152 |   pg <- x$wc$getCurrentWindow()$getEnclosedPage()
153 |
154 |   # Page details are printed only when the current window encloses a page.
155 |   if (!(.jnull() == pg)) {
156 |
157 |     cat(sprintf(" Current URL: <%s>\n", pg$getUrl()$toString()))
158 |     if (pg$getTitleText() != "") cat(sprintf(" Page Title: <%s>\n", pg$getTitleText()))
159 |
160 |     res <- pg$getWebResponse()
161 |
162 |     cat(sprintf(" Status Code: %s\n", res$getStatusCode()))
163 |     cat(sprintf(" Content Type: %s\n", res$getContentType()))
164 |     cat(sprintf(" Content Length: %s bytes\n", prettyNum(res$getContentLength(), big.mark=",")))
165 |     cat(sprintf(" Load Time: %s ms\n", prettyNum(res$getLoadTime(), big.mark=",")))
166 |
167 |   }
168 |
169 |   invisible(x)
170 | }
171 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://www.repostatus.org/#active)
5 | [](https://keybase.io/hrbrmstr)
7 | 
9 | [](https://travis-ci.org/hrbrmstr/htmlunit)
11 | [](https://codecov.io/gh/hrbrmstr/htmlunit)
13 | 
15 | 
16 |
17 | # htmlunit
18 |
19 | Tools to Scrape Dynamic Web Content via the ‘HtmlUnit’ Java Library
20 |
21 | ## Description
22 |
23 | ‘HtmlUnit’ () is a “‘GUI’-Less
24 | browser for ‘Java’ programs”. It models ‘HTML’ documents and provides an
25 | ‘API’ that allows one to invoke pages, fill out forms, click links and
26 | more just like one does in a “normal” browser. The library has fairly
27 | good and constantly improving ‘JavaScript’ support and is able to work
28 | even with quite complex ‘AJAX’ libraries, simulating ‘Chrome’, ‘Firefox’
29 | or ‘Internet Explorer’ depending on the configuration used. It is
30 | typically used for testing purposes or to retrieve information from web
31 | sites. Tools are provided to work with this library at a higher level
32 | than provided by the exposed ‘Java’ libraries in the ‘htmlunitjars’
33 | package.
34 |
35 | ## What’s Inside The Tin
36 |
37 | The following functions are implemented:
38 |
39 | ### DSL
40 |
41 | - `web_client`/`webclient`: Create a new HtmlUnit WebClient
42 | instance
43 |
44 | - `wc_go`: Visit a URL
45 |
46 | - `wc_html_nodes`: Select nodes from web client active page html
47 | content
48 |
49 | - `wc_html_text`: Extract attributes, text and tag name from webclient
50 | page html content
51 |
52 | - `wc_html_attr`: Extract attributes, text and tag name from webclient
53 | page html content
54 |
55 | - `wc_html_name`: Extract attributes, text and tag name from webclient
56 | page html content
57 |
58 | - `wc_headers`: Return response headers of the last web request for
59 | current page
60 |
61 | - `wc_browser_info`: Retrieve information about the browser used to
62 | create the ‘webclient’
63 |
64 | - `wc_content_length`: Return content length of the last web request
65 | for current page
66 |
67 | - `wc_content_type`: Return content type of web request for current
68 | page
69 |
70 | - `wc_render`: Retrieve current page contents
71 |
72 | - `wc_css`: Enable/Disable CSS support
73 |
74 | - `wc_dnt`: Enable/Disable Do-Not-Track
75 |
76 | - `wc_geo`: Enable/Disable Geolocation
77 |
78 | - `wc_img_dl`: Enable/Disable Image Downloading
79 |
80 | - `wc_load_time`: Return load time of the last web request for current
81 | page
82 |
83 | - `wc_resize`: Resize the virtual browser window
84 |
85 | - `wc_status`: Return status code of web request for current page
86 |
87 | - `wc_timeout`: Change default request timeout
88 |
89 | - `wc_title`: Return page title for current page
90 |
91 | - `wc_url`: Return the URL of the current page
92 |
93 | - `wc_use_insecure_ssl`: Enable/Disable Ignoring SSL Validation Issues
94 |
95 | - `wc_wait`: Block HtmlUnit final rendering until all background
96 | JavaScript tasks have finished executing
97 |
98 | ### Just the Content (pls)
99 |
100 | - `hu_read_html`: Read HTML from a URL with Browser Emulation & in a
101 | JavaScript Context
102 |
103 | ### Content++
104 |
105 | - `wc_inspect`: Perform a “Developer Tools”-like Network Inspection of
106 | a URL
107 |
108 | ## Installation
109 |
110 | ``` r
111 | install.packages("htmlunit", repos = c("https://cinc.rud.is", "https://cloud.r-project.org/"))
112 | # or
113 | remotes::install_git("https://git.rud.is/hrbrmstr/htmlunit.git")
114 | # or
115 | remotes::install_git("https://git.sr.ht/~hrbrmstr/htmlunit")
116 | # or
117 | remotes::install_gitlab("hrbrmstr/htmlunit")
118 | # or
119 | remotes::install_bitbucket("hrbrmstr/htmlunit")
120 | # or
121 | remotes::install_github("hrbrmstr/htmlunit")
122 | ```
123 |
124 | NOTE: To use the ‘remotes’ install options you will need to have the
125 | [{remotes} package](https://github.com/r-lib/remotes) installed.
126 |
127 | ## Usage
128 |
129 | ``` r
130 | library(htmlunit)
131 | library(tidyverse) # for some data ops; not req'd for pkg
132 |
133 | # current version
134 | packageVersion("htmlunit")
135 | ## [1] '0.5.0'
136 | ```
137 |
138 | Something `xml2::read_html()` cannot do: read the table from
139 | <https://hrbrmstr.github.io/htmlunitjars/index.html>:
140 |
141 | 
142 |
143 | ``` r
144 | test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
145 |
146 | pg <- xml2::read_html(test_url)
147 |
148 | html_table(pg)
149 | ## list()
150 | ```
151 |
152 | ☹️
153 |
154 | But, `hu_read_html()` can\!
155 |
156 | ``` r
157 | pg <- hu_read_html(test_url)
158 |
159 | html_table(pg)
160 | ## [[1]]
161 | ## X1 X2
162 | ## 1 One Two
163 | ## 2 Three Four
164 | ## 3 Five Six
165 | ```
166 |
167 | All without needing a separate Selenium or Splash server instance.
168 |
169 | ### Content++
170 |
171 | We can also get a HAR-like content + metadata dump:
172 |
173 | ``` r
174 | xdf <- wc_inspect("https://rstudio.com")
175 |
176 | colnames(xdf)
177 | ## [1] "method" "url" "status_code" "message" "content" "content_length"
178 | ## [7] "content_type" "load_time" "headers"
179 |
180 | select(xdf, method, url, status_code, content_length, load_time)
181 | ## # A tibble: 36 x 5
182 | ## method url status_code content_length load_time
183 | ##
184 | ## 1 GET https://rstudio.com/ 200 14621 495
185 | ## 2 GET https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js 200 3576 221
186 | ## 3 GET https://snap.licdn.com/li.lms-analytics/insight.min.js 200 1576 162
187 | ## 4 GET https://connect.facebook.net/en_US/fbevents.js 200 34269 138
188 | ## 5 GET https://connect.facebook.net/signals/config/151855192184380?v=2.9.23&r=s… 200 134841 66
189 | ## 6 GET https://munchkin.marketo.net/munchkin-beta.js 200 752 230
190 | ## 7 GET https://munchkin.marketo.net/159/munchkin.js 200 4810 27
191 | ## 8 GET https://x.clearbitjs.com/v1/pk_60c5aa2221e3c03eca10fb6876aa6df7/clearbit… 200 86568 483
192 | ## 9 GET https://cdn.segment.com/analytics.js/v1/gO0uTGfCkO4DQpfkRim9mBsjdKrehtnu… 200 62860 243
193 | ## 10 GET https://static.hotjar.com/c/hotjar-1446157.js?sv=6 200 1708 212
194 | ## # … with 26 more rows
195 |
196 | group_by(xdf, content_type) %>%
197 | summarise(
198 | total_size = sum(content_length),
199 | total_load_time = sum(load_time)/1000
200 | )
201 | ## # A tibble: 7 x 3
202 | ## content_type total_size total_load_time
203 | ##
204 | ## 1 application/javascript 431338 2.58
205 | ## 2 application/json 4118 1.37
206 | ## 3 application/x-javascript 176248 0.623
207 | ## 4 image/gif 35 0.232
208 | ## 5 text/html 16640 1.36
209 | ## 6 text/javascript 254971 0.996
210 | ## 7 text/plain 28 0.189
211 | ```
212 |
213 | ### DSL
214 |
215 | ``` r
216 | wc <- web_client(emulate = "chrome")
217 |
218 | wc %>% wc_browser_info()
219 | ## < Netscape / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 / en-US >
220 |
221 | wc <- web_client()
222 |
223 | wc %>% wc_go("https://usa.gov/")
224 |
225 | # if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list()
226 |
227 | wc %>%
228 | wc_html_nodes("a") %>%
229 | sapply(wc_html_text, trim = TRUE) %>%
230 | head(10)
231 | ## [1] "Skip to main content" "" "Español"
232 | ## [4] "1-844-USA-GOV1" "All Topics and Services" "About the U.S."
233 | ## [7] "American Flag" "Branches of the U.S. Government" "Budget of the U.S. Government"
234 | ## [10] "Data and Statistics about the U.S."
235 |
236 | wc %>%
237 | wc_html_nodes(xpath=".//a") %>%
238 | sapply(wc_html_text, trim = TRUE) %>%
239 | head(10)
240 | ## [1] "Skip to main content" "" "Español"
241 | ## [4] "1-844-USA-GOV1" "All Topics and Services" "About the U.S."
242 | ## [7] "American Flag" "Branches of the U.S. Government" "Budget of the U.S. Government"
243 | ## [10] "Data and Statistics about the U.S."
244 |
245 | wc %>%
246 | wc_html_nodes(xpath=".//a") %>%
247 | sapply(wc_html_attr, "href") %>%
248 | head(10)
249 | ## [1] "#content" "/" "/espanol/" "/phone"
250 | ## [5] "/#tpcs" "#" "/flag" "/branches-of-government"
251 | ## [9] "/budget" "/statistics"
252 | ```
253 |
254 | Handy function to get rendered plain text for text mining:
255 |
256 | ``` r
257 | wc %>%
258 | wc_render("text") %>%
259 | substr(1, 300) %>%
260 | cat()
261 | ## Official Guide to Government Information and Services | USAGov
262 | ## Skip to main content
263 | ## An official website of the United States government Here's how you know
264 | ##
265 | ##
266 | ## Main Navigation
267 | ## Search
268 | ## Search
269 | ## Search
270 | ## 1-844-USA-GOV1
271 | ## All Topics and Services
272 | ## Benefits, Grants, Loans
273 | ## Government Agencies and Elected Officials
274 | ```
275 |
276 | ### htmlunit Metrics
277 |
278 | | Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
279 | | :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
280 | | R | 14 | 0.70 | 341 | 0.72 | 188 | 0.70 | 377 | 0.82 |
281 | | Java | 3 | 0.15 | 52 | 0.11 | 23 | 0.09 | 3 | 0.01 |
282 | | Rmd | 1 | 0.05 | 41 | 0.09 | 52 | 0.19 | 75 | 0.16 |
283 | | Maven | 1 | 0.05 | 30 | 0.06 | 0 | 0.00 | 1 | 0.00 |
284 | | make | 1 | 0.05 | 10 | 0.02 | 4 | 0.01 | 4 | 0.01 |
285 |
286 | cloc Package Metrics for htmlunit
287 |
288 | ## Code of Conduct
289 |
290 | Please note that this project is released with a Contributor Code of
291 | Conduct. By participating in this project you agree to abide by its
292 | terms.
293 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------