├── .github └── FUNDING.yml ├── .gitignore ├── .irbrc ├── CHANGELOG.md ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── README.rdoc ├── Rakefile ├── bin └── curlyq ├── curlyq.gemspec ├── curlyq.rdoc ├── lib ├── curly.rb └── curly │ ├── array.rb │ ├── curl.rb │ ├── curl │ ├── html.rb │ └── json.rb │ ├── hash.rb │ ├── numeric.rb │ ├── string.rb │ └── version.rb ├── src └── _README.md └── test ├── curlyq_extract_test.rb ├── curlyq_headlinks_test.rb ├── curlyq_html_test.rb ├── curlyq_images_test.rb ├── curlyq_json_test.rb ├── curlyq_links_test.rb ├── curlyq_scrape_test.rb ├── curlyq_tags_test.rb ├── default_test.rb ├── helpers ├── curlyq-helpers.rb ├── fake_std_out.rb └── threaded_tests.rb └── test_helper.rb /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ttscoff] 2 | custom: ['https://brettterpstra.com/support/', 'https://brettterpstra.com/donate/'] 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | html 2 | *.bak 3 | -------------------------------------------------------------------------------- /.irbrc: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.join(__dir__, 'lib') 2 | require_relative 'lib/curly' 3 | include Curly 4 | 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.0.16 2 | 3 | 2024-11-07 06:45 4 | 5 | #### FIXED 6 | 7 | - Encoding error 8 | 9 | ### 0.0.15 10 | 11 | 2024-10-25 10:31 12 | 13 | #### IMPROVED 14 | 15 | - Better error when no results, return nothing to STDOUT 16 | 17 | ### 0.0.14 18 | 19 | 2024-10-25 10:26 20 | 21 | #### FIXED 22 | 23 | - Fix permissions 24 | 25 | ### 0.0.13 26 | 27 | 2024-10-25 10:23 28 | 29 | #### FIXED 30 | 31 | - 
Fix tests, handle empty results better 32 | 33 | ### 0.0.12 34 | 35 | 2024-04-04 13:06 36 | 37 | #### NEW 38 | 39 | - Add --script option to screenshot command 40 | - Add `execute` command for executing JavaScript on a page 41 | 42 | ### 0.0.11 43 | 44 | 2024-01-21 15:29 45 | 46 | #### IMPROVED 47 | 48 | - Add option for --local_links_only to html and links command, only returning links with the same origin site 49 | 50 | ### 0.0.10 51 | 52 | 2024-01-17 13:50 53 | 54 | #### IMPROVED 55 | 56 | - Update YARD documentation 57 | - Breaking change, ensure all return types are Arrays, even with single objects, to aid in scriptability 58 | - Screenshot test suite 59 | 60 | ### 0.0.9 61 | 62 | 2024-01-16 12:38 63 | 64 | #### IMPROVED 65 | 66 | - You can now use dot syntax inside of a square bracket comparison in --query (`[attrs.id*=what]`) 67 | - *=, ^=, $=, and == work with array values 68 | - [] comparisons with no comparison, e.g. [attrs.id], will return every match that has that element populated 69 | 70 | ### 0.0.8 71 | 72 | 2024-01-15 16:45 73 | 74 | #### IMPROVED 75 | 76 | - Dot syntax query can now operate on a full array using empty set [] 77 | - Dot syntax query should output a specific key, e.g. attrs[id*=news].content (work in progress) 78 | - Dot query syntax handling touch-ups. Piping to jq is still more flexible, but the basics are there. 
79 | 80 | ### 0.0.7 81 | 82 | 2024-01-12 17:03 83 | 84 | #### FIXED 85 | 86 | - Revert back to offering single response (no array) in cases where there are single results (for some commands) 87 | 88 | ### 0.0.6 89 | 90 | 2024-01-12 14:44 91 | 92 | #### CHANGED 93 | 94 | - Attributes array is now a hash directly keyed to the attribute key 95 | 96 | #### NEW 97 | 98 | - Tags command has option to output only raw html of matched tags 99 | 100 | #### FIXED 101 | 102 | - --query works with --search on scrape and tags command 103 | - Json command dot query works now 104 | 105 | ### 0.0.5 106 | 107 | 2024-01-11 18:06 108 | 109 | #### IMPROVED 110 | 111 | - Add --query capabilities to images command 112 | - Add --query to links command 113 | - Allow hyphens in query syntax 114 | - Allow any character other than comma, ampersand, or right square bracket in query value 115 | 116 | #### FIXED 117 | 118 | - Html --search returns a full Curl::Html object 119 | - --query works better with --search and is consistent with other query functions 120 | - Scrape command outputting malformed data 121 | - Hash output when --query is used with scrape 122 | - Nil match on tags command 123 | 124 | ### 0.0.4 125 | 126 | 2024-01-10 13:54 127 | 128 | #### FIXED 129 | 130 | - Queries combined with + or & not requiring all matches to be true 131 | 132 | ### 0.0.3 133 | 134 | 2024-01-10 13:38 135 | 136 | #### IMPROVED 137 | 138 | - Refactor Curl and Json libs to allow setting of options after creation of object 139 | - Allow setting of headers on most subcommands 140 | - --clean now affects source, head, and body keys of output 141 | - Also remove tabs when cleaning whitespace 142 | 143 | ### 0.0.2 144 | 145 | 2024-01-10 09:18 146 | 147 | ### 0.0.1 148 | 149 | 2024-01-10 08:20 150 | 151 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 
gemspec 3 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | curlyq (0.0.16) 5 | gli (~> 2.21.0) 6 | nokogiri (~> 1.16.0) 7 | selenium-webdriver (~> 4.16.0) 8 | tty-which (~> 0.5.0) 9 | 10 | GEM 11 | remote: https://rubygems.org/ 12 | specs: 13 | gli (2.21.5) 14 | nokogiri (1.16.7-arm64-darwin) 15 | racc (~> 1.4) 16 | parallel (1.26.3) 17 | parallel_tests (3.13.0) 18 | parallel 19 | pastel (0.8.0) 20 | tty-color (~> 0.5) 21 | power_assert (2.0.4) 22 | racc (1.8.1) 23 | rake (13.2.1) 24 | rdoc (6.3.4.1) 25 | rexml (3.3.9) 26 | rubyzip (2.3.2) 27 | selenium-webdriver (4.16.0) 28 | rexml (~> 3.2, >= 3.2.5) 29 | rubyzip (>= 1.2.2, < 3.0) 30 | websocket (~> 1.0) 31 | strings-ansi (0.2.0) 32 | test-unit (3.4.9) 33 | power_assert 34 | tty-color (0.6.0) 35 | tty-cursor (0.7.1) 36 | tty-progressbar (0.18.2) 37 | strings-ansi (~> 0.2) 38 | tty-cursor (~> 0.7) 39 | tty-screen (~> 0.8) 40 | unicode-display_width (>= 1.6, < 3.0) 41 | tty-screen (0.8.2) 42 | tty-spinner (0.9.3) 43 | tty-cursor (~> 0.7) 44 | tty-which (0.5.0) 45 | unicode-display_width (2.6.0) 46 | websocket (1.2.11) 47 | yard (0.9.37) 48 | 49 | PLATFORMS 50 | arm64-darwin-20 51 | x86_64-darwin-20 52 | 53 | DEPENDENCIES 54 | curlyq! 
55 | parallel_tests (~> 3.7, >= 3.7.3) 56 | pastel (~> 0.8.0) 57 | rake (~> 13.0, >= 13.0.1) 58 | rdoc (~> 6.3.1) 59 | test-unit (~> 3.4.4) 60 | tty-progressbar (~> 0.18, >= 0.18.2) 61 | tty-spinner (~> 0.9, >= 0.9.3) 62 | yard (~> 0.9, >= 0.9.26) 63 | 64 | BUNDLED WITH 65 | 2.2.29 66 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice (including the next 11 | paragraph) shall be included in all copies or substantial portions of the 12 | Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 17 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 | OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CurlyQ 2 | 3 | [![Gem](https://img.shields.io/gem/v/na.svg)](https://rubygems.org/gems/curlyq) 4 | [![GitHub license](https://img.shields.io/github/license/ttscoff/curlyq.svg)](./LICENSE.txt) 5 | 6 | **A command line helper for curl and web scraping** 7 | 8 | _If you find this useful, feel free to [buy me some coffee][donate]._ 9 | 10 | [donate]: https://brettterpstra.com/donate 11 | 12 | 13 | [jq]: https://github.com/jqlang/jq "Command-line JSON processor" 14 | [yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor" 15 | 16 | The current version of `curlyq` is 0.0.16. 17 | 18 | CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output. 19 | 20 | [github]: https://github.com/ttscoff/curlyq/ 21 | 22 | ### Installation 23 | 24 | Assuming you have Ruby and RubyGems installed, you can just run `gem install curlyq`. If you run into errors, try `gem install --user-install curlyq`, or use `sudo gem install curlyq`. 25 | 26 | If you're using Homebrew, you have the option to install via [brew-gem](https://github.com/sportngin/brew-gem): 27 | 28 | brew install brew-gem 29 | brew gem install curlyq 30 | 31 | If you don't have Ruby/RubyGems, you can install them pretty easily with [Homebrew], [rvm], or [asdf]. 
32 | 33 | [Homebrew]: https://brew.sh/ "Homebrew???The Missing Package Manager for macOS (or Linux)" 34 | [rvm]: https://rvm.io/ "Ruby Version Manager (RVM)" 35 | [asdf]: https://github.com/asdf-vm/asdf "asdf-vm/asdf:Extendable version manager with support for ..." 36 | 37 | ### Usage 38 | 39 | Run `curlyq help` for a list of subcommands. Run `curlyq help SUBCOMMAND` for details on a particular subcommand and its options. 40 | 41 | ``` 42 | NAME 43 | curlyq - A scriptable interface to curl 44 | 45 | SYNOPSIS 46 | curlyq [global options] command [command options] [arguments...] 47 | 48 | VERSION 49 | 0.0.16 50 | 51 | GLOBAL OPTIONS 52 | --help - Show this message 53 | --[no-]pretty - Output "pretty" JSON (default: enabled) 54 | --version - Display the program version 55 | -y, --[no-]yaml - Output YAML instead of json 56 | 57 | COMMANDS 58 | execute - Execute JavaScript on a URL 59 | extract - Extract contents between two regular expressions 60 | headlinks - Return all links on URL's page 61 | help - Shows a list of commands or help for one command 62 | html, curl - Curl URL and output its elements, multiple URLs allowed 63 | images - Extract all images from a URL 64 | json - Get a JSON response from a URL, multiple URLs allowed 65 | links - Return all links on a URL's page 66 | scrape - Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed. 67 | screenshot - Save a screenshot of a URL 68 | tags - Extract all instances of a tag 69 | ``` 70 | 71 | ### Query and Search syntax 72 | 73 | You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands. 74 | 75 | A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `
` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendents. You can also use XPaths, but I hate those so I'm not going to document them. 76 | 77 | > I've tried to make the query function useful, but if you want to do any kind of advanced shaping, you're better off piping the JSON output to [jq] or [yq]. 78 | 79 | 80 | Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `images[rel=me]'` to target only images with a `rel` attribute of `me`. 81 | 82 | The comparisons for the query flag are: 83 | 84 | - `<` less than 85 | - `>` greater than 86 | - `<=` less than or equal to 87 | - `>=` greater than or equal to 88 | - `=` or `==` is equal to 89 | - `*=` contains text 90 | - `^=` starts with text 91 | - `$=` ends with text 92 | 93 | Comparisons can be numeric or string comparisons. A numeric comparison like `curlyq images -q '[width>500]' URL` would return all of the images on the page with a width attribute greater than 500. 94 | 95 | You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison. 96 | 97 | If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string. 
If there are multiple matches, output will be an array: 98 | 99 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/' 100 | 101 |

What’s Next

102 | 103 | #### Commands 104 | 105 | curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible. 106 | 107 | ##### extract 108 | 109 | Example: 110 | 111 | curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 112 | 113 | [ 114 | "Adding time.sleep(10) in various places in case the page had not fully loaded when I was accessing the source." 115 | ] 116 | 117 | This specifies a before and after string and includes them (`-i`) in the result. 118 | 119 | ``` 120 | NAME 121 | extract - Extract contents between two regular expressions 122 | 123 | SYNOPSIS 124 | 125 | curlyq [global options] extract [command options] URL... 126 | 127 | COMMAND OPTIONS 128 | -a, --after=arg - Text after extraction (default: none) 129 | -b, --before=arg - Text before extraction (default: none) 130 | -c, --[no-]compressed - Expect compressed results 131 | --[no-]clean - Remove extra whitespace from results 132 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 133 | -i, --[no-]include - Include the before/after matches in the result 134 | -r, --[no-]regex - Process before/after strings as regular expressions 135 | --[no-]strip - Strip HTML tags from results 136 | ``` 137 | 138 | 139 | ##### execute 140 | 141 | You can execute JavaScript on a given web page using the `execute` subcommand. 142 | 143 | Example: 144 | 145 | curlyq execute -s "NiftyAPI.find('file/save').arrow().shoot('file-save')" file:///Users/ttscoff/Desktop/Code/niftymenu/dist/MultiMarkdown-Composer.html 146 | 147 | You can specify an element id to wait for using `--id`, and define a pause to wait after executing a script with `--wait` (defaults to 2 seconds). 
Scripts can be read from the command line arguments with `--script "SCRIPT"`, from STDIN with `--script -`, or from a file using `--script PATH`. 148 | 149 | If you expect a return value, be sure to include a `return` statement in your executed script. Results will be output to STDOUT. 150 | 151 | ``` 152 | NAME 153 | execute - Execute JavaScript on a URL 154 | 155 | SYNOPSIS 156 | 157 | curlyq [global options] execute [command options] URL... 158 | 159 | COMMAND OPTIONS 160 | -b, --browser=arg - Browser to use (firefox, chrome) (default: chrome) 161 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 162 | -i, --id=arg - Element ID to wait for before executing (default: none) 163 | -s, --script=arg - Script to execute, use - to read from STDIN (may be used more than once, default: none) 164 | -w, --wait=arg - Seconds to wait after executing JS (default: 2) 165 | ``` 166 | 167 | ##### headlinks 168 | 169 | Example: 170 | 171 | curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com 172 | 173 | { 174 | "rel": "stylesheet", 175 | "href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css", 176 | "type": "text/css", 177 | "title": null 178 | } 179 | 180 | This pulls all `` from the `` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`. 181 | 182 | ``` 183 | NAME 184 | headlinks - Return all links on URL's page 185 | 186 | SYNOPSIS 187 | 188 | curlyq [global options] headlinks [command options] URL... 
189 | 190 | COMMAND OPTIONS 191 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 192 | ``` 193 | 194 | ##### html 195 | 196 | The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of: 197 | 198 | - URL, after any redirects 199 | - Response code 200 | - Response headers as a keyed hash 201 | - Meta elements for the page as a keyed hash 202 | - All meta links in the head as an array of objects containing (as available): 203 | - rel 204 | - href 205 | - type 206 | - title 207 | - source of `` 208 | - source of `` 209 | - the page title (determined first by og:title, then by a title tag) 210 | - description (using og:description first) 211 | - All links on the page as an array of objects with: 212 | - href 213 | - title 214 | - rel 215 | - text content 216 | - classes as array 217 | - All images on the page as an array of objects containing: 218 | - class 219 | - all attributes as key/value pairs 220 | - width and height (if specified) 221 | - src 222 | - alt and title 223 | 224 | You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`. 225 | 226 | Example: 227 | 228 | curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com' 229 | 230 | [ 231 | { 232 | "class": "aligncenter", 233 | "original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg", 234 | "at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg", 235 | "width": "800", 236 | "height": "226", 237 | "src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg", 238 | "alt": "Giveaway Robot with Keyboard Maestro icon", 239 | "title": "Giveaway Robot with Keyboard Maestro icon" 240 | } 241 | ] 242 | 243 | The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results. 
244 | 245 | curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 246 | 247 | Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com 248 | 249 | The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`). 250 | 251 | ``` 252 | NAME 253 | html - Curl URL and output its elements, multiple URLs allowed 254 | 255 | SYNOPSIS 256 | 257 | curlyq [global options] html [command options] URL... 258 | 259 | COMMAND OPTIONS 260 | -I, --info - Only retrieve headers/info 261 | -b, --browser=arg - Use a browser to retrieve a dynamic web page (firefox, chrome) (default: none) 262 | -c, --compressed - Expect compressed results 263 | --[no-]clean - Remove extra whitespace from results 264 | -f, --fallback=arg - If curl doesn't work, use a fallback browser (firefox, chrome) (default: none) 265 | -h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none) 266 | --[no-]ignore_fragments - Ignore fragment hrefs when gathering content links 267 | --[no-]ignore_relative - Ignore relative hrefs when gathering content links 268 | -l, --local_links_only - Only gather internal (same-site) links 269 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 270 | -r, --raw=arg - Output a raw value for a key (default: none) 271 | -s, --search=arg - Regurn an array of matches to a CSS or XPath query (default: none) 272 | -x, --external_links_only - Only gather external links 273 | ``` 274 | 275 | ##### images 276 | 277 | The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above). 278 | 279 | The base command will return all images on the page, including OpenGraph images from the head, `` tags from the body, and `` tags along with their child images. 
280 | 281 | OpenGraph images will be returned with the structure: 282 | 283 | { 284 | "type": "opengraph", 285 | "attrs": null, 286 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg" 287 | } 288 | 289 | `img` tags will be returned with the structure: 290 | 291 | { 292 | "type": "img", 293 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg", 294 | "width": "800", 295 | "height": "226", 296 | "alt": "Banner image for CurlyQ", 297 | "title": "CurlyQ, curl better", 298 | "attrs": [ 299 | { 300 | "class": [ 301 | "aligncenter" 302 | ], // all attributes included 303 | } 304 | ] 305 | } 306 | 307 | 308 | 309 | `srcset` images will be returned with the structure: 310 | 311 | { 312 | "type": "srcset", 313 | "attrs": [ 314 | { 315 | "key": "srcset", 316 | "value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x" 317 | } 318 | ], 319 | "images": [ 320 | { 321 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg", 322 | "media": "1x" 323 | }, 324 | { 325 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg", 326 | "media": "2x" 327 | } 328 | ] 329 | } 330 | } 331 | 332 | Example: 333 | 334 | curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com 335 | 336 | This will return an array of images that are `` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`. 337 | 338 | curlyq images -q '[width>750]' https://brettterpstra.com 339 | 340 | This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source. 341 | 342 | ``` 343 | NAME 344 | images - Extract all images from a URL 345 | 346 | SYNOPSIS 347 | 348 | curlyq [global options] images [command options] URL... 
349 | 350 | COMMAND OPTIONS 351 | -c, --[no-]compressed - Expect compressed results 352 | --[no-]clean - Remove extra whitespace from results 353 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 354 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 355 | -t, --type=arg - Type of images to return (img, srcset, opengraph, all) (may be used more than once, default: ["all"]) 356 | ``` 357 | 358 | ##### json 359 | 360 | The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code. 361 | 362 | ``` 363 | NAME 364 | json - Get a JSON response from a URL, multiple URLs allowed 365 | 366 | SYNOPSIS 367 | 368 | curlyq [global options] json [command options] URL... 369 | 370 | COMMAND OPTIONS 371 | -c, --[no-]compressed - Expect compressed results 372 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 373 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 374 | ``` 375 | 376 | ##### links 377 | 378 | Returns all the links on the page, which can be queried on any attribute. 379 | 380 | Example: 381 | 382 | curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 383 | 384 | [ 385 | { 386 | "href": "https://twitter.com/stackoverflow", 387 | "title": null, 388 | "rel": null, 389 | "content": "Twitter", 390 | "class": [ 391 | "-link", 392 | "js-gps-track" 393 | ] 394 | } 395 | ] 396 | 397 | This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`). 
398 | 399 | ``` 400 | NAME 401 | links - Return all links on a URL's page 402 | 403 | SYNOPSIS 404 | 405 | curlyq [global options] links [command options] URL... 406 | 407 | COMMAND OPTIONS 408 | -d, --[no-]dedup - Filter out duplicate links, preserving only first one 409 | --[no-]ignore_fragments - Ignore fragment hrefs when gathering content links 410 | --[no-]ignore_relative - Ignore relative hrefs when gathering content links 411 | -l, --local_links_only - Only gather internal (same-site) links 412 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 413 | -x, --external_links_only - Only gather external links 414 | ``` 415 | 416 | ##### scrape 417 | 418 | Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system. 419 | 420 | Example: 421 | 422 | curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 423 | 424 | { 425 | "href": "https://nojack.easydns.ca/@ttscoff", 426 | "title": null, 427 | "rel": [ 428 | "me" 429 | ], 430 | "content": "Mastodon", 431 | "class": [ 432 | "u-url" 433 | ] 434 | } 435 | 436 | This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'. 437 | 438 | ``` 439 | NAME 440 | scrape - Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed. 441 | 442 | SYNOPSIS 443 | 444 | curlyq [global options] scrape [command options] URL... 
445 | 446 | COMMAND OPTIONS 447 | -b, --browser=arg - Browser to use (firefox, chrome) (required, default: none) 448 | --[no-]clean - Remove extra whitespace from results 449 | -h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none) 450 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 451 | -r, --raw=arg - Output a raw value for a key (default: none) 452 | --search=arg - Regurn an array of matches to a CSS or XPath query (default: none) 453 | ``` 454 | 455 | ##### screenshot 456 | 457 | Full-page screenshots require Firefox, installed and specified with `--browser firefox`. 458 | 459 | Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'. 460 | 461 | The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name. 462 | 463 | Example: 464 | 465 | curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 466 | 467 | Screenshot saved to /Users/ttscoff/Desktop/test.png 468 | 469 | You can wait for an element ID to be visible using `--id`. This can be any `#ID` on the page. If the ID doesn't exist on the page, though, the screenshot will hang for a timeout of 10 seconds. 470 | 471 | You can execute a script before taking the screenshot with the `--script` flag. If this is set to `-`, it will read the script from STDIN. If it's set to an existing file path, that file will be read for script input. Specify an interval (in seconds) to wait after executing the script with `--wait`. 472 | 473 | ``` 474 | NAME 475 | screenshot - Save a screenshot of a URL 476 | 477 | SYNOPSIS 478 | 479 | curlyq [global options] screenshot [command options] URL... 
480 | 481 | COMMAND OPTIONS 482 | -b, --browser=arg - Browser to use (firefox, chrome) (default: chrome) 483 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 484 | -i, --id=arg - Element ID to wait for before taking screenshot (default: none) 485 | -o, --out, --file=arg - File destination (required, default: none) 486 | -s, --script=arg - Script to execute before taking screenshot (may be used more than once, default: none) 487 | -t, --type=arg - Type of screenshot to save (full (requires firefox), print, visible) (default: visible) 488 | -w, --wait=arg - Time to wait before taking screenshot (default: 0) 489 | ``` 490 | 491 | ##### tags 492 | 493 | Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag. 494 | 495 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 496 | 497 | [ 498 | { 499 | "tag": "h3", 500 | "source": "

What’s Next

", 501 | "attrs": [ 502 | { 503 | "id": "whats-next" 504 | } 505 | ], 506 | "content": "What???s Next", 507 | "tags": [ 508 | 509 | ] 510 | } 511 | ] 512 | 513 | The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'. 514 | 515 | ``` 516 | NAME 517 | tags - Extract all instances of a tag 518 | 519 | SYNOPSIS 520 | 521 | curlyq [global options] tags [command options] URL... 522 | 523 | COMMAND OPTIONS 524 | -c, --[no-]compressed - Expect compressed results 525 | --[no-]clean - Remove extra whitespace from results 526 | -h, --header=KEY=VAL - Define a header to send as key=value (may be used more than once, default: none) 527 | -q, --query, --filter=DOT_SYNTAX - Dot syntax query to filter results (default: none) 528 | --search=CSS/XPATH - Regurn an array of matches to a CSS or XPath query (default: none) 529 | --[no-]source, --[no-]html - Output the HTML source of the results 530 | -t, --tag=TAG - Specify a tag to collect (may be used more than once, default: none) 531 | ``` 532 | 533 | 534 | PayPal link: [paypal.me/ttscoff](https://paypal.me/ttscoff) 535 | 536 | ## Changelog 537 | 538 | See [CHANGELOG.md](https://github.com/ttscoff/curlyq/blob/main/CHANGELOG.md) 539 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = curly 2 | 3 | A CLI helper for curl and web scraping 4 | 5 | :include:curlyq.rdoc 6 | 7 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake/clean' 2 | require 'rake/testtask' 3 | require 'rubygems' 4 | require 'rubygems/package_task' 5 | require 'rdoc/task' 6 | require 'yard' 7 | require 'parallel_tests' 8 | require 'parallel_tests/tasks' 9 | require 'tty-spinner' 10 | 11 | YARD::Rake::YardocTask.new do |t| 12 | 
t.files = ['lib/curly/*.rb'] 13 | t.options = ['--markup=markdown', '--no-private', '-p', 'yard_templates'] 14 | # t.stats_options = ['--list-undoc'] 15 | end 16 | 17 | task :doc, [*Rake.application[:yard].arg_names] => [:yard] 18 | 19 | Rake::RDocTask.new do |rd| 20 | rd.main = "README.rdoc" 21 | rd.rdoc_files.include("README.rdoc","lib/**/*.rb","bin/**/*") 22 | rd.title = 'curlyq' 23 | end 24 | 25 | spec = eval(File.read('curlyq.gemspec')) 26 | 27 | Gem::PackageTask.new(spec) do |pkg| 28 | end 29 | 30 | namespace :test do 31 | FileList['test/*_test.rb'].each do |rakefile| 32 | test_name = File.basename(rakefile, '.rb').sub(/^.*?_(.*?)_.*?$/, '\1') 33 | 34 | Rake::TestTask.new(:"#{test_name}") do |t| 35 | t.libs << ['test', 'test/helpers'] 36 | t.pattern = rakefile 37 | t.verbose = ENV['VERBOSE'] =~ /(true|1)/i ? true : false 38 | end 39 | # Define default task for :test 40 | task default: test_name 41 | end 42 | end 43 | 44 | desc 'Run one test verbosely' 45 | task :test_one, :test do |_, args| 46 | args.with_defaults(test: '*') 47 | puts `bundle exec rake test TESTOPTS="-v" TEST="test/curlyq_#{args[:test]}_test.rb"` 48 | end 49 | 50 | desc 'Run all tests, threaded' 51 | task :test, :pattern, :threads, :max_tests do |_, args| 52 | args.with_defaults(pattern: '*', threads: 8, max_tests: 0) 53 | pattern = args[:pattern] =~ /(n[iu]ll?|0|\.)/i ? 
'*' : args[:pattern] 54 | 55 | require_relative 'test/helpers/threaded_tests' 56 | ThreadedTests.new.run(pattern: pattern, max_threads: args[:threads].to_i, max_tests: args[:max_tests]) 57 | end 58 | 59 | desc 'Install current gem in all versions of asdf-controlled ruby' 60 | task :install do 61 | Rake::Task['clobber'].invoke 62 | Rake::Task['package'].invoke 63 | Dir.chdir 'pkg' 64 | file = Dir.glob('*.gem').last 65 | 66 | current_ruby = `asdf current ruby`.match(/(\d.\d+.\d+)/)[1] 67 | 68 | `asdf list ruby`.split.map { |ruby| ruby.strip.sub(/^*/, '') }.each do |ruby| 69 | `asdf shell ruby #{ruby}` 70 | puts `gem install #{file}` 71 | end 72 | 73 | `asdf shell ruby #{current_ruby}` 74 | end 75 | 76 | desc 'Development version check' 77 | task :ver do 78 | gver = `git ver` 79 | cver = IO.read(File.join(File.dirname(__FILE__), 'CHANGELOG.md')).match(/^#+ (\d+\.\d+\.\d+(\w+)?)/)[1] 80 | res = `grep VERSION lib/curly/version.rb` 81 | version = res.match(/VERSION *= *['"](\d+\.\d+\.\d+(\w+)?)/)[1] 82 | puts "git tag: #{gver}" 83 | puts "version.rb: #{version}" 84 | puts "changelog: #{cver}" 85 | end 86 | 87 | desc 'Changelog version check' 88 | task :cver do 89 | puts IO.read(File.join(File.dirname(__FILE__), 'CHANGELOG.md')).match(/^#+ (\d+\.\d+\.\d+(\w+)?)/)[1] 90 | end 91 | 92 | desc 'Bump incremental version number' 93 | task :bump, :type do |_, args| 94 | args.with_defaults(type: 'inc') 95 | version_file = 'lib/curly/version.rb' 96 | content = IO.read(version_file) 97 | content.sub!(/VERSION = '(?\d+)\.(?\d+)\.(?\d+)(?
\S+)?'/) do
 98 |     m = Regexp.last_match
 99 |     major = m['major'].to_i
100 |     minor = m['minor'].to_i
101 |     inc = m['inc'].to_i
102 |     pre = m['pre']
103 | 
104 |     case args[:type]
105 |     when /^maj/
106 |       major += 1
107 |       minor = 0
108 |       inc = 0
109 |     when /^min/
110 |       minor += 1
111 |       inc = 0
112 |     else
113 |       inc += 1
114 |     end
115 | 
116 |     $stdout.puts "At version #{major}.#{minor}.#{inc}#{pre}"
117 |     "VERSION = '#{major}.#{minor}.#{inc}#{pre}'"
118 |   end
119 |   File.open(version_file, 'w+') { |f| f.puts content }
120 | end
121 | 
122 | task default: %i[test clobber package]
123 | 


--------------------------------------------------------------------------------
/bin/curlyq:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# curlyq entry point: a scriptable interface to curl for web scraping.
require 'gli'
require 'curly'
require 'curly/curl'

include GLI::App

program_desc 'A scriptable interface to curl'

version Curly::VERSION

subcommand_option_handling :normal
arguments :strict

# Custom GLI argument types. Each `accept` block normalizes the raw CLI
# string into a Symbol via normalize_* helpers (presumably String
# extensions provided by the curly library — defined outside this file).
ImageType = Class.new(Symbol)
accept ImageType do |value|
  value.normalize_image_type(:all)
end

BrowserType = Class.new(Symbol)
accept BrowserType do |value|
  value.normalize_browser_type(:none)
end

ScreenshotType = Class.new(Symbol)
accept ScreenshotType do |value|
  value.normalize_screenshot_type(:full_page)
end

# Global output-format switches, available to every command
desc 'Output YAML instead of json'
switch %i[y yaml]

desc 'Output "pretty" JSON'
switch %i[pretty], default_value: true, negatable: true

# TODO: Post method, html and json with --data flags, accept key=value and files
# TODO: Handle binary responses, deal gracefully with compressed data
# TODO: File uploads?
 40 | def self.break_headers(headers)
 41 |   out = {}
 42 |   headers.each do |h|
 43 |     m = h.match(/(?[^=]+)=(?.*?)$/)
 44 |     out[m['key'].strip] = m['value'].strip
 45 |   end
 46 |   out
 47 | end
 48 | 
 49 | def self.print_out(output, yaml, raw: false, pretty: true)
 50 |   output = output.to_data if output.respond_to?(:to_data)
 51 |   # Was intended to flatten single responses, but not getting an array back is unpredictable
 52 |   output = output.clean_output
 53 |   if output.is_a?(String)
 54 |     print output
 55 |   elsif raw
 56 |     output = output.join("\n") if output.is_a?(Array)
 57 |     print output
 58 |   else
 59 |     if yaml
 60 |       print YAML.dump(output)
 61 |     else
 62 |       print pretty ? JSON.pretty_generate(output) : JSON.generate(output)
 63 |     end
 64 |   end
 65 | end
 66 | 
 67 | desc 'Curl URL and output its elements, multiple URLs allowed'
 68 | arg_name 'URL', multiple: true
 69 | command %i[html curl] do |c|
 70 |   c.desc 'Only retrieve headers/info'
 71 |   c.switch %i[I info], negatable: false
 72 | 
 73 |   c.desc 'Regurn an array of matches to a CSS or XPath query'
 74 |   c.flag %i[s search]
 75 | 
 76 |   c.desc 'Define a header to send as "key=value"'
 77 |   c.flag %i[h header], multiple: true
 78 | 
 79 |   c.desc 'Use a browser to retrieve a dynamic web page (firefox, chrome)'
 80 |   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/
 81 | 
 82 |   c.desc %(If curl doesn't work, use a fallback browser (firefox, chrome))
 83 |   c.flag %i[f fallback], type: BrowserType, must_match: /^[fc].*?$/
 84 | 
 85 |   c.desc 'Expect compressed results'
 86 |   c.switch %i[c compressed], negatable: false
 87 | 
 88 |   c.desc 'Remove extra whitespace from results'
 89 |   c.switch %i[clean]
 90 | 
 91 |   c.desc 'Filter output using dot-syntax path'
 92 |   c.flag %i[q query filter]
 93 | 
 94 |   c.desc 'Output a raw value for a key'
 95 |   c.flag %i[r raw]
 96 | 
 97 |   c.desc 'Ignore relative hrefs when gathering content links'
 98 |   c.switch %i[ignore_relative], negatable: true
 99 | 
100 |   c.desc 'Ignore fragment hrefs when gathering content links'
101 |   c.switch %i[ignore_fragments], negatable: true
102 | 
103 |   c.desc 'Only gather external links'
104 |   c.switch %i[x external_links_only], default_value: false, negatable: false
105 | 
106 |   c.desc 'Only gather internal (same-site) links'
107 |   c.switch %i[l local_links_only], default_value: false, negatable: false
108 | 
109 |   c.action do |global_options, options, args|
110 |     urls = args.join(' ').split(/[, ]+/)
111 |     headers = break_headers(options[:header])
112 | 
113 |     output = []
114 | 
115 |     urls.each do |url|
116 |       curl_settings = { browser: options[:browser], fallback: options[:fallback],
117 |                         headers: headers, headers_only: options[:info],
118 |                         compressed: options[:compressed], clean: options[:clean],
119 |                         ignore_local_links: options[:ignore_relative],
120 |                         ignore_fragment_links: options[:ignore_fragments],
121 |                         external_links_only: options[:external_links_only],
122 |                         local_links_only: options[:local_links_only] }
123 |       res = Curl::Html.new(url, curl_settings)
124 |       res.curl
125 | 
126 |       if options[:info]
127 |         output.push(res.headers)
128 |         next
129 |       end
130 | 
131 |       if options[:search]
132 |         source = res.search(options[:search], return_source: true)
133 | 
134 |         out = res.parse(source)
135 | 
136 |         if options[:query]
137 |           out = out.to_data(url: url, clean: options[:clean]).dot_query(options[:query], full_tag: false)
138 |         else
139 |           out = out.to_data
140 |         end
141 |         output.push([out])
142 |       elsif options[:query]
143 |         queried = res.to_data.dot_query(options[:query], full_tag: false)
144 |         output.push(queried) if queried
145 |       else
146 |         output.push(res.to_data(url: url))
147 |       end
148 |     end
149 |     output.delete_if(&:nil?)
150 |     output.delete_if(&:empty?)
151 | 
152 |     exit_now!('No results') if output.nil? || output.empty?
153 | 
154 |     output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
155 | 
156 |     output = output.clean_output
157 | 
158 |     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
159 |   end
160 | end
161 | 
desc 'Execute JavaScript on a URL'
arg_name 'URL', multiple: true
command :execute do |c|
  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Script to execute, use - to read from STDIN'
  c.flag %i[s script], multiple: true

  c.desc 'Element ID to wait for before executing'
  c.flag %i[i id]

  c.desc 'Seconds to wait after executing JS'
  c.flag %i[w wait], default_value: 2

  c.action do |_, options, args|
    urls = args.join(' ').split(/[, ]+/)

    # FIX: --script is a `multiple` flag, so options[:script] is always an
    # Array (truthy even when empty) and the original guard — which also
    # referenced a nonexistent options[:file] — could never raise. Require
    # at least one script source explicitly.
    raise 'Script input required' if options[:script].nil? || options[:script].empty?

    # Each entry may be '-' (read STDIN), an existing file path (read the
    # file), or literal JavaScript
    compiled_script = options[:script].map do |scr|
      scr = scr.strip
      if scr == '-'
        $stdin.read
      elsif File.exist?(File.expand_path(scr))
        IO.read(File.expand_path(scr))
      else
        scr
      end
    end

    script = compiled_script.empty? ? nil : compiled_script.join(';')

    headers = break_headers(options[:header])

    # The browser flag may arrive pre-normalized (Symbol) or as a raw string
    browser = options[:browser]
    browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type

    urls.each do |url|
      c = Curl::Html.new(url)
      c.headers = headers
      c.browser = browser
      $stdout.puts c.execute(script, options[:wait], options[:id])
    end
  end
end
216 | 
desc 'Save a screenshot of a URL'
arg_name 'URL', multiple: true
command :screenshot do |c|
  c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
  c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'visible'

  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'

  c.desc 'File destination'
  c.flag %i[o out file], required: true

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Script to execute before taking screenshot'
  c.flag %i[s script], multiple: true

  c.desc 'Element ID to wait for before taking screenshot'
  c.flag %i[i id]

  c.desc 'Time to wait before taking screenshot'
  c.flag %i[w wait], default_value: 0, type: Integer

  c.action do |_, options, args|
    # URLs may arrive as multiple args or one comma/space-delimited string
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    type = options[:type]
    browser = options[:browser]

    # Flags may arrive pre-normalized (Symbol) or as raw strings
    type = type.is_a?(Symbol) ? type : type.normalize_screenshot_type
    browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type

    compiled_script = []

    # Gather script sources: '-' reads STDIN, an existing path reads the
    # file, anything else is treated as literal JavaScript
    if options[:script].count.positive?
      options[:script].each do |scr|
        scr.strip!
        if scr == '-'
          compiled_script << $stdin.read
        elsif File.exist?(File.expand_path(scr))
          compiled_script << IO.read(File.expand_path(scr))
        else
          compiled_script << scr
        end
      end
    end

    # Multiple scripts are joined into one statement sequence
    script = compiled_script.count.positive? ? compiled_script.join(';') : nil

    raise 'Full page screen shots only available with Firefox' if type == :full_page && browser != :firefox

    urls.each do |url|
      c = Curl::Html.new(url)
      c.headers = headers
      c.browser = browser
      c.screenshot(options[:out], type: type, script: script, id: options[:id], wait: options[:wait])
    end
  end
end
278 | 
desc 'Get a JSON response from a URL, multiple URLs allowed'
arg_name 'URL', multiple: true
command :json do |c|
  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    output = []

    urls.each do |url|
      res = Curl::Json.new(url)
      res.request_headers = headers
      # FIX: the original line ended with a comma, turning this into a
      # multiple assignment that set `compressed` to an Array (always
      # truthy, even with --no-compressed). Plain assignment restores the
      # intended boolean.
      res.compressed = options[:compressed]
      res.symbolize_names = false
      res.curl

      json = res.json

      if json.nil?
        # Surface a structured error entry rather than crashing
        output.push({
          status: 'error parsing JSON',
          url: res.url,
          code: res.code,
          headers: res.headers
        })
      else
        if options[:query]
          if options[:query] =~ /^json$/
            # A bare 'json' query means "return the whole parsed payload".
            # FIX: the original fell through and clobbered this with
            # dot_query(query) where `query` was never assigned.
            res = json
          else
            # Allow an optional leading 'json.' prefix on the query path
            query = options[:query].sub(/^json\./, '')
            res = json.dot_query(query)
          end
        else
          res = res.to_data
        end

        output.push(res)
      end
    end

    output = output.clean_output

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
337 | 
desc 'Extract contents between two regular expressions'
arg_name 'URL', multiple: true
command :extract do |c|
  c.desc 'Text before extraction'
  c.flag %i[b before]

  c.desc 'Text after extraction'
  c.flag %i[a after]

  c.desc 'Process before/after strings as regular expressions'
  c.switch %i[r regex]

  c.desc 'Include the before/after matches in the result'
  c.switch %i[i include]

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Strip HTML tags from results'
  c.switch %i[strip]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    # FIX: both delimiters are required; fail with a clear message instead
    # of an opaque TypeError from Regexp.escape(nil) below
    exit_now!('--before and --after are required') if options[:before].nil? || options[:after].nil?

    output = []

    urls.each do |url|
      res = Curl::Html.new(url, { headers: headers, headers_only: false,
                                  compressed: options[:compressed], clean: options[:clean] })
      res.curl

      # Interpret delimiters as regex only with --regex; otherwise treat
      # them as literal text
      if options[:regex]
        before = Regexp.new(options[:before])
        after = Regexp.new(options[:after])
      else
        before = /#{Regexp.escape(options[:before])}/
        after = /#{Regexp.escape(options[:after])}/
      end

      extracted = res.extract(before, after, inclusive: options[:include])
      extracted.strip_tags! if options[:strip]
      output.concat(extracted)
    end

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
391 | 
desc 'Extract all instances of a tag'
arg_name 'URL', multiple: true
command :tags do |c|
  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true, arg_name: 'KEY=VAL'

  c.desc 'Specify a tag to collect'
  c.flag %i[t tag], multiple: true, arg_name: 'TAG'

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Output the HTML source of the results'
  c.switch %i[source html]

  c.desc 'Dot syntax query to filter results'
  c.flag %i[q query filter], arg_name: 'DOT_SYNTAX'

  # FIX: "Regurn" typo in user-visible help text
  c.desc 'Return an array of matches to a CSS or XPath query'
  c.flag %i[search], arg_name: 'CSS/XPATH'

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])
    tags = options[:tag].join(' ').split(/[, ]+/)
    output = []

    urls.each do |url|
      # FIX: original passed headers_only: options[:headers], an option this
      # command never defines (always nil) — use false explicitly
      res = Curl::Html.new(url, { headers: headers, headers_only: false,
                                  compressed: options[:compressed], clean: options[:clean] })
      res.curl

      # FIX: the original reset `output = []` inside this loop, discarding
      # results from all but the last URL
      if options[:search]
        out = res.search(options[:search])

        out = out.dot_query(options[:query]) if options[:query]
        output.push(out)
      elsif options[:query]
        # Accumulate across URLs; dot_query may yield an Array or one object
        queried = res.to_data.dot_query(options[:query])
        queried.is_a?(Array) ? output.concat(queried) : output.push(queried) if queried
      elsif tags.count.positive?
        tags.each { |tag| output.concat(res.tags(tag)) }
      else
        output.concat(res.tags)
      end
    end

    output = output.clean_output

    if options[:source]
      puts output.to_html
    else
      print_out(output, global_options[:yaml], pretty: global_options[:pretty])
    end
  end
end
451 | 
desc 'Extract all images from a URL'
arg_name 'URL', multiple: true
command :images do |c|
  c.desc 'Type of images to return (img, srcset, opengraph, all)'
  c.flag %i[t type], multiple: true, type: ImageType, default_value: ['all']

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    # NOTE(review): headers are parsed but never passed to Curl::Html below —
    # confirm whether --header should have an effect on this command
    headers = break_headers(options[:header])

    output = []

    # --type may be given multiple times and/or comma-delimited; normalize
    # each entry to a Symbol
    types = options[:type].join(' ').split(/[ ,]+/).map(&:normalize_image_type)

    urls.each do |url|
      res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
      res.curl

      res = res.images(types: types)
      # Wrap in a hash so the dot query can address the 'images' collection
      res = { images: res }.dot_query(options[:query], 'images', full_tag: false) if options[:query]

      # Flatten array results; keep single objects as one entry
      if res.is_a?(Array)
        output.concat(res)
      else
        output.push(res)
      end
    end

    exit_now!('No results') if output.nil? || output.empty?

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
497 | 
desc %(Return all links on a URL's page)
arg_name 'URL', multiple: true
command :links do |c|
  c.desc 'Ignore relative hrefs when gathering content links'
  c.switch %i[ignore_relative], negatable: true

  c.desc 'Ignore fragment hrefs when gathering content links'
  c.switch %i[ignore_fragments], negatable: true

  c.desc 'Only gather external links'
  c.switch %i[x external_links_only], default_value: false, negatable: false

  c.desc 'Only gather internal (same-site) links'
  c.switch %i[l local_links_only], default_value: false, negatable: false

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Filter out duplicate links, preserving only first one'
  c.switch %i[d dedup], negatable: true

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      # NOTE(review): :compressed and :clean are read here but this command
      # defines no such switches, so both are always nil — confirm intent
      res = Curl::Html.new(url, {
                             compressed: options[:compressed], clean: options[:clean],
                             ignore_local_links: options[:ignore_relative],
                             ignore_fragment_links: options[:ignore_fragments],
                             external_links_only: options[:external_links_only],
                             local_links_only: options[:local_links_only]
                           })
      res.curl

      if options[:query]
        # Scope the dot query to the 'links' collection of the page data
        queried = res.to_data.dot_query(options[:query], 'links', full_tag: false)

        # dot_query may return an Array or a single object; flatten arrays
        queried.is_a?(Array) ? output.concat(queried) : output.push(queried) if queried
      else
        output.concat(res.body_links)
      end
    end

    output.dedup_links! if options[:dedup]

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
548 | 
# NOTE(review): the description below reads "all  links" — a word (likely
# "<head>") appears to have been lost; this command returns <link> elements
# from the page head (meta_links). Confirm and restore the original wording.
desc %(Return all  links on URL's page)
arg_name 'URL', multiple: true
command :headlinks do |c|
  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      # NOTE(review): :compressed and :clean are read here but this command
      # defines no such switches, so both are always nil — confirm intent
      res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
      res.curl

      if options[:query]
        # Wrap meta_links so the dot query can address the 'links' collection
        queried = { links: res.to_data[:meta_links] }.dot_query(options[:query], 'links', full_tag: false)
        output.push(queried) if queried
      else
        output.push(res.to_data[:meta_links])
      end
    end

    output = output.clean_output

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
577 | 
desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.)
arg_name 'URL', multiple: true
command :scrape do |c|
  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, required: true

  # FIX: "Regurn" typo in user-visible help text
  c.desc 'Return an array of matches to a CSS or XPath query'
  c.flag %i[search]

  c.desc 'Define a header to send as "key=value"'
  c.flag %i[h header], multiple: true

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Output a raw value for a key'
  c.flag %i[r raw]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      res = Curl::Html.new(url, { browser: options[:browser], clean: options[:clean] })
      res.curl

      if options[:search]
        out = res.search(options[:search])

        # dot_query may return nil on no match; the nil is purged below
        out = out.dot_query(options[:query], full_tag: false) if options[:query]
        output.push(out)
      elsif options[:query]
        queried = res.to_data(url: url).dot_query(options[:query], full_tag: false)
        output.push(queried) if queried
      else
        output.push(res.to_data(url: url))
      end
    end

    # FIX: purge nils (possible via the search branch above) before calling
    # :empty? on every element, which would raise NoMethodError on nil —
    # mirrors the html command's handling
    output.delete_if(&:nil?)
    output.delete_if(&:empty?)

    output = output.clean_output

    # --raw plucks a single key from each result
    output.map! { |o| o[options[:raw].to_sym] } if options[:raw]

    print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
  end
end
632 | 
# GLI lifecycle hooks. All three are currently pass-throughs.
pre do |global, command, options, args|
  # Pre logic here
  # Return true to proceed; false to abort and not call the
  # chosen command
  # Use skips_pre before a command to skip this block
  # on that command only
  true
end

post do |global, command, options, args|
  # Post logic here
  # Use skips_post before a command to skip this
  # block on that command only
end

on_error do |exception|
  # Error logic here
  # return false to skip default error handling
  true
end

# Hand control to GLI; its return value becomes the process exit status
exit run(ARGV)
655 | 


--------------------------------------------------------------------------------
/curlyq.gemspec:
--------------------------------------------------------------------------------
# Gem specification for curlyq.
# Ensure we require the local version and not one we might have installed already
require File.join([File.dirname(__FILE__),'lib','curly','version.rb'])
spec = Gem::Specification.new do |s|
  s.name = 'curlyq'
  s.version = Curly::VERSION
  s.author = 'Brett Terpstra'
  s.email = 'me@brettterpstra.com'
  s.homepage = 'https://brettterpstra.com'
  s.platform = Gem::Platform::RUBY
  s.licenses = 'MIT'
  s.summary = 'A CLI helper for curl and web scraping'
  # Package exactly what git tracks; the string below contains a literal
  # newline used as the split delimiter
  s.files = `git ls-files`.split("
")
  s.require_paths << 'lib'
  s.extra_rdoc_files = ['README.rdoc','curlyq.rdoc']
  s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
  s.bindir = 'bin'
  s.executables << 'curlyq'
  # Development-only dependencies (tests, docs, progress UI)
  s.add_development_dependency('rake','~> 13.0', '>= 13.0.1')
  s.add_development_dependency('rdoc', '~> 6.3.1')
  s.add_development_dependency('test-unit', '~> 3.4.4')
  s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
  s.add_development_dependency('tty-spinner', '~> 0.9', '>= 0.9.3')
  s.add_development_dependency('tty-progressbar', '~> 0.18', '>= 0.18.2')
  s.add_development_dependency('pastel', '~> 0.8.0')
  s.add_development_dependency('parallel_tests', '~> 3.7', '>= 3.7.3')
  # Runtime dependencies: CLI framework, executable lookup, HTML parsing,
  # and browser automation for dynamic pages
  s.add_runtime_dependency('gli','~> 2.21.0')
  s.add_runtime_dependency('tty-which','~> 0.5.0')
  s.add_runtime_dependency('nokogiri','~> 1.16.0')
  s.add_runtime_dependency('selenium-webdriver', '~> 4.16.0')
end
32 | 


--------------------------------------------------------------------------------
/curlyq.rdoc:
--------------------------------------------------------------------------------
  1 | == curlyq - A scriptable interface to curl
  2 | 
  3 | v0.0.1
  4 | 
  5 | === Global Options
  6 | === --help
  7 | Show this message
  8 | 
  9 | 
 10 | 
 11 | === --[no-]pretty
 12 | Output "pretty" JSON
 13 | 
 14 | 
 15 | 
 16 | === --version
 17 | Display the program version
 18 | 
 19 | 
 20 | 
 21 | === -y|--[no-]yaml
 22 | Output YAML instead of json
 23 | 
 24 | 
 25 | 
 26 | === Commands
 27 | ==== Command: extract  URL...
 28 | Extract contents between two regular expressions
 29 | 
 30 | 
 31 | ===== Options
 32 | ===== -a|--after arg
 33 | 
 34 | Text after extraction, parsed as regex
 35 | 
 36 | [Default Value] None
 37 | 
 38 | 
 39 | ===== -b|--before arg
 40 | 
 41 | Text before extraction, parsed as regex
 42 | 
 43 | [Default Value] None
 44 | 
 45 | 
 46 | ===== -h|--header arg
 47 | 
 48 | Define a header to send as key=value
 49 | 
 50 | [Default Value] None
 51 | 
 52 | 
 53 | ===== -c|--[no-]compressed
 54 | Expect compressed results
 55 | 
 56 | 
 57 | 
 58 | ===== --[no-]clean
 59 | Remove extra whitespace from results
 60 | 
 61 | 
 62 | 
 63 | ===== --[no-]strip
 64 | Strip HTML tags from results
 65 | 
 66 | 
 67 | 
 68 | ==== Command: headlinks  URL...
 69 | Return all <head> links on URL's page
 70 | 
 71 | 
 72 | ===== Options
 73 | ===== -q|--query|--filter arg
 74 | 
 75 | Filter output using dot-syntax path
 76 | 
 77 | [Default Value] None
 78 | 
 79 | 
 80 | ==== Command: help  command
 81 | Shows a list of commands or help for one command
 82 | 
 83 | Gets help for the application or its commands. Can also list the commands in a way helpful to creating a bash-style completion function
 84 | ===== Options
 85 | ===== -c
 86 | List commands one per line, to assist with shell completion
 87 | 
 88 | 
 89 | 
 90 | ==== Command: html|curl  URL...
 91 | Curl URL and output its elements, multiple URLs allowed
 92 | 
 93 | 
 94 | ===== Options
 95 | ===== -b|--browser arg
 96 | 
 97 | Use a browser to retrieve a dynamic web page (firefox, chrome)
 98 | 
 99 | [Default Value] None
100 | [Must Match] (?-mix:^[fc].*?$)
101 | 
102 | 
103 | ===== -f|--fallback arg
104 | 
105 | If curl doesn't work, use a fallback browser (firefox, chrome)
106 | 
107 | [Default Value] None
108 | [Must Match] (?-mix:^[fc].*?$)
109 | 
110 | 
111 | ===== -h|--header arg
112 | 
113 | Define a header to send as "key=value"
114 | 
115 | [Default Value] None
116 | 
117 | 
118 | ===== -q|--query|--filter arg
119 | 
120 | Filter output using dot-syntax path
121 | 
122 | [Default Value] None
123 | 
124 | 
125 | ===== -r|--raw arg
126 | 
127 | Output a raw value for a key
128 | 
129 | [Default Value] None
130 | 
131 | 
132 | ===== --search arg
133 | 
134 | Return an array of matches to a CSS or XPath query
135 | 
136 | [Default Value] None
137 | 
138 | 
139 | ===== -I|--info
140 | Only retrieve headers/info
141 | 
142 | 
143 | 
144 | ===== -c|--compressed
145 | Expect compressed results
146 | 
147 | 
148 | 
149 | ===== --[no-]clean
150 | Remove extra whitespace from results
151 | 
152 | 
153 | 
154 | ===== --[no-]ignore_fragments
155 | Ignore fragment hrefs when gathering content links
156 | 
157 | 
158 | 
159 | ===== --[no-]ignore_relative
160 | Ignore relative hrefs when gathering content links
161 | 
162 | 
163 | 
164 | ===== -x|--external_links_only
165 | Only gather external links
166 | 
167 | 
168 | 
169 | ==== Command: images  URL...
170 | Extract all images from a URL
171 | 
172 | 
173 | ===== Options
174 | ===== -t|--type arg
175 | 
176 | Type of images to return (img, srcset, opengraph, all)
177 | 
178 | [Default Value] ["all"]
179 | 
180 | 
181 | ===== -c|--[no-]compressed
182 | Expect compressed results
183 | 
184 | 
185 | 
186 | ===== --[no-]clean
187 | Remove extra whitespace from results
188 | 
189 | 
190 | 
191 | ==== Command: json  URL...
192 | Get a JSON response from a URL, multiple URLs allowed
193 | 
194 | 
195 | ===== Options
196 | ===== -h|--header arg
197 | 
198 | Define a header to send as key=value
199 | 
200 | [Default Value] None
201 | 
202 | 
203 | ===== -q|--query|--filter arg
204 | 
205 | Filter output using dot-syntax path
206 | 
207 | [Default Value] None
208 | 
209 | 
210 | ===== -c|--[no-]compressed
211 | Expect compressed results
212 | 
213 | 
214 | 
215 | ==== Command: links  URL...
216 | Return all links on a URL's page
217 | 
218 | 
219 | ===== Options
220 | ===== -q|--query|--filter arg
221 | 
222 | Filter output using dot-syntax path
223 | 
224 | [Default Value] None
225 | 
226 | 
227 | ===== -d|--[no-]dedup
228 | Filter out duplicate links, preserving only first one
229 | 
230 | 
231 | 
232 | ===== --[no-]ignore_fragments
233 | Ignore fragment hrefs when gathering content links
234 | 
235 | 
236 | 
237 | ===== --[no-]ignore_relative
238 | Ignore relative hrefs when gathering content links
239 | 
240 | 
241 | 
242 | ===== -x|--external_links_only
243 | Only gather external links
244 | 
245 | 
246 | 
247 | ==== Command: scrape  URL...
248 | Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.
249 | 
250 | 
251 | ===== Options
252 | ===== -b|--browser arg
253 | 
254 | Browser to use (firefox, chrome)
255 | 
256 | [Default Value] None
257 | 
258 | 
259 | ===== -h|--header arg
260 | 
261 | Define a header to send as "key=value"
262 | 
263 | [Default Value] None
264 | 
265 | 
266 | ===== -q|--query|--filter arg
267 | 
268 | Filter output using dot-syntax path
269 | 
270 | [Default Value] None
271 | 
272 | 
273 | ===== -r|--raw arg
274 | 
275 | Output a raw value for a key
276 | 
277 | [Default Value] None
278 | 
279 | 
280 | ===== --search arg
281 | 
282 | Return an array of matches to a CSS or XPath query
283 | 
284 | [Default Value] None
285 | 
286 | 
287 | ===== --[no-]clean
288 | Remove extra whitespace from results
289 | 
290 | 
291 | 
292 | ==== Command: screenshot  URL...
293 | Save a screenshot of the URL
294 | 
295 | 
296 | ===== Options
297 | ===== -b|--browser arg
298 | 
299 | Browser to use (firefox, chrome)
300 | 
301 | [Default Value] chrome
302 | [Must Match] (?-mix:^[fc].*?$)
303 | 
304 | 
305 | ===== -o|--out|--file arg
306 | 
307 | File destination
308 | 
309 | [Default Value] None
310 | 
311 | 
312 | ===== -t|--type arg
313 | 
314 | Type of screenshot to save (full (requires firefox), print, visible)
315 | 
316 | [Default Value] full
317 | [Must Match] (?-mix:^[fpv].*?$)
318 | 
319 | 
320 | ==== Command: tags  URL...
321 | Extract all instances of a tag
322 | 
323 | 
324 | ===== Options
325 | ===== -h|--header arg
326 | 
327 | Define a header to send as key=value
328 | 
329 | [Default Value] None
330 | 
331 | 
332 | ===== -q|--query|--search arg
333 | 
334 | CSS/XPath query
335 | 
336 | [Default Value] None
337 | 
338 | 
339 | ===== -t|--tag arg
340 | 
341 | Specify a tag to collect
342 | 
343 | [Default Value] None
344 | 
345 | 
346 | ===== -c|--[no-]compressed
347 | Expect compressed results
348 | 
349 | 
350 | 
351 | ===== --[no-]clean
352 | Remove extra whitespace from results
353 | 
354 | 
355 | 
356 | 


--------------------------------------------------------------------------------
/lib/curly.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require 'curly/version'
 4 | require 'curly/hash'
 5 | require 'curly/string'
 6 | require 'curly/array'
 7 | require 'curly/numeric'
 8 | require 'json'
 9 | require 'yaml'
10 | require 'uri'
11 | require 'tty-which'
12 | require 'nokogiri'
13 | require 'selenium-webdriver'
14 | 


--------------------------------------------------------------------------------
/lib/curly/array.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | # Array helpers
  4 | class ::Array
  5 |   ##
  6 |   ## Remove extra spaces from each element of an array of
  7 |   ## strings
  8 |   ##
  9 |   ## @return     [Array] cleaned array
 10 |   ##
 11 |   def clean
 12 |     map(&:clean)
 13 |   end
 14 | 
 15 |   ##
 16 |   ## @see #clean
 17 |   ##
 18 |   def clean!
 19 |     replace clean
 20 |   end
 21 | 
 22 |   ##
 23 |   ## Strip HTML tags from each element of an array of
 24 |   ## strings
 25 |   ##
 26 |   ## @return     [Array] array of strings with HTML tags removed
 27 |   ##
 28 |   def strip_tags
 29 |     map(&:strip_tags)
 30 |   end
 31 | 
 32 |   ##
 33 |   ## Destructive version of #strip_tags
 34 |   ##
 35 |   ## @see #strip_tags
 36 |   ##
 37 |   def strip_tags!
 38 |     replace strip_tags
 39 |   end
 40 | 
 41 |   ##
 42 |   ## Remove duplicate links from an array of link objects
 43 |   ##
 44 |   ## @return     [Array] deduped array of link objects
 45 |   ##
 46 |   def dedup_links
 47 |     used = []
 48 |     good = []
 49 |     each do |link|
 50 |       href = link[:href].sub(%r{/$}, '')
 51 |       next if used.include?(href)
 52 | 
 53 |       used.push(href)
 54 |       good.push(link)
 55 |     end
 56 | 
 57 |     good
 58 |   end
 59 | 
 60 |   ##
 61 |   ## Destructive version of #dedup_links
 62 |   ##
 63 |   ## @see #dedup_links
 64 |   ##
 65 |   def dedup_links!
 66 |     replace dedup_links
 67 |   end
 68 | 
 69 |   ##
 70 |   ## Run a query on array elements
 71 |   ##
 72 |   ## @param      path [String] dot.syntax path to compare
 73 |   ##
 74 |   ## @return [Array] elements matching dot query
 75 |   ##
 76 |   def dot_query(path)
 77 |     res = map { |el| el.dot_query(path) }
 78 |     res.delete_if { |r| !r }
 79 |     res.delete_if(&:empty?)
 80 |     res
 81 |   end
 82 | 
 83 |   ##
 84 |   ## Gets the value of every item in the array
 85 |   ##
 86 |   ## @param      path  The query path (dot syntax)
 87 |   ##
 88 |   ## @return     [Array] array of values
 89 |   ##
 90 |   def get_value(path)
 91 |     map { |el| el.get_value(path) }
 92 |   end
 93 | 
 94 |   ##
 95 |   ## Convert every item in the array to HTML
 96 |   ##
 97 |   ## @return     [String] Html representation of the object.
 98 |   ##
 99 |   def to_html
100 |     map(&:to_html)
101 |   end
102 | 
103 |   ##
104 |   ## Test if a tag contains an attribute matching filter
105 |   ## queries
106 |   ##
107 |   ## @param      tag_name    [String] The tag name
108 |   ## @param      classes     [String] The classes to match
109 |   ## @param      id          [String] The id attribute to
110 |   ##                         match
111 |   ## @param      attribute   [String] The attribute
112 |   ## @param      operator    [String] The operator, <>= *=
113 |   ##                         $= ^=
114 |   ## @param      value       [String] The value to match
115 |   ## @param      descendant  [Boolean] Check descendant tags
116 |   ##
117 |   ## @return     [Boolean] tag matches
118 |   ##
119 |   def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
120 |     tag = self
121 |     keep = true
122 | 
123 |     keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
124 | 
125 |     if tag.key?('attrs') && tag['attrs']
126 |       if keep && id
127 |         tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
128 |         keep = tag_id && tag_id =~ /#{id}/i
129 |       end
130 | 
131 |       if keep && classes
132 |         cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
133 |         if cls
134 |           all = true
135 |           classes.each { |c| all = cls['value'].include?(c) }
136 |           keep = all
137 |         else
138 |           keep = false
139 |         end
140 |       end
141 | 
142 |       if keep && attribute
143 |         attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
144 |         any = false
145 |         attributes.each do |a|
146 |           break if any
147 | 
148 |           any = case operator
149 |                 when /^*/
150 |                   a['value'] =~ /#{value}/i
151 |                 when /^\^/
152 |                   a['value'] =~ /^#{value}/i
153 |                 when /^\$/
154 |                   a['value'] =~ /#{value}$/i
155 |                 else
156 |                   a['value'] =~ /^#{value}$/i
157 |                 end
158 |         end
159 |         keep = any
160 |       end
161 |     end
162 | 
163 |     return false if descendant && !keep
164 | 
165 |     if !descendant && tag.key?('tags')
166 |       tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
167 |       tags.count.positive?
168 |     else
169 |       keep
170 |     end
171 |   end
172 | 
173 |   ##
174 |   ## Clean up output, shrink single-item arrays, ensure array output
175 |   ##
176 |   ## @return [Array] cleaned up array
177 |   ##
178 |   def clean_output
179 |     output = dup
180 |     while output.is_a?(Array) && output.count == 1
181 |       output = output[0]
182 |     end
183 |     return [] unless output
184 | 
185 |     output.ensure_array
186 |   end
187 | 
188 |   ##
189 |   ## Ensure that an object is an array
190 |   ##
191 |   ## @return     [Array] object as Array
192 |   ##
193 |   def ensure_array
194 |     return self
195 |   end
196 | end
197 | 


--------------------------------------------------------------------------------
/lib/curly/curl.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | # import
4 | require_relative 'curl/html'
5 | 
6 | # import
7 | require_relative 'curl/json'
8 | 


--------------------------------------------------------------------------------
/lib/curly/curl/html.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Curl
  4 |   # String helpers
  class ::String
    ##
    ## Replace non-breaking-space entities with plain spaces.
    ##
    ## FIX: the previous pattern was `gsub(/ /, ' ')` — a space-for-space
    ## no-op; the pattern was evidently meant to be the &nbsp; entity.
    ##
    ## @return [String] text with &nbsp; entities replaced by spaces
    ##
    def remove_entities
      gsub(/&nbsp;/, ' ')
    end
  end
 10 | 
 11 |   # Class for CURLing an HTML page
 12 |   class Html
 13 |     attr_accessor :settings, :browser, :source, :headers, :headers_only, :compressed, :clean, :fallback,
 14 |                   :ignore_local_links, :ignore_fragment_links, :external_links_only, :local_links_only
 15 | 
 16 |     attr_reader :url, :code, :meta, :links, :head, :body,
 17 |                 :title, :description, :body_links, :body_images
 18 | 
 19 |     # Convert self to a hash of data
 20 |     #
 21 |     # @param      url   [String]  A base url to fall back to
 22 |     #
 23 |     # @return     [Hash] a hash of data
 24 |     #
 25 |     def to_data(url: nil)
 26 |       {
 27 |         url: @url || url,
 28 |         code: @code,
 29 |         headers: @headers,
 30 |         meta: @meta,
 31 |         meta_links: @links,
 32 |         head: @clean ? @head&.strip&.clean : @head,
 33 |         body: @clean ? @body&.strip&.clean : @body,
 34 |         source: @clean ? @source&.strip&.clean : @source,
 35 |         title: @title,
 36 |         description: @description,
 37 |         links: @body_links,
 38 |         images: @body_images
 39 |       }
 40 |     end
 41 | 
 42 |     ##
 43 |     ## Create a new page object from a URL
 44 |     ##
 45 |     ## @param      url      [String] The url
 46 |     ## @param      options  [Hash] The options
 47 |     ##
 48 |     ## @option options :browser [Symbol] the browser to use instead of curl (:chrome, :firefox)
 49 |     ## @option options :source [String] source provided instead of curl
 50 |     ## @option options :headers [Hash] headers to send in the request
 51 |     ## @option options :headers_only [Boolean] whether to return just response headers
 52 |     ## @option options :compressed [Boolean] expect compressed response
 53 |     ## @option options :clean [Boolean] clean whitespace from response
 54 |     ## @option options :fallback [Symbol] browser to fall back to if curl doesn't work (:chrome, :firefox)
 55 |     ## @option options :ignore_local_links [Boolean] when collecting links, ignore local/relative links
 56 |     ## @option options :ignore_fragment_links [Boolean] when collecting links, ignore links that are just #fragments
 57 |     ## @option options :external_links_only [Boolean] only collect links outside of current site
 58 |     ##
 59 |     ## @return     [HTMLCurl] new page object
 60 |     ##
 61 |     def initialize(url, options = {})
 62 |       @browser = options[:browser] || :none
 63 |       @source = options[:source]
 64 |       @headers = options[:headers] || {}
 65 |       @headers_only = options[:headers_only]
 66 |       @compressed = options[:compressed]
 67 |       @clean = options[:clean]
 68 |       @fallback = options[:fallback]
 69 |       @ignore_local_links = options[:ignore_local_links]
 70 |       @ignore_fragment_links = options[:ignore_fragment_links]
 71 |       @external_links_only = options[:external_links_only]
 72 |       @local_links_only = options[:local_links_only]
 73 | 
 74 |       @curl = TTY::Which.which('curl')
 75 |       @url = url.nil? ? options[:url] : url
 76 |     end
 77 | 
 78 |     ##
 79 |     # Parse raw HTML source instead of curling
 80 |     #
 81 |     # @param      source  [String] The source
 82 |     #
 83 |     #
 84 |     # @return     [Hash] Hash of data after processing #
 85 |     #
 86 |     def parse(source)
 87 |       @body = source
 88 |       { url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head, body: source,
 89 |         source: source.strip, body_links: content_links, body_images: content_images }
 90 |     end
 91 | 
 92 |     ##
 93 |     ## Curl a url, either with curl or Selenium based on browser settings
 94 |     ##
    def curl
      # Choose the retrieval strategy:
      # 1. a browser was requested -> drive it via Selenium, then parse the source
      # 2. no url but a source string was supplied -> parse the source directly
      # 3. otherwise -> plain curl of @url (with optional browser fallback)
      res = if @url && @browser && @browser != :none
              source = curl_dynamic_html
              curl_html(nil, source: source, headers: @headers)
            elsif url.nil? && !source.nil?
              curl_html(nil, source: @source, headers: @headers, headers_only: @headers_only,
                             compressed: @compressed, fallback: false)
            else
              curl_html(@url, headers: @headers, headers_only: @headers_only,
                              compressed: @compressed, fallback: @fallback)
            end
      # Populate reader attributes from the result hash
      @url = res[:url]
      @code = res[:code]
      @headers = res[:headers]
      @meta = res[:meta]
      @links = res[:links]
      @head = res[:head] unless res[:head].nil?
      # NOTE(review): reencode is defined elsewhere in this class (not visible
      # here) — presumably normalizes the body's character encoding
      @body = reencode(res[:body])
      @source = res[:source]
      # Prefer OpenGraph metadata, falling back to standard tags
      @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
      @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
      @body_links = content_links
      @body_images = content_images
    end
119 | 
120 |     ##
121 |     ## Save a screenshot of the url
122 |     ##
123 |     ## @param      urls         [Array] The urls
124 |     ## @param      destination  The file destination
125 |     ## @param      browser      The browser (:firefox,
126 |     ##                          :chrome)
127 |     ## @param      type         The type of screenshot to
128 |     ##                          save (:full_page,
129 |     ##                          :print_page, :visible)
130 |     ##
    # Thin wrapper around #save_screenshot; maps the public `wait` option
    # onto save_screenshot's `wait_seconds` keyword.
    def screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0)
      # full_page = type.to_sym == :full_page
      # print_page = type.to_sym == :print_page
      save_screenshot(destination, type: type, script: script, id: id, wait_seconds: wait)
    end
136 | 
137 |     ##
138 |     ## @brief      Execute JavaScript
139 |     ##
140 |     ## @param      script  The script to run
141 |     ##
    # Delegates to #run_js: loads @url in a Selenium-driven browser,
    # optionally waits for element_id to appear, executes `script`,
    # then sleeps `wait` seconds before quitting the driver.
    def execute(script, wait, element_id)
      run_js(script, wait, element_id)
    end
145 | 
146 |     ##
147 |     ## Extract text between two regular expressions
148 |     ##
149 |     ## @param      before  [String, Regexp] The before
150 |     ## @param      after   [String, Regexp] The after
151 |     ##
152 |     ## @return     [Array] array of matches
153 |     ##
154 |     def extract(before, after, inclusive: false)
155 |       before = /#{Regexp.escape(before)}/ unless before.is_a?(Regexp)
156 |       after = /#{Regexp.escape(after)}/ unless after.is_a?(Regexp)
157 |       rx = if inclusive
158 |              /(#{before.source}.*?#{after.source})/m
159 |            else
160 |              /(?<=#{before.source})(.*?)(?=#{after.source})/m
161 |            end
162 |       @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
163 |     end
164 | 
165 |     ##
166 |     ## Extract an array of tags or tag attributes
167 |     ##
168 |     ## @param      tag        [String] The tag
169 |     ## @param      attribute  [String] The attribute
170 |     ## @param      source     [Boolean] Return full tag source
171 |     ##                        (negates attribute if true)
172 |     ## @param      content    [Boolean] Return only tag
173 |     ##                        contents
174 |     ##
175 |     ## @return     [Hash, Array] if source, return array of full
176 |     ##             tags, if content, return array of tag contents,
177 |     ##             otherwise, return a hash of tags including
178 |     ##             attributes and content
179 |     ##
180 |     ## If attribute is not given, tag contents will be returned
181 |     ##
182 |     ## @example    page.extract_tag('h1') => [Array of h1 tag
183 |     ## contents]
184 |     ## @example    page.extract_tag('img', 'src') => [Array of img
185 |     ## src attributes]
186 |     ##
    # NOTE(review): this region is corrupted in the dump — the inner line
    # numbering jumps (195 -> 221 -> 325), regex named captures have been
    # stripped, and the bodies of extract_tag/extract_tag_contents are
    # partially elided. Recover the original from the upstream gem source
    # before editing; the code below is NOT valid Ruby as-is.
    def extract_tag(tag, attribute = nil, source: false, content: false)
      res = extract_tag_contents(tag, source: true)

      return res if source

      res.map! do |tag_source|
        m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
        attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] }
        tags = tag_source.match(/<.*?>(?.*?)(?:.*?)?}) if source

      @body.scan(/<#{tag}.*?>(.*?))
      ].join(' ')
    end
327 | 
328 |     ##
329 |     ## Return all headers of given level
330 |     ##
331 |     ## @param      level  [Number] The level (1-6)
332 |     ##
333 |     ## @return [Array] array of headers with text and all tag attributes as symbols
334 |     ##
    # Return all headers (h1-h6) of the given level as hashes with :level,
    # :text, and one symbol key per tag attribute.
    #
    # NOTE(review): the scan regex here is corrupted — the named capture
    # groups (?<level>...), (?<tag>...), (?<text>...) and the literal <h
    # markup were stripped during extraction. The body references m['level'],
    # m['tag'], and m['text'], so the pattern must be restored from upstream.
    def h(level = '\d')
      res = []
      headlines = @body.to_enum(:scan, %r{#{level})(? .*?)?>(?.*?)}i).map do
        Regexp.last_match
      end
      headlines.each do |m|
        headline = { level: m['level'] }
        if m['tag'].nil?
          attrs = nil
        else
          # NOTE(review): attribute-scanning regex also lost its named
          # captures (?<attr>...), (?<quot>...), (?<content>...)
          attrs = m['tag'].to_enum(:scan, /(?\w+)=(?["'])(?.*?)\k/).map { Regexp.last_match }
          attrs.each { |a| headline[a['attr'].to_sym] = a['content'] }
        end
        headline[:text] = m['text'].remove_entities
        res << headline
      end
      res
    end
353 | 
354 |     ##
355 |     ## Convert a nokogiri element to Curl::Html format
356 |     ##
357 |     ## @param      el    [Nokogiri] element to convert
358 |     ##
359 |     def nokogiri_to_tag(el)
360 |       attributes = {}
361 |       attributes = el.attribute_nodes.each_with_object({}) do |a, hsh|
362 |         hsh[a.name] = a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value
363 |       end
364 | 
365 |       {
366 |         tag: el.name,
367 |         source: @clean ? el.to_html&.strip&.clean : el.to_html,
368 |         attrs: attributes,
369 |         content: @clean ? el.text&.strip&.clean : el.text.strip,
370 |         tags: recurse_children(el)
371 |       }
372 |     end
373 | 
374 |     def recurse_children(element)
375 |       children = []
376 |       element.children.each do |child|
377 |         next if child.name == 'text'
378 | 
379 |         children.push(nokogiri_to_tag(child))
380 |       end
381 |       children
382 |     end
383 | 
384 |     #-------------------------------------------------------
385 |     ## Perform a CSS query using Nokogiri
386 |     ##
387 |     ## @param      path  [String]  The CSS path
388 |     ##
389 |     ## @return     [Array] array of matched elements
390 |     ##
391 |     def search(path, source: @source, return_source: false)
392 |       doc = Nokogiri::HTML(source)
393 |       output = []
394 |       if return_source
395 |         output = doc.search(path).to_html
396 |       else
397 |         doc.search(path).each do |el|
398 |           out = nokogiri_to_tag(el)
399 |           output.push(out)
400 |         end
401 |       end
402 |       output
403 |     end
404 | 
405 |     private
406 | 
407 |     ##
408 |     ## Flatten the array of tags
409 |     ##
410 |     ## @param      tags  [Array] Document tags
411 |     ##
412 |     def flatten_tags(tags)
413 |       flattened = []
414 | 
415 |       tags.each do |t|
416 |         flattened << { tag: t[:tag], attrs: t[:attrs],
417 |                        content: @clean ? t[:content]&.strip&.clean : t[:content]&.strip }
418 |         flattened.concat(flatten_tags(t[:tags])) unless t[:tags].nil?
419 |       end
420 | 
421 |       flattened
422 |     end
423 | 
424 |     ##
425 |     ## Return an array of all tags in the content
426 |     ##
427 |     ## @param      content  [String] The content to parse
428 |     ##
    # Recursively parse `content` into an array of tag hashes
    # (:tag, :source, :attrs, :content, :tags).
    #
    # NOTE(review): both regexes here are corrupted — the named captures
    # (?<tag>...), (?<attrs>...), (?<content>...), (?<key>...), (?<value>...)
    # were stripped during extraction, yet the body indexes matches by those
    # names. Restore the patterns from the upstream gem before editing.
    def content_tags(content)
      return nil if content.nil?

      res = content.to_enum(:scan, %r{(?mix)
        <(?(?!\s[^>]+)?
        (?:\s*/>|>(?.*?)>)}).map { Regexp.last_match }
      res.map do |tag|
        if tag['attrs'].nil?
          attrs = nil
        else
          attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
                                             (?[@a-z0-9-]+)(?:=(?["'])
                                             (?[^"']+)\k|[ >])?/i).map { Regexp.last_match }
          # Valueless attributes map to nil; class/rel split into arrays
          attributes = attrs.each_with_object({}) do |a, hsh|
            if a['value'].nil?
              hsh[a['key']] = nil
            else
              hsh[a['key']] = a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value']
            end
          end
        end
        {
          tag: tag['tag'],
          source: tag.to_s,
          attrs: attributes,
          content: @clean ? tag['content']&.clean : tag['content'],
          tags: content_tags(tag['content'])
        }
      end
    end
459 | 
460 |     ##
461 |     ## Extract all meta tags from the document head
462 |     ##
463 |     ## @param      head [String] The head content
464 |     ##
465 |     ## @return     [Hash] hash of meta tags and values
466 |     ##
    # Extract title, refresh URL, and all meta name/property/http-equiv
    # values from the document head.
    #
    # NOTE(review): two patterns are corrupted — the title lookarounds lost
    # their <title>/</title> literals and head.scan(//) lost its <meta...>
    # pattern. Restore from upstream before editing.
    def meta_tags(head)
      meta = {}
      title = head.match(%r{(?<=)(.*?)(?=)})
      meta['title'] = title.nil? ? nil : title[1]
      refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
      url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
      # NOTE(review): this stores a MatchData (or nil), not a URL string —
      # confirm whether callers expect url[1] here
      meta['refresh_url'] = url
      meta_tags = head.scan(//)
      meta_tags.each do |tag|
        meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
        next if meta_name.nil?

        meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
        next if meta_value.nil?

        # Keys are lowercased meta names/properties
        meta[meta_name[2].downcase] = meta_value[2]
      end
      meta
    rescue StandardError => e
      # Parsing failures degrade to an empty hash rather than aborting
      warn e
      {}
    end
489 | 
490 |     ##
491 |     ## Extract all  tags from head
492 |     ##
493 |     ## @param      head  [String] The head content
494 |     ##
495 |     ## @return     [Array] Array of links
496 |     ##
    # Extract <link> tags from the head as hashes of :rel, :href, :type,
    # :title, honoring the instance's link-filtering flags.
    #
    # NOTE(review): head.scan(//) is corrupted — the <link...> pattern was
    # stripped during extraction; restore from upstream before editing.
    def link_tags(head)
      links = []
      link_tags = head.scan(//)
      link_tags.each do |tag|
        link_rel = tag.match(/rel=(['"])(.*?)\1/)
        link_rel = link_rel.nil? ? nil : link_rel[2]

        # Preload hints are never interesting as content links
        next if link_rel =~ /preload/

        link_href = tag.match(/href=(["'])(.*?)\1/)
        next if link_href.nil?

        link_href = link_href[2]

        if @local_links_only
          # Same-origin mode: keep only local links (optionally no fragments)
          next if @ignore_fragment_links && link_href =~ /^#/

          next unless same_origin?(link_href)

        else
          # Default mode: apply fragment/relative/external filters
          next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

          next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

          next if same_origin?(link_href) && @external_links_only

        end

        link_title = tag.match(/title=(['"])(.*?)\1/)
        link_title = link_title.nil? ? nil : link_title[2]

        link_type = tag.match(/type=(['"])(.*?)\1/)
        link_type = link_type.nil? ? nil : link_type[2]

        links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
      end
      links
    end
535 | 
536 |     ##
537 |     ## Get all links in the body of the page
538 |     ##
539 |     ## rel and class are returned as arrays
540 |     ##
541 |     ## @return     [Array] array of links with href, title,
542 |     ##             rel, content and class
543 |     ##
    # Collect all anchor links in the body as hashes of :href, :title,
    # :rel, :content, :class (rel and class as arrays), honoring the
    # instance's link-filtering flags.
    #
    # NOTE(review): the scan pattern is corrupted — the <a literal and the
    # named captures (?<tag>...), (?<text>...) were stripped during
    # extraction, yet the body indexes m['tag'] and m['text']. Restore the
    # pattern from upstream before editing.
    def content_links
      links = []

      link_tags = @body.to_enum(:scan, %r{.*?)>(?.*?)}).map { Regexp.last_match }
      link_tags.each do |m|
        href = m['tag'].match(/href=(["'])(.*?)\1/)
        href = href[2] unless href.nil?
        if @local_links_only
          # Same-origin mode: keep only local links (optionally no fragments)
          next if href =~ /^#/ && @ignore_fragment_links

          next unless same_origin?(href)

        else
          # Default mode: apply fragment/relative/external filters
          next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

          next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

          next if same_origin?(href) && @external_links_only

        end

        title = m['tag'].match(/title=(["'])(.*?)\1/)
        title = title[2] unless title.nil?
        rel = m['tag'].match(/rel=(["'])(.*?)\1/)
        rel = rel[2].split(/ +/) unless rel.nil?
        link_class = m['tag'].match(/class=(["'])(.*?)\1/)
        link_class = link_class[2].split(/ +/) unless link_class.nil?
        text = m['text'].remove_entities
        link = {
          href: href,
          title: title,
          rel: rel,
          content: text,
          class: link_class
        }
        links << link
      end
      links
    end
583 | 
584 |     ##
585 |     ## Get all img tags in the body of the page
586 |     ##
587 |     ## @return     [Array] array of images with src and all attributes
588 |     ##
    # Collect all img tags in the body as hashes of their attributes
    # (attribute names as symbol keys).
    #
    # NOTE(review): both scan patterns are corrupted — the <img literal and
    # the named captures (?<tag>...), (?<attr>...), (?<quot>...),
    # (?<content>...) were stripped during extraction. Restore from
    # upstream before editing.
    def content_images
      images = []
      image_tags = @body.to_enum(:scan, %r{.*?)/?>}).map { Regexp.last_match }
      image_tags.each do |m|
        attrs = m['tag'].to_enum(:scan, /(?\w+)=(?["'])(?.*?)\k/).map { Regexp.last_match }
        image = {}
        attrs.each { |a| image[a['attr'].to_sym] = a['content'] }
        images << image
      end
      images
    end
600 | 
601 |     ##
602 |     ## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
603 |     ##
604 |     ## @param      url   The url
605 |     ##
606 |     ## @return [String] page source
607 |     ##
608 |     def curl_dynamic_html
609 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
610 |       res = nil
611 | 
612 |       driver = Selenium::WebDriver.for browser
613 |       driver.manage.timeouts.implicit_wait = 4
614 |       begin
615 |         driver.get @url
616 |         res = driver.page_source
617 |       ensure
618 |         driver.quit
619 |       end
620 | 
621 |       res
622 |     end
623 | 
624 |     ##
625 |     ## Run JavaScript on a URL
626 |     ##
627 |     ## @param      script      The JavaScript to execute
628 |     ## @param      wait        Seconds to wait after executing JS
629 |     ## @param      element_id  The element identifier
630 |     ##
631 |     def run_js(script, wait_seconds = 2, element_id = nil)
632 |       raise 'No script provided' if script.nil?
633 | 
634 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
635 | 
636 |       driver = Selenium::WebDriver.for browser
637 | 
638 |       driver.manage.timeouts.implicit_wait = 15
639 |       res = nil
640 |       begin
641 |         driver.get @url
642 |         if element_id
643 |           wait = Selenium::WebDriver::Wait.new(timeout: 10) # seconds
644 |           wait.until { driver.find_element(id: element_id) }
645 |         end
646 |         res = driver.execute_script(script)
647 |         sleep wait_seconds.to_i
648 |       ensure
649 |         driver.quit
650 |       end
651 | 
652 |       warn "Executed JS on #{@url}"
653 | 
654 |       res
655 |     end
656 | 
657 |     ##
658 |     ## Save a screenshot of a url
659 |     ##
660 |     ## @param      destination  [String] File path destination
661 |     ## @param      type         [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
662 |     ##
663 |     def save_screenshot(destination = nil, type: :full_page, script: nil, wait_seconds: 0, id: nil)
664 |       raise 'No URL provided' if url.nil?
665 | 
666 |       raise 'No file destination provided' if destination.nil?
667 | 
668 |       destination = File.expand_path(destination)
669 | 
670 |       raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))
671 | 
672 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
673 |       type = type.normalize_screenshot_type if type.is_a?(String)
674 |       raise 'Can not save full screen with Chrome, use Firefox' if type == :full_page && browser == :chrome
675 | 
676 |       destination = case type
677 |                     when :print_page
678 |                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
679 |                     else
680 |                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
681 |                     end
682 | 
683 |       driver = Selenium::WebDriver.for browser
684 |       driver.manage.timeouts.implicit_wait = 4
685 |       begin
686 |         driver.get @url
687 |         if id
688 |           wait = Selenium::WebDriver::Wait.new(timeout: 10) # seconds
689 |           wait.until { driver.find_element(id: id) }
690 |         end
691 | 
692 |         res = driver.execute_script(script) if script
693 | 
694 |         sleep wait_seconds.to_i
695 | 
696 |         case type
697 |         when :print_page
698 |           driver.save_print_page(destination)
699 |         when :full_page
700 |           driver.save_full_page_screenshot(destination)
701 |         else
702 |           driver.save_screenshot(destination)
703 |         end
704 |       ensure
705 |         driver.quit
706 |       end
707 | 
708 |       warn "Screenshot saved to #{destination}"
709 |     end
710 | 
711 |     ##
712 |     ## Curls the html for the page
713 |     ##
714 |     ## @param      url           [String] The url
715 |     ## @param      headers       [Hash] The headers
716 |     ## @param      headers_only  [Boolean] Return headers only
717 |     ## @param      compressed    [Boolean] expect compressed results
718 |     ##
719 |     ## @return     [Hash] hash of url, code, headers, meta, links, head, body, and source
720 |     ##
721 |     def curl_html(url = nil, source: nil, headers: nil,
722 |                   headers_only: false, compressed: false, fallback: false)
723 |       if !url.nil?
724 |         flags = 'SsL'
725 |         flags += @headers_only ? 'I' : 'i'
726 |         agents = [
727 |           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
728 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
729 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
730 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
731 |         ]
732 |         headers = @headers.nil? ? '' : @headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
733 |         compress = @compressed ? '--compressed' : ''
734 |         @source = `#{@curl} -#{flags} #{compress} #{headers} '#{@url}' 2>/dev/null`.strip.utf8
735 |         agent = 0
736 | 
737 |         while @source.nil? || @source.empty?
738 |           @source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`.strip.utf8
739 |           break if agent >= agents.count - 1
740 |         end
741 | 
742 |         unless $?.success? || @fallback
743 |           warn "Error curling #{@url}"
744 |           Process.exit 1
745 |         end
746 | 
747 |         headers = { 'location' => @url }
748 |         lines = @source.split(/\r\n/)
749 |         code = lines[0].match(/(\d\d\d)/)[1]
750 |         lines.shift
751 |         lines.each_with_index do |line, idx|
752 |           if line =~ /^([\w-]+): (.*?)$/
753 |             m = Regexp.last_match
754 |             headers[m[1]] = m[2]
755 |           else
756 |             @source = lines[idx..].join("\n")
757 |             break
758 |           end
759 |         end
760 | 
761 |         if headers['content-encoding'] =~ /gzip/i && !compressed
762 |           warn 'Response is gzipped, you may need to try again with --compressed'
763 |         end
764 | 
765 |         if headers['content-type'] =~ /json/
766 |           return { url: @url, code: code, headers: headers, meta: nil, links: nil,
767 |                    head: nil, body: @source.strip, source: @source.strip, body_links: nil, body_images: nil }
768 |         end
769 |       else
770 |         @source = source unless source.nil?
771 |       end
772 | 
773 |       @source = curl_dynamic_html(@url, @fallback, @headers) if @fallback && (@source.nil? || @source.empty?)
774 | 
775 |       return false if @source.nil? || @source.empty?
776 | 
777 |       @source.strip!
778 | 
779 |       head = @source.match(%r{(?<=)(.*?)(?=)}mi)
780 | 
781 |       if head.nil?
782 |         { url: @url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: @source.strip,
783 |           source: @source.strip, body_links: nil, body_images: nil }
784 |       else
785 |         @body = @source.match(%r{(.*?)}mi)[1]
786 |         meta = meta_tags(head[1])
787 |         links = link_tags(head[1])
788 | 
789 |         { url: @url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: @body,
790 |           source: @source.strip, body_links: nil, body_images: nil }
791 |       end
792 |     end
793 | 
794 |     ##
795 |     ## Reencode the content (borrowed from Nokogiri)
796 |     ##
797 |     ## @param      body          [String] The body
798 |     ## @param      content_type  [String] Force content type
799 |     ##
800 |     def reencode(body, content_type = nil)
801 |       if body.encoding == Encoding::ASCII_8BIT
802 |         encoding = nil
803 | 
804 |         # look for a Byte Order Mark (BOM)
805 |         initial_bytes = body[0..2].bytes
806 |         if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
807 |           encoding = Encoding::UTF_8
808 |         elsif initial_bytes[0..1] == [0xFE, 0xFF]
809 |           encoding = Encoding::UTF_16BE
810 |         elsif initial_bytes[0..1] == [0xFF, 0xFE]
811 |           encoding = Encoding::UTF_16LE
812 |         end
813 | 
814 |         # look for a charset in a content-encoding header
815 |         encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type
816 | 
817 |         # look for a charset in a meta tag in the first 1024 bytes
818 |         unless encoding
819 |           data = body[0..1023].gsub(/|\Z)/m, '')
820 |           data.scan(//im).each do |meta|
821 |             encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
822 |           end
823 |         end
824 | 
825 |         # if all else fails, default to the official default encoding for HTML
826 |         encoding ||= Encoding::ISO_8859_1
827 | 
828 |         # change the encoding to match the detected or inferred encoding
829 |         body = body.dup
830 |         begin
831 |           body.force_encoding(encoding)
832 |         rescue ArgumentError
833 |           body.force_encoding(Encoding::ISO_8859_1)
834 |         end
835 |       end
836 | 
837 |       body.encode(Encoding::UTF_8)
838 |     end
839 | 
840 |     ##
841 |     ## Test if a given url has the same hostname as @url
842 |     ##
843 |     ## @param      href  [String] The url to test
844 |     ##
845 |     ## @return     [Boolean] true if hostnames match
846 |     ##
847 |     def same_origin?(href)
848 |       uri = URI(href)
849 |       origin = URI(@url)
850 |       uri.host == origin.host
851 |     rescue StandardError
852 |       false
853 |     end
854 |   end
855 | end
856 | 


--------------------------------------------------------------------------------
/lib/curly/curl/json.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Curl
  4 |   # Class for CURLing a JSON response
  5 |   class Json
  6 |     attr_accessor :url
  7 | 
  8 |     attr_writer :compressed, :request_headers, :symbolize_names
  9 | 
 10 |     attr_reader :code, :json, :headers
 11 | 
 12 |     def to_data
 13 |       {
 14 |         url: @url,
 15 |         code: @code,
 16 |         json: @json,
 17 |         headers: @headers
 18 |       }
 19 |     end
 20 | 
 21 |     ##
 22 |     ## Create a new Curl::Json page object
 23 |     ##
 24 |     ## @param      url         [String] The url to curl
 25 |     ## @param      headers     [Hash] The headers to send
 26 |     ## @param      compressed  [Boolean] Expect compressed results
 27 |     ##
 28 |     ## @return     [Curl::Json] Curl::Json object with url, code, parsed json, and response headers
 29 |     ##
 30 |     def initialize(url, options = {})
 31 |       @url = url
 32 |       @request_headers = options[:headers]
 33 |       @compressed = options[:compressed]
 34 |       @symbolize_names = options[:symbolize_names]
 35 | 
 36 |       @curl = TTY::Which.which('curl')
 37 |     end
 38 | 
 39 |     def curl
 40 |       page = curl_json
 41 | 
 42 |       raise "Error retrieving #{url}" if page.nil? || page.empty?
 43 | 
 44 |       @url = page[:url]
 45 |       @code = page[:code]
 46 |       @json = page[:json]
 47 |       @headers = page[:headers]
 48 |     end
 49 | 
 50 |     def path(path, json = @json)
 51 |       parts = path.split(/./)
 52 |       target = json
 53 |       parts.each do |part|
 54 |         if part =~ /(?[^\[]+)\[(?\d+)\]/
 55 |           target = target[key][int.to_i]
 56 |         else
 57 |           target = target[part]
 58 |         end
 59 |       end
 60 | 
 61 |       target
 62 |     end
 63 | 
 64 |     private
 65 | 
 66 |     ##
 67 |     ## Curl the JSON contents
 68 |     ##
 69 |     ## @param      url         [String] The url
 70 |     ## @param      headers     [Hash] The headers to send
 71 |     ## @param      compressed  [Boolean] Expect compressed results
 72 |     ##
 73 |     ## @return     [Hash] hash of url, code, headers, and parsed json
 74 |     ##
 75 |     def curl_json
 76 |       flags = 'SsLi'
 77 |       agents = [
 78 |         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
 79 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
 80 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
 81 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
 82 |       ]
 83 | 
 84 |       headers = @headers.nil? ? '' : @headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
 85 |       compress = @compressed ? '--compressed' : ''
 86 |       source = `#{@curl} -#{flags} #{compress} #{headers} '#{@url}' 2>/dev/null`
 87 |       agent = 0
 88 |       while source.nil? || source.empty?
 89 |         source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`
 90 |         break if agent >= agents.count - 1
 91 |       end
 92 | 
 93 |       return false if source.nil? || source.empty?
 94 | 
 95 |       source.strip!
 96 | 
 97 |       headers = {}
 98 |       lines = source.split(/\r\n/)
 99 |       code = lines[0].match(/(\d\d\d)/)[1]
100 |       lines.shift
101 |       lines.each_with_index do |line, idx|
102 |         if line =~ /^([\w-]+): (.*?)$/
103 |           m = Regexp.last_match
104 |           headers[m[1]] = m[2]
105 |         else
106 |           source = lines[idx..].join("\n")
107 |           break
108 |         end
109 |       end
110 | 
111 |       json = source.strip.force_encoding('utf-8')
112 |       begin
113 |         json.gsub!(/[\u{1F600}-\u{1F6FF}]/, '')
114 |         { url: @url, code: code, headers: headers, json: JSON.parse(json, symbolize_names: @symbolize_names) }
115 |       rescue StandardError
116 |         { url: @url, code: code, headers: headers, json: nil }
117 |       end
118 |     end
119 |   end
120 | end
121 | 


--------------------------------------------------------------------------------
/lib/curly/hash.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | # Hash helpers
  4 | class ::Hash
  ## Convert a Curly object to data hash
  ##
  ## @return     [Hash] a hash with keys renamed and
  ##             cleaned up; hashes without :body_links are
  ##             returned untouched
  ##
  ## @param      url    [String] A url to fall back to
  ## @param      clean  [Boolean] Clean extra spaces and newlines in sources
  ##
  def to_data(url: nil, clean: false)
    return self unless key?(:body_links)

    # Optionally strip and compress whitespace in the source strings
    scrub = ->(str) { clean ? str&.strip&.clean : str }

    {
      url: self[:url] || url,
      code: self[:code],
      headers: self[:headers],
      meta: self[:meta],
      meta_links: self[:links],
      head: scrub.call(self[:head]),
      body: scrub.call(self[:body]),
      source: scrub.call(self[:source]),
      title: self[:title],
      description: self[:description],
      links: self[:body_links],
      images: self[:body_images]
    }
  end
 33 | 
 34 |   ##
 35 |   ## Return the raw HTML of the object
 36 |   ##
 37 |   ## @return    [String] Html representation of the object.
 38 |   ##
 39 |   def to_html
 40 |     if key?(:source)
 41 |       self[:source]
 42 |     end
 43 |   end
 44 | 
 45 |   ##
 46 |   ## Get a value from the hash using a dot-syntax query
 47 |   ##
 48 |   ## @param      query  [String] The query (dot notation)
 49 |   ##
 50 |   ## @return     [Object] result of querying the hash
 51 |   ##
 52 |   def get_value(query)
 53 |     return nil if self.empty?
 54 |     stringify_keys!
 55 | 
 56 |     query.split('.').inject(self) do |v, k|
 57 |       return v.map { |el| el.get_value(k) } if v.is_a? Array
 58 |       # k = k.to_i if v.is_a? Array
 59 |       next v unless v.key?(k)
 60 | 
 61 |       v.fetch(k)
 62 |     end
 63 |   end
 64 | 
  # Extract data using a dot-syntax path
  #
  # NOTE(review): parts of this method's body appear to have been mangled
  # during extraction — the split pattern and the bracket-parsing regexes
  # below have lost their angle-bracket contents, and several lines
  # between the split and the parsing loop are missing. Code is left
  # byte-identical; restore from the original source before relying on it.
  #
  # @param      path  [String] The path
  #
  # @return     [Object] Result of path query
  #
  def dot_query(path, root = nil, full_tag: true)
    # Work on a string-keyed copy; optionally descend into a root key first
    res = stringify_keys
    res = res[root] unless root.nil?

    # No bracket expressions — plain dot traversal
    unless path =~ /\[/
      return res.get_value(path)
    end

    # Protect dots inside bracket expressions from the dot splitter
    # by temporarily replacing them with '%'
    path.gsub!(/\[(.*?)\]/) do
      inter = Regexp.last_match(1).gsub(/\./, '%')
      "[#{inter}]"
    end

    out = []
    q = path.split(/(?]=? *\w+)?/
        m = pth.match(/\[(?[,+&])? *(?[\w.]+)( *(?[\^*$=<>]{1,2}) *(?[^,&\]]+))? */)

        # comp is [key, operator, value]; ',' starts a new OR group,
        # anything else extends the current AND group
        comp = [m['key'], m['op'], m['val']]
        case m['com']
        when ','
          ats.push(comp)
          at = []
        else
          at.push(comp)
        end

        pth.sub!(/\[(?[,&+])? *(?[\w.]+)( *(?[\^*$=<>]{1,2}) *(?[^,&\]]+))?/, '[')
      end
      ats.push(at) unless at.empty?
      pth.sub!(/\[\]/, '')

      # Unwrap single-element arrays before applying the key
      res = res[0] if res.is_a?(Array) && res.count == 1
      if ats.empty? && el.nil? && res.is_a?(Array) && res[0]&.key?(pth)
        res.map! { |r| r[pth] }
        next
      end

      res.map!(&:stringify_keys) if res.is_a?(Array) && res[0].is_a?(Hash)
      # if res.is_a?(String) || (res.is_a?(Array) && res[0].is_a?(String))
      #   out.push(res)
      #   next
      # end

      # if res.is_a?(Array) && !pth.nil?
      #   return res.delete_if { |r| !r.key?(pth) }
      # else
      #   return false if el.nil? && ats.empty? && res.is_a?(Hash) && (res.nil? || !res.key?(pth))
      # end
      tag = res
      res = res[pth] unless pth.nil? || pth.empty?

      pth = ''

      return false if res.nil?

      # Apply each comparison group; matching elements are collected,
      # either as the full tag or just the matching value
      if ats.count.positive?
        while ats.count.positive?
          atr = ats.shift
          res = [res] if res.is_a?(Hash)
          res.each do |r|
            out.push(full_tag ? tag : r) if evaluate_comp(r, atr)
          end
        end
      else
        out = res
      end

      out = out.get_value(pth) unless pth.nil?

      # Normalize hash results to string keys; a numeric/range element
      # selector indexes into the array (el comes from the loop above)
      if el.nil? && out.is_a?(Array) && out[0].is_a?(Hash)
        out.map! { |o|
          o.stringify_keys
          # o.key?(pth) ? o[pth] : o
        }
      elsif out.is_a?(Array) && el =~ /^[\d.,]+$/
        out = out[eval(el)]
      end
      res = out
    end

    # Single matches are returned bare rather than wrapped in an array
    out = out[0] if out&.count == 1
    out
  end
171 | 
172 |   ##
173 |   ## Test if values in an array match an operator
174 |   ##
175 |   ## @param      array [Array] The array
176 |   ## @param      key   [String] The key
177 |   ## @param      comp  [String] The comparison, e.g. *= or $=
178 |   ##
179 |   ## @return [Boolean] true if array contains match
180 |   def array_match(array, key, comp)
181 |     keep = false
182 |     array.each do |el|
183 |       keep = case comp
184 |              when /^\^/
185 |                key =~ /^#{el}/i ? true : false
186 |              when /^\$/
187 |                key =~ /#{el}$/i ? true : false
188 |              when /^\*/
189 |                key =~ /#{el}/i ? true : false
190 |              else
191 |                key =~ /^#{el}$/i ? true : false
192 |              end
193 |       break if keep
194 |     end
195 |     keep
196 |   end
197 | 
  ##
  ## Evaluate a comparison
  ##
  ## @param      r     [Hash] hash of source elements and
  ##                   comparison operators
  ## @param      atr   [Array] Array of arrays containing [attribute,comparator,value]
  ##
  ## @return     [Boolean] whether the comparison passes or fails
  ##
  def evaluate_comp(r, atr)
    # Every comparison in atr must pass (AND); fail fast on the first miss
    keep = true

    r = r.symbolize_keys

    atr.each do |a|
      key = a[0].to_sym
      # Coerce the comparison value to Integer/Float when it looks numeric
      val = if a[2] =~ /^\d+$/
              a[2].to_i
            elsif a[2] =~ /^\d+\.\d+$/
              a[2].to_f
            else
              a[2]
            end
      # A dot in the key means a nested lookup
      r = r.get_value(key.to_s) if key.to_s =~ /\./

      # A comparison with no value is a presence test (non-nil, non-empty)
      if val.nil?
        if r.is_a?(Hash)
          return r.key?(key) && !r[key].nil? && !r[key].empty?
        elsif r.is_a?(String)
          return r.nil? ? false : true
        elsif r.is_a?(Array)
          return r.empty? ? false : true
        end
      end

      if r.nil?
        keep = false
      elsif r.is_a?(Array)
        # Array target: keep if any element (or nested array) matches
        valid = r.filter do |k|
          if k.is_a? Array
            array_match(k, a[2], a[1])
          else
            # ^= starts-with, $= ends-with, *= contains, otherwise exact
            case a[1]
            when /^\^/
              k =~ /^#{a[2]}/i ? true : false
            when /^\$/
              k =~ /#{a[2]}$/i ? true : false
            when /^\*/
              k =~ /#{a[2]}/i ? true : false
            else
              k =~ /^#{a[2]}$/i ? true : false
            end
          end
        end

        keep = valid.count.positive?
      elsif val.is_a?(Numeric) && a[1] =~ /^[<>=]{1,2}$/
        # Numeric comparison: build and eval an expression like "5>=3"
        # (operands are locally coerced integers, not user input)
        k = r.to_i
        comp = a[1] =~ /^=$/ ? '==' : a[1]
        keep = eval("#{k}#{comp}#{val}")
      else
        # String comparison against a hash value or the target itself
        v = r.is_a?(Hash) ? r[key] : r
        if v.is_a? Array
          keep = array_match(v, a[2], a[1])
        else
          keep = case a[1]
                 when /^\^/
                   v =~ /^#{a[2]}/i ? true : false
                 when /^\$/
                   v =~ /#{a[2]}$/i ? true : false
                 when /^\*/
                   v =~ /#{a[2]}/i ? true : false
                 else
                   v =~ /^#{a[2]}$/i ? true : false
                 end
        end
      end

      return false unless keep
    end

    keep
  end
281 | 
282 |   ##
283 |   ## Test if a tag contains an attribute matching filter queries
284 |   ##
285 |   ## @param      tag_name    [String] The tag name
286 |   ## @param      classes     [String] The classes to match
287 |   ## @param      id          [String] The id attribute to
288 |   ##                         match
289 |   ## @param      attribute   [String] The attribute
290 |   ## @param      operator    [String] The operator, <>= *=
291 |   ##                         $= ^=
292 |   ## @param      value       [String] The value to match
293 |   ## @param      descendant  [Boolean] Check descendant tags
294 |   ##
295 |   def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
296 |     tag = self
297 |     keep = true
298 | 
299 |     keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
300 | 
301 |     if tag.key?('attrs') && tag['attrs']
302 |       if keep && id
303 |         tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
304 |         keep = tag_id && tag_id =~ /#{id}/i
305 |       end
306 | 
307 |       if keep && classes
308 |         cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
309 |         if cls
310 |           all = true
311 |           classes.each { |c| all = cls['value'].include?(c) }
312 |           keep = all
313 |         else
314 |           keep = false
315 |         end
316 |       end
317 | 
318 |       if keep && attribute
319 |         attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
320 |         any = false
321 |         attributes.each do |a|
322 |           break if any
323 | 
324 |           any = case operator
325 |                 when /^*/
326 |                   a['value'] =~ /#{value}/i
327 |                 when /^\^/
328 |                   a['value'] =~ /^#{value}/i
329 |                 when /^\$/
330 |                   a['value'] =~ /#{value}$/i
331 |                 else
332 |                   a['value'] =~ /^#{value}$/i
333 |                 end
334 |         end
335 |         keep = any
336 |       end
337 |     end
338 | 
339 |     return false if descendant && !keep
340 | 
341 |     if !descendant && tag.key?('tags')
342 |       tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
343 |       tags.count.positive?
344 |     else
345 |       keep
346 |     end
347 |   end
348 | 
349 |   # Turn all keys into symbols
350 |   #
351 |   # If the hash has both a string and a symbol for key,
352 |   # keep the symbol value, discarding the string value
353 |   #
354 |   # @return     [Hash] a copy of the hash where all its
355 |   #             keys are strings
356 |   #
357 |   def symbolize_keys
358 |     each_with_object({}) do |(k, v), hsh|
359 |       next if k.is_a?(String) && key?(k.to_sym)
360 | 
361 |       hsh[k.to_sym] = v.is_a?(Hash) ? v.symbolize_keys : v
362 |     end
363 |   end
364 | 
365 |   # Turn all keys into strings
366 |   #
367 |   # If the hash has both a string and a symbol for key,
368 |   # keep the string value, discarding the symbol value
369 |   #
370 |   # @return     [Hash] a copy of the hash where all its
371 |   #             keys are strings
372 |   #
373 |   def stringify_keys
374 |     each_with_object({}) do |(k, v), hsh|
375 |       next if k.is_a?(Symbol) && key?(k.to_s)
376 | 
377 |       hsh[k.to_s] = v.is_a?(Hash) ? v.stringify_keys : v
378 |     end
379 |   end
380 | 
381 |   ##
382 |   ## Destructive version of #stringify_keys
383 |   ##
384 |   ## @see        #stringify_keys
385 |   ##
386 |   def stringify_keys!
387 |     replace stringify_keys
388 |   end
389 | 
390 |   ##
391 |   ## Clean up empty arrays and return an array with one or
392 |   ## more elements
393 |   ##
394 |   ## @return     [Array] output array
395 |   ##
396 |   def clean_output
397 |     output = ensure_array
398 |     output.clean_output
399 |   end
400 | 
401 |   ##
402 |   ## Ensure that an object is an array
403 |   ##
404 |   ## @return     [Array] object as Array
405 |   ##
406 |   def ensure_array
407 |     return [self]
408 |   end
409 | end
410 | 


--------------------------------------------------------------------------------
/lib/curly/numeric.rb:
--------------------------------------------------------------------------------
 1 | # Numeric helpers
 2 | class ::Numeric
 3 |   ##
 4 |   ## Return an array version of self
 5 |   ##
 6 |   ## @return     [Array] self enclosed in an array
 7 |   ##
 8 |   def ensure_array
 9 |     [self]
10 |   end
11 | end
12 | 


--------------------------------------------------------------------------------
/lib/curly/string.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | ##
  4 | ## Remove extra spaces and newlines from a string
  5 | ##
  6 | ## @return     [String] cleaned string
  7 | ##
  8 | class ::String
  9 |    ##
 10 |   ## Discard invalid characters and output a UTF-8 String
 11 |   ##
 12 |   ## @return     [String] UTF-8 encoded string
 13 |   ##
 14 |   def utf8
 15 |     encode('utf-16', invalid: :replace).encode('utf-8')
 16 |   end
 17 | 
 18 |   ##
 19 |   ## Destructive version of #utf8
 20 |   ##
 21 |   ## @return     [String] UTF-8 encoded string, in place
 22 |   ##
 23 |   def utf8!
 24 |     replace utf8
 25 |   end
 26 | 
 27 |   ## Remove extra spaces and newlines, compress space
 28 |   ## between tags
 29 |   ##
 30 |   ## @return     [String] cleaned string
 31 |   ##
 32 |   def clean
 33 |     gsub(/[\t\n ]+/m, ' ').gsub(/> +<')
 34 |   end
 35 | 
 36 |   ##
 37 |   ## Remove HTML tags from a string
 38 |   ##
 39 |   ## @return     [String] stripped string
 40 |   ##
 41 |   def strip_tags
 42 |     gsub(%r{}, '')
 43 |   end
 44 | 
 45 |   ##
 46 |   ## Destructive version of #clean
 47 |   ##
 48 |   ## @see #clean
 49 |   ##
 50 |   def clean!
 51 |     replace clean
 52 |   end
 53 | 
 54 |   ##
 55 |   ## Destructive version of #strip_tags
 56 |   ##
 57 |   ## @see #strip_tags
 58 |   ##
 59 |   def strip_tags!
 60 |     replace strip_tags
 61 |   end
 62 | 
 63 |   ##
 64 |   ## Convert an image type string to a symbol
 65 |   ##
 66 |   ## @return     [Symbol] :srcset, :img, :opengraph, :all
 67 |   ##
 68 |   def normalize_image_type(default = :all)
 69 |     case self.to_s
 70 |     when /^[sp]/i
 71 |       :srcset
 72 |     when /^i/i
 73 |       :img
 74 |     when /^o/i
 75 |       :opengraph
 76 |     else
 77 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_image_type
 78 |     end
 79 |   end
 80 | 
 81 |   ##
 82 |   ## Convert a browser type string to a symbol
 83 |   ##
 84 |   ## @return     [Symbol] :chrome, :firefox
 85 |   ##
 86 |   def normalize_browser_type(default = :none)
 87 |     case self.to_s
 88 |     when /^c/i
 89 |       :chrome
 90 |     when /^f/i
 91 |       :firefox
 92 |     else
 93 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_browser_type
 94 |     end
 95 |   end
 96 | 
 97 |   ##
 98 |   ## Convert a screenshot type string to a symbol
 99 |   ##
100 |   ## @return     [Symbol] :full_page, :print_page, :visible
101 |   ##
102 |   def normalize_screenshot_type(default = :none)
103 |     case self.to_s
104 |     when /^f/i
105 |       :full_page
106 |     when /^p/i
107 |       :print_page
108 |     when /^v/i
109 |       :visible
110 |     else
111 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_browser_type
112 |     end
113 |   end
114 | 
115 |   ##
116 |   ## Clean up output and return a single-item array
117 |   ##
118 |   ## @return     [Array] output array
119 |   ##
120 |   def clean_output
121 |     output = ensure_array
122 |     output.clean_output
123 |   end
124 | 
125 |   ##
126 |   ## Ensure that an object is an array
127 |   ##
128 |   ## @return     [Array] object as Array
129 |   ##
130 |   def ensure_array
131 |     return [self]
132 |   end
133 | end
134 | 


--------------------------------------------------------------------------------
/lib/curly/version.rb:
--------------------------------------------------------------------------------
# Top level module for CurlyQ
module Curly
  # Current version number (frozen: mutable string constants are an
  # accidental-mutation hazard and this file has no frozen_string_literal)
  VERSION = '0.0.16'.freeze
end
6 | 


--------------------------------------------------------------------------------
/src/_README.md:
--------------------------------------------------------------------------------
  1 | # CurlyQ
  2 | 
[![Gem](https://img.shields.io/gem/v/curlyq.svg)](https://rubygems.org/gems/curlyq)
  4 | [![GitHub license](https://img.shields.io/github/license/ttscoff/curlyq.svg)](./LICENSE.txt)
  5 | 
  6 | **A command line helper for curl and web scraping**
  7 | 
  8 | _If you find this useful, feel free to [buy me some coffee][donate]._
  9 | 
 10 | [donate]: https://brettterpstra.com/donate
 11 | 
 12 | 
 13 | [jq]: https://github.com/jqlang/jq "Command-line JSON processor"
 14 | [yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor"
 15 | 
The current version of `curlyq` is 0.0.16.
 17 | 
 18 | CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output.
 19 | 
 20 | [github]: https://github.com/ttscoff/curlyq/
 21 | 
 22 | ### Installation
 23 | 
 24 | Assuming you have Ruby and RubyGems installed, you can just run `gem install curlyq`. If you run into errors, try `gem install --user-install curlyq`, or use `sudo gem install curlyq`.
 25 | 
 26 | If you're using Homebrew, you have the option to install via [brew-gem](https://github.com/sportngin/brew-gem):
 27 | 
 28 |     brew install brew-gem
 29 |     brew gem install curlyq
 30 | 
 31 | If you don't have Ruby/RubyGems, you can install them pretty easily with [Homebrew], [rvm], or [asdf].
 32 | 
 33 | [Homebrew]: https://brew.sh/ "Homebrew—The Missing Package Manager for macOS (or Linux)"
 34 | [rvm]: https://rvm.io/ "Ruby Version Manager (RVM)"
 35 | [asdf]: https://github.com/asdf-vm/asdf "asdf-vm/asdf:Extendable version manager with support for ..."
 36 | 
 37 | ### Usage
 38 | 
 39 | Run `curlyq help` for a list of subcommands. Run `curlyq help SUBCOMMAND` for details on a particular subcommand and its options.
 40 | 
 41 | ```
 42 | @cli(bundle exec bin/curlyq help)
 43 | ```
 44 | 
 45 | ### Query and Search syntax
 46 | 
 47 | You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands.
 48 | 
 49 | A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `
` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendents. You can also use XPaths, but I hate those so I'm not going to document them. 50 | 51 | > I've tried to make the query function useful, but if you want to do any kind of advanced shaping, you're better off piping the JSON output to [jq] or [yq]. 52 | 53 | 54 | Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `images[rel=me]'` to target only images with a `rel` attribute of `me`. 55 | 56 | The comparisons for the query flag are: 57 | 58 | - `<` less than 59 | - `>` greater than 60 | - `<=` less than or equal to 61 | - `>=` greater than or equal to 62 | - `=` or `==` is equal to 63 | - `*=` contains text 64 | - `^=` starts with text 65 | - `$=` ends with text 66 | 67 | Comparisons can be numeric or string comparisons. A numeric comparison like `curlyq images -q '[width>500]' URL` would return all of the images on the page with a width attribute greater than 500. 68 | 69 | You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison. 70 | 71 | If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string. 
If there are multiple matches, output will be an array: 72 | 73 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/' 74 | 75 |

What’s Next

76 | 77 | #### Commands 78 | 79 | curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible. 80 | 81 | ##### extract 82 | 83 | Example: 84 | 85 | curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 86 | 87 | [ 88 | "Adding time.sleep(10) in various places in case the page had not fully loaded when I was accessing the source." 89 | ] 90 | 91 | This specifies a before and after string and includes them (`-i`) in the result. 92 | 93 | ``` 94 | @cli(bundle exec bin/curlyq help extract) 95 | ``` 96 | 97 | 98 | ##### execute 99 | 100 | You can execute JavaScript on a given web page using the `execute` subcommand. 101 | 102 | Example: 103 | 104 | curlyq execute -s "NiftyAPI.find('file/save').arrow().shoot('file-save')" file:///Users/ttscoff/Desktop/Code/niftymenu/dist/MultiMarkdown-Composer.html 105 | 106 | You can specify an element id to wait for using `--id`, and define a pause to wait after executing a script with `--wait` (defaults to 2 seconds). Scripts can be read from the command line arguments with `--script "SCRIPT"`, from STDIN with `--script -`, or from a file using `--script PATH`. 107 | 108 | If you expect a return value, be sure to include a `return` statement in your executed script. Results will be output to STDOUT. 
109 | 110 | ``` 111 | @cli(bundle exec bin/curlyq help execute) 112 | ``` 113 | 114 | ##### headlinks 115 | 116 | Example: 117 | 118 | curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com 119 | 120 | { 121 | "rel": "stylesheet", 122 | "href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css", 123 | "type": "text/css", 124 | "title": null 125 | } 126 | 127 | This pulls all `` from the `` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`. 128 | 129 | ``` 130 | @cli(bundle exec bin/curlyq help headlinks) 131 | ``` 132 | 133 | ##### html 134 | 135 | The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of: 136 | 137 | - URL, after any redirects 138 | - Response code 139 | - Response headers as a keyed hash 140 | - Meta elements for the page as a keyed hash 141 | - All meta links in the head as an array of objects containing (as available): 142 | - rel 143 | - href 144 | - type 145 | - title 146 | - source of `` 147 | - source of `` 148 | - the page title (determined first by og:title, then by a title tag) 149 | - description (using og:description first) 150 | - All links on the page as an array of objects with: 151 | - href 152 | - title 153 | - rel 154 | - text content 155 | - classes as array 156 | - All images on the page as an array of objects containing: 157 | - class 158 | - all attributes as key/value pairs 159 | - width and height (if specified) 160 | - src 161 | - alt and title 162 | 163 | You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`. 
164 | 165 | Example: 166 | 167 | curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com' 168 | 169 | [ 170 | { 171 | "class": "aligncenter", 172 | "original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg", 173 | "at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg", 174 | "width": "800", 175 | "height": "226", 176 | "src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg", 177 | "alt": "Giveaway Robot with Keyboard Maestro icon", 178 | "title": "Giveaway Robot with Keyboard Maestro icon" 179 | } 180 | ] 181 | 182 | The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results. 183 | 184 | curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 185 | 186 | Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com 187 | 188 | The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`). 189 | 190 | ``` 191 | @cli(bundle exec bin/curlyq help html) 192 | ``` 193 | 194 | ##### images 195 | 196 | The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above). 197 | 198 | The base command will return all images on the page, including OpenGraph images from the head, `` tags from the body, and `` tags along with their child images. 
199 | 200 | OpenGraph images will be returned with the structure: 201 | 202 | { 203 | "type": "opengraph", 204 | "attrs": null, 205 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg" 206 | } 207 | 208 | `img` tags will be returned with the structure: 209 | 210 | { 211 | "type": "img", 212 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg", 213 | "width": "800", 214 | "height": "226", 215 | "alt": "Banner image for CurlyQ", 216 | "title": "CurlyQ, curl better", 217 | "attrs": [ 218 | { 219 | "class": [ 220 | "aligncenter" 221 | ], // all attributes included 222 | } 223 | ] 224 | } 225 | 226 | 227 | 228 | `srcset` images will be returned with the structure: 229 | 230 | { 231 | "type": "srcset", 232 | "attrs": [ 233 | { 234 | "key": "srcset", 235 | "value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x" 236 | } 237 | ], 238 | "images": [ 239 | { 240 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg", 241 | "media": "1x" 242 | }, 243 | { 244 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg", 245 | "media": "2x" 246 | } 247 | ] 248 | } 249 | } 250 | 251 | Example: 252 | 253 | curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com 254 | 255 | This will return an array of images that are `` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`. 256 | 257 | curlyq images -q '[width>750]' https://brettterpstra.com 258 | 259 | This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source. 
260 | 261 | ``` 262 | @cli(bundle exec bin/curlyq help images) 263 | ``` 264 | 265 | ##### json 266 | 267 | The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code. 268 | 269 | ``` 270 | @cli(bundle exec bin/curlyq help json) 271 | ``` 272 | 273 | ##### links 274 | 275 | Returns all the links on the page, which can be queried on any attribute. 276 | 277 | Example: 278 | 279 | curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 280 | 281 | [ 282 | { 283 | "href": "https://twitter.com/stackoverflow", 284 | "title": null, 285 | "rel": null, 286 | "content": "Twitter", 287 | "class": [ 288 | "-link", 289 | "js-gps-track" 290 | ] 291 | } 292 | ] 293 | 294 | This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`). 295 | 296 | ``` 297 | @cli(bundle exec bin/curlyq help links) 298 | ``` 299 | 300 | ##### scrape 301 | 302 | Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system. 303 | 304 | Example: 305 | 306 | curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 307 | 308 | { 309 | "href": "https://nojack.easydns.ca/@ttscoff", 310 | "title": null, 311 | "rel": [ 312 | "me" 313 | ], 314 | "content": "Mastodon", 315 | "class": [ 316 | "u-url" 317 | ] 318 | } 319 | 320 | This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'. 
321 | 322 | ``` 323 | @cli(bundle exec bin/curlyq help scrape) 324 | ``` 325 | 326 | ##### screenshot 327 | 328 | Full-page screenshots require Firefox, installed and specified with `--browser firefox`. 329 | 330 | Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'. 331 | 332 | The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name. 333 | 334 | Example: 335 | 336 | curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 337 | 338 | Screenshot saved to /Users/ttscoff/Desktop/test.png 339 | 340 | You can wait for an element ID to be visible using `--id`. This can be any `#ID` on the page. If the ID doesn't exist on the page, though, the screenshot will hang for a timeout of 10 seconds. 341 | 342 | You can execute a script before taking the screenshot with the `--script` flag. If this is set to `-`, it will read the script from STDIN. If it's set to an existing file path, that file will be read for script input. Specify an interval (in seconds) to wait after executing the script with `--wait`. 343 | 344 | ``` 345 | @cli(bundle exec bin/curlyq help screenshot) 346 | ``` 347 | 348 | ##### tags 349 | 350 | Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag. 351 | 352 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 353 | 354 | [ 355 | { 356 | "tag": "h3", 357 | "source": "

What’s Next

", 358 | "attrs": [ 359 | { 360 | "id": "whats-next" 361 | } 362 | ], 363 | "content": "What’s Next", 364 | "tags": [ 365 | 366 | ] 367 | } 368 | ] 369 | 370 | The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'. 371 | 372 | ``` 373 | @cli(bundle exec bin/curlyq help tags) 374 | ``` 375 | 376 | 377 | PayPal link: [paypal.me/ttscoff](https://paypal.me/ttscoff) 378 | 379 | ## Changelog 380 | 381 | See [CHANGELOG.md](https://github.com/ttscoff/curlyq/blob/main/CHANGELOG.md) 382 | 383 | -------------------------------------------------------------------------------- /test/curlyq_extract_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQExtractTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_extract_inclusive 17 | result = curlyq('extract', '-i', '-b', 'Adding', '-a', 'accessing the source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 18 | json = JSON.parse(result) 19 | 20 | assert_match(/^Adding time.sleep\(10\)<\/code>.*?accessing the source.$/, json[0], 'Match should be found and include the before and after strings') 21 | end 22 | 23 | def test_extract_exclusive 24 | result = curlyq('extract', '-b', 'Adding', '-a', 'accessing the source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 25 | json = JSON.parse(result) 26 | 27 | assert_match(/^ time.sleep\(10\)<\/code>.*?when I was $/, json[0], 'Match should be found and not include the before and after strings') 28 | end 29 | 30 | def test_extract_regex_inclusive 31 | result = curlyq('extract', '-ri', '-b', '.dding <', '-a', 
'accessing.*?source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 32 | json = JSON.parse(result) 33 | 34 | assert_match(/^Adding time.sleep\(10\)<\/code>.*?accessing the source.$/, json[0], 'Match should be found and include the before and after strings') 35 | end 36 | 37 | def test_extract_regex_exclusive 38 | result = curlyq('extract', '-r', '-b', '.dding <', '-a', 'accessing.*?source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 39 | json = JSON.parse(result) 40 | 41 | assert_match(/^code>time.sleep\(10\)<\/code>.*?when I was $/, json[0], 'Match should be found and not include the before and after strings') 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/curlyq_headlinks_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQHeadlinksTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_headlinks_query 17 | result = curlyq('headlinks', '-q', '[rel=stylesheet]', 'https://brettterpstra.com') 18 | json = JSON.parse(result) 19 | 20 | assert_equal(Array, json.class, 'Result should be an array') 21 | assert_match(/stylesheet/, json[0]['rel'], 'Should have retrieved a single result with rel stylesheet') 22 | assert_match(/screen\.\d+\.css$/, json[0]['href'], 'Stylesheet should be correct primary stylesheet') 23 | end 24 | 25 | def test_headlinks 26 | result = curlyq('headlinks', 'https://brettterpstra.com') 27 | json = JSON.parse(result) 28 | 29 | assert_equal(Array, json.class, 'Should have an array of results') 30 | assert(json.count > 1, 'Should have more than one link') 31 | # 
assert(json[0].count.positive?) 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/curlyq_html_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQHtmlTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_html_search_query 14 | result = curlyq('html', '-s', '#main article .aligncenter', '-q', 'images[0]', 'https://brettterpstra.com/2024/10/19/web-excursions-for-october-19-2024/') 15 | json = JSON.parse(result) 16 | 17 | assert_match(/aligncenter/, json[0]['class'], 'Should have found an image with class "aligncenter"') 18 | end 19 | 20 | def test_html_query 21 | result = curlyq('html', '-q', 'meta.title', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 22 | json = JSON.parse(result) 23 | assert_match(/Introducing CurlyQ/, json[0], 'Should have retrived the page title') 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/curlyq_images_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQImagesTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_images_query 14 | result = curlyq('images', '-t', 'img', '-q', '[alt$=screenshot]', 'https://brettterpstra.com/2024/01/08/keyboard-maestro-giveaway/') 15 | json = JSON.parse(result) 16 | 17 | assert(json.count == 1, 'Should have found 1 image') 18 | assert_match(/Keyboard Maestro screenshot/, json[0]['alt'], 'Should match Keyboard Meastro screenshot') 19 | end 20 | 21 | def 
test_images_type 22 | result = curlyq('images', '-t', 'srcset', 'https://brettterpstra.com/') 23 | json = JSON.parse(result) 24 | 25 | assert(json.count.positive?, 'Should have found at least 1 image') 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /test/curlyq_json_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQJsonTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_json 17 | result = curlyq('json', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 18 | json = JSON.parse(result)[0] 19 | 20 | assert_equal(json.class, Hash, 'Single result should be a hash') 21 | assert_equal(286, json['json']['total'], 'json.total should match 286') 22 | end 23 | 24 | def test_query 25 | result1 = curlyq('json', '-q', 'total', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 26 | result2 = curlyq('json', '-q', 'json.total', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 27 | json1 = JSON.parse(result1)[0] 28 | json2 = JSON.parse(result2)[0] 29 | 30 | assert_equal(286, json1, 'Should be 286') 31 | assert_equal(286, json2, 'Including json in dot path should yeild same result') 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/curlyq_links_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQLinksTest < 
Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_links 14 | result = curlyq('links', '-q', '[content*=twitter]', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 15 | json = JSON.parse(result) 16 | 17 | assert(json.count.positive?, 'Should be at least 1 match') 18 | assert_match(/twitter.com/, json[0]['href'], 'Should be a link to Twitter') 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /test/curlyq_scrape_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQScrapeTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | @screenshot = File.join(File.dirname(__FILE__), 'screenshot_test') 15 | FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf") 16 | FileUtils.rm_f("#{@screenshot}.png") if File.exist?("#{@screenshot}.png") 17 | FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png") 18 | end 19 | 20 | def teardown 21 | FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf") 22 | FileUtils.rm_f("#{@screenshot}.png") if File.exist?("#{@screenshot}.png") 23 | FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png") 24 | end 25 | 26 | def test_scrape_firefox 27 | result = curlyq('scrape', '-b', 'firefox', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 28 | json = JSON.parse(result) 29 | 30 | assert_equal(Array, json.class, 'Result should be an Array') 31 | assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link') 32 | end 33 | 34 | def test_scrape_chrome 35 | result = curlyq('scrape', 
'-b', 'chrome', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 36 | json = JSON.parse(result) 37 | 38 | assert_equal(Array, json.class, 'Result should be an Array') 39 | assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link') 40 | end 41 | 42 | def test_screenshot 43 | curlyq('screenshot', '-b', 'firefox', '-o', @screenshot, '-t', 'print', 'https://brettterpstra.com') 44 | assert(File.exist?("#{@screenshot}.pdf"), 'PDF Screenshot should exist') 45 | 46 | curlyq('screenshot', '-b', 'chrome', '-o', @screenshot, '-t', 'visible', 'https://brettterpstra.com') 47 | assert(File.exist?("#{@screenshot}.png"), 'PNG Screenshot should exist') 48 | 49 | curlyq('screenshot', '-b', 'firefox', '-o', "#{@screenshot}_full", '-t', 'full', 'https://brettterpstra.com') 50 | assert(File.exist?("#{@screenshot}_full.png"), 'PNG Screenshot should exist') 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /test/curlyq_tags_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQTagsTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_tags 17 | result = curlyq('tags', '--search', '#main .post h3', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 18 | json = JSON.parse(result) 19 | 20 | assert_equal(Array, json.class, 'Should be an array of matches') 21 | assert_equal(6, json.count, 'Should be six results') 22 | end 23 | 24 | def test_clean 25 | result = curlyq('tags', '--search', '#main section.related', '--clean', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 26 
| json = JSON.parse(result) 27 | 28 | assert_equal(Array, json.class, 'Should be a single Array') 29 | assert_equal(1, json.count, 'Should be one element') 30 | assert_match(%r{Last.fm}, json[0]['source'], 'Should have matched #whats-next') 31 | end 32 | 33 | def test_query 34 | result = curlyq('tags', '--search', '#main .post h3', '-q', '[attrs.id*=what].source', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 35 | json = JSON.parse(result) 36 | assert_equal(Array, json.class, 'Should be an array') 37 | assert_match(%r{^

What’s Next

$}, json[0], 'Should have returned just source') 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /test/default_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class DefaultTest < Minitest::Test 4 | 5 | def setup 6 | end 7 | 8 | def teardown 9 | end 10 | 11 | def test_the_truth 12 | assert true 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /test/helpers/curlyq-helpers.rb: -------------------------------------------------------------------------------- 1 | require 'open3' 2 | require 'time' 3 | require 'fileutils' 4 | $LOAD_PATH.unshift File.join(__dir__, '..', '..', 'lib') 5 | require 'curly' 6 | 7 | module CurlyQHelpers 8 | CURLYQ_EXEC = File.join(File.dirname(__FILE__), '..', '..', 'bin', 'curlyq') 9 | BUNDLE = '/Users/ttscoff/.asdf/shims/bundle' 10 | 11 | def curlyq_with_env(env, *args, stdin: nil) 12 | Dir.chdir(File.expand_path('~/Desktop/Code/curlyq')) 13 | pread(env, BUNDLE, 'exec', 'bin/curlyq', *args, stdin: stdin) 14 | end 15 | 16 | def curlyq(*args) 17 | curlyq_with_env({ 'GLI_DEBUG' => 'true' }, *args) 18 | end 19 | 20 | def pread(env, *cmd, stdin: nil) 21 | out, err, status = Open3.capture3(env, *cmd, stdin_data: stdin) 22 | unless status.success? 23 | raise [ 24 | "Error (#{status}): #{cmd.inspect} failed", "STDOUT:", out.inspect, "STDERR:", err.inspect 25 | ].join("\n") 26 | end 27 | 28 | out 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/helpers/fake_std_out.rb: -------------------------------------------------------------------------------- 1 | class FakeStdOut 2 | attr_reader :strings 3 | 4 | def initialize 5 | @strings = [] 6 | end 7 | 8 | def puts(string=nil) 9 | @strings << string unless string.nil? 
10 | end 11 | 12 | def write(x) 13 | puts(x) 14 | end 15 | 16 | def printf(*args) 17 | puts(Kernel.printf(*args)) 18 | end 19 | 20 | # Returns true if the regexp matches anything in the output 21 | def contained?(regexp) 22 | strings.find{ |x| x =~ regexp } 23 | end 24 | 25 | def flush; end 26 | 27 | def to_s 28 | @strings.join("\n") 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/helpers/threaded_tests.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'tty-spinner' 4 | require 'tty-progressbar' 5 | require 'open3' 6 | require 'shellwords' 7 | require 'fileutils' 8 | require 'pastel' 9 | 10 | class ThreadedTests 11 | def run(pattern: '*', max_threads: 8, max_tests: 0) 12 | pastel = Pastel.new 13 | 14 | start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 15 | @results = File.expand_path('results.log') 16 | 17 | max_threads = 1000 if max_threads.to_i == 0 18 | 19 | shuffle = false 20 | 21 | unless pattern =~ /shuffle/i 22 | pattern = "test/curlyq_*#{pattern}*_test.rb" 23 | else 24 | pattern = "test/curlyq_*_test.rb" 25 | shuffle = true 26 | end 27 | 28 | tests = Dir.glob(pattern) 29 | 30 | tests.shuffle! 
if shuffle 31 | 32 | if max_tests.to_i > 0 33 | tests = tests.slice(0, max_tests.to_i - 1) 34 | end 35 | 36 | puts pastel.cyan("#{tests.count} test files") 37 | 38 | banner = "Running tests [:bar] T/A (#{max_threads.to_s} threads)" 39 | 40 | progress = TTY::ProgressBar::Multi.new(banner, 41 | width: 12, 42 | clear: true, 43 | hide_cursor: true) 44 | @children = [] 45 | tests.each do |t| 46 | test_name = File.basename(t, '.rb').sub(/curlyq_(.*?)_test/, '\1') 47 | new_sp = progress.register("[:bar] #{test_name}:status", 48 | total: tests.count + 8, 49 | width: 1, 50 | head: ' ', 51 | unknown: ' ', 52 | hide_cursor: true, 53 | clear: true) 54 | status = ': waiting' 55 | @children.push([test_name, new_sp, status]) 56 | end 57 | 58 | @elapsed = 0.0 59 | @test_total = 0 60 | @assrt_total = 0 61 | @error_out = [] 62 | @threads = [] 63 | @running_tests = [] 64 | 65 | begin 66 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 67 | while @children.count.positive? 68 | 69 | slices = @children.slice!(0, max_threads) 70 | slices.each { |c| c[1].start } 71 | slices.each do |s| 72 | @threads << Thread.new do 73 | run_test(s) 74 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 75 | end 76 | end 77 | 78 | @threads.each { |t| t.join } 79 | end 80 | 81 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 82 | 83 | progress.finish 84 | rescue 85 | progress.stop 86 | ensure 87 | msg = @running_tests.map { |t| t[1].format.sub(/^\[:bar\] (.*?):status/, "\\1#{t[2]}") }.join("\n") 88 | 89 | output = [] 90 | output << if @error_out.count.positive? 91 | pastel.red("#{@error_out.count} Issues") 92 | else 93 | pastel.green('Success') 94 | end 95 | output << pastel.green("#{@test_total} tests") 96 | output << pastel.cyan("#{@assrt_total} assertions") 97 | output << pastel.yellow("#{(finish_time - start_time).round(3)}s") 98 | puts output.join(', ') 99 | 100 | if @error_out.count.positive? 
101 | puts @error_out.join(pastel.white("\n----\n")) 102 | Process.exit 1 103 | end 104 | end 105 | end 106 | 107 | def run_test(s) 108 | pastel = Pastel.new 109 | 110 | bar = s[1] 111 | s[2] = ": #{pastel.green('running')}" 112 | bar.advance(status: s[2]) 113 | 114 | if @running_tests.count.positive? 115 | @running_tests.each do |b| 116 | prev_bar = b[1] 117 | if prev_bar.complete? 118 | prev_bar.reset 119 | prev_bar.advance(status: b[2]) 120 | prev_bar.finish 121 | else 122 | prev_bar.update(head: ' ', unfinished: ' ') 123 | prev_bar.advance(status: b[2]) 124 | end 125 | end 126 | end 127 | 128 | @running_tests.push(s) 129 | out, _err, status = Open3.capture3(ENV, 'rake', "test:#{s[0]}", stdin_data: nil) 130 | time = out.match(/^Finished in (?