├── .github └── FUNDING.yml ├── .gitignore ├── .irbrc ├── CHANGELOG.md ├── Gemfile ├── Gemfile.lock ├── LICENSE.txt ├── README.md ├── README.rdoc ├── Rakefile ├── bin └── curlyq ├── curlyq.gemspec ├── curlyq.rdoc ├── lib ├── curly.rb └── curly │ ├── array.rb │ ├── curl.rb │ ├── curl │ ├── html.rb │ └── json.rb │ ├── hash.rb │ ├── numeric.rb │ ├── string.rb │ └── version.rb ├── src └── _README.md └── test ├── curlyq_extract_test.rb ├── curlyq_headlinks_test.rb ├── curlyq_html_test.rb ├── curlyq_images_test.rb ├── curlyq_json_test.rb ├── curlyq_links_test.rb ├── curlyq_scrape_test.rb ├── curlyq_tags_test.rb ├── default_test.rb ├── helpers ├── curlyq-helpers.rb ├── fake_std_out.rb └── threaded_tests.rb └── test_helper.rb /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ttscoff] 2 | custom: ['https://brettterpstra.com/support/', 'https://brettterpstra.com/donate/'] 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | html 2 | *.bak 3 | -------------------------------------------------------------------------------- /.irbrc: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.join(__dir__, 'lib') 2 | require_relative 'lib/curly' 3 | include Curly 4 | 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 0.0.16 2 | 3 | 2024-11-07 06:45 4 | 5 | #### FIXED 6 | 7 | - Encoding error 8 | 9 | ### 0.0.15 10 | 11 | 2024-10-25 10:31 12 | 13 | #### IMPROVED 14 | 15 | - Better error when no results, return nothing to STDOUT 16 | 17 | ### 0.0.14 18 | 19 | 2024-10-25 10:26 20 | 21 | #### FIXED 22 | 23 | - Fix permissions 24 | 25 | ### 0.0.13 26 | 27 | 2024-10-25 10:23 28 | 29 | #### FIXED 30 | 31 | - 
Fix tests, handle empty results better 32 | 33 | ### 0.0.12 34 | 35 | 2024-04-04 13:06 36 | 37 | #### NEW 38 | 39 | - Add --script option to screenshot command 40 | - Add `execute` command for executing JavaScript on a page 41 | 42 | ### 0.0.11 43 | 44 | 2024-01-21 15:29 45 | 46 | #### IMPROVED 47 | 48 | - Add option for --local_links_only to html and links command, only returning links with the same origin site 49 | 50 | ### 0.0.10 51 | 52 | 2024-01-17 13:50 53 | 54 | #### IMPROVED 55 | 56 | - Update YARD documentation 57 | - Breaking change, ensure all return types are Arrays, even with single objects, to aid in scriptability 58 | - Screenshot test suite 59 | 60 | ### 0.0.9 61 | 62 | 2024-01-16 12:38 63 | 64 | #### IMPROVED 65 | 66 | - You can now use dot syntax inside of a square bracket comparison in --query (`[attrs.id*=what]`) 67 | - *=, ^=, $=, and == work with array values 68 | - [] comparisons with no comparison, e.g. [attrs.id], will return every match that has that element populated 69 | 70 | ### 0.0.8 71 | 72 | 2024-01-15 16:45 73 | 74 | #### IMPROVED 75 | 76 | - Dot syntax query can now operate on a full array using empty set [] 77 | - Dot syntax query should output a specific key, e.g. attrs[id*=news].content (work in progress) 78 | - Dot query syntax handling touch-ups. Piping to jq is still more flexible, but the basics are there. 
79 | 80 | ### 0.0.7 81 | 82 | 2024-01-12 17:03 83 | 84 | #### FIXED 85 | 86 | - Revert back to offering single response (no array) in cases where there are single results (for some commands) 87 | 88 | ### 0.0.6 89 | 90 | 2024-01-12 14:44 91 | 92 | #### CHANGED 93 | 94 | - Attributes array is now a hash directly keyed to the attribute key 95 | 96 | #### NEW 97 | 98 | - Tags command has option to output only raw html of matched tags 99 | 100 | #### FIXED 101 | 102 | - --query works with --search on scrape and tags command 103 | - Json command dot query works now 104 | 105 | ### 0.0.5 106 | 107 | 2024-01-11 18:06 108 | 109 | #### IMPROVED 110 | 111 | - Add --query capabilities to images command 112 | - Add --query to links command 113 | - Allow hyphens in query syntax 114 | - Allow any character other than comma, ampersand, or right square bracket in query value 115 | 116 | #### FIXED 117 | 118 | - Html --search returns a full Curl::Html object 119 | - --query works better with --search and is consistent with other query functions 120 | - Scrape command outputting malformed data 121 | - Hash output when --query is used with scrape 122 | - Nil match on tags command 123 | 124 | ### 0.0.4 125 | 126 | 2024-01-10 13:54 127 | 128 | #### FIXED 129 | 130 | - Queries combined with + or & not requiring all matches to be true 131 | 132 | ### 0.0.3 133 | 134 | 2024-01-10 13:38 135 | 136 | #### IMPROVED 137 | 138 | - Refactor Curl and Json libs to allow setting of options after creation of object 139 | - Allow setting of headers on most subcommands 140 | - --clean now affects source, head, and body keys of output 141 | - Also remove tabs when cleaning whitespace 142 | 143 | ### 0.0.2 144 | 145 | 2024-01-10 09:18 146 | 147 | ### 0.0.1 148 | 149 | 2024-01-10 08:20 150 | 151 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 
gemspec 3 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | curlyq (0.0.16) 5 | gli (~> 2.21.0) 6 | nokogiri (~> 1.16.0) 7 | selenium-webdriver (~> 4.16.0) 8 | tty-which (~> 0.5.0) 9 | 10 | GEM 11 | remote: https://rubygems.org/ 12 | specs: 13 | gli (2.21.5) 14 | nokogiri (1.16.7-arm64-darwin) 15 | racc (~> 1.4) 16 | parallel (1.26.3) 17 | parallel_tests (3.13.0) 18 | parallel 19 | pastel (0.8.0) 20 | tty-color (~> 0.5) 21 | power_assert (2.0.4) 22 | racc (1.8.1) 23 | rake (13.2.1) 24 | rdoc (6.3.4.1) 25 | rexml (3.3.9) 26 | rubyzip (2.3.2) 27 | selenium-webdriver (4.16.0) 28 | rexml (~> 3.2, >= 3.2.5) 29 | rubyzip (>= 1.2.2, < 3.0) 30 | websocket (~> 1.0) 31 | strings-ansi (0.2.0) 32 | test-unit (3.4.9) 33 | power_assert 34 | tty-color (0.6.0) 35 | tty-cursor (0.7.1) 36 | tty-progressbar (0.18.2) 37 | strings-ansi (~> 0.2) 38 | tty-cursor (~> 0.7) 39 | tty-screen (~> 0.8) 40 | unicode-display_width (>= 1.6, < 3.0) 41 | tty-screen (0.8.2) 42 | tty-spinner (0.9.3) 43 | tty-cursor (~> 0.7) 44 | tty-which (0.5.0) 45 | unicode-display_width (2.6.0) 46 | websocket (1.2.11) 47 | yard (0.9.37) 48 | 49 | PLATFORMS 50 | arm64-darwin-20 51 | x86_64-darwin-20 52 | 53 | DEPENDENCIES 54 | curlyq! 
55 | parallel_tests (~> 3.7, >= 3.7.3) 56 | pastel (~> 0.8.0) 57 | rake (~> 13.0, >= 13.0.1) 58 | rdoc (~> 6.3.1) 59 | test-unit (~> 3.4.4) 60 | tty-progressbar (~> 0.18, >= 0.18.2) 61 | tty-spinner (~> 0.9, >= 0.9.3) 62 | yard (~> 0.9, >= 0.9.26) 63 | 64 | BUNDLED WITH 65 | 2.2.29 66 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice (including the next 11 | paragraph) shall be included in all copies or substantial portions of the 12 | Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 17 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 | OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CurlyQ 2 | 3 | [![Gem](https://img.shields.io/gem/v/na.svg)](https://rubygems.org/gems/curlyq) 4 | [![GitHub license](https://img.shields.io/github/license/ttscoff/curlyq.svg)](./LICENSE.txt) 5 | 6 | **A command line helper for curl and web scraping** 7 | 8 | _If you find this useful, feel free to [buy me some coffee][donate]._ 9 | 10 | [donate]: https://brettterpstra.com/donate 11 | 12 | 13 | [jq]: https://github.com/jqlang/jq "Command-line JSON processor" 14 | [yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor" 15 | 16 | The current version of `curlyq` is 0.0.16. 17 | 18 | CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output. 19 | 20 | [github]: https://github.com/ttscoff/curlyq/ 21 | 22 | ### Installation 23 | 24 | Assuming you have Ruby and RubyGems installed, you can just run `gem install curlyq`. If you run into errors, try `gem install --user-install curlyq`, or use `sudo gem install curlyq`. 25 | 26 | If you're using Homebrew, you have the option to install via [brew-gem](https://github.com/sportngin/brew-gem): 27 | 28 | brew install brew-gem 29 | brew gem install curlyq 30 | 31 | If you don't have Ruby/RubyGems, you can install them pretty easily with [Homebrew], [rvm], or [asdf]. 
32 | 33 | [Homebrew]: https://brew.sh/ "Homebrew???The Missing Package Manager for macOS (or Linux)" 34 | [rvm]: https://rvm.io/ "Ruby Version Manager (RVM)" 35 | [asdf]: https://github.com/asdf-vm/asdf "asdf-vm/asdf:Extendable version manager with support for ..." 36 | 37 | ### Usage 38 | 39 | Run `curlyq help` for a list of subcommands. Run `curlyq help SUBCOMMAND` for details on a particular subcommand and its options. 40 | 41 | ``` 42 | NAME 43 | curlyq - A scriptable interface to curl 44 | 45 | SYNOPSIS 46 | curlyq [global options] command [command options] [arguments...] 47 | 48 | VERSION 49 | 0.0.16 50 | 51 | GLOBAL OPTIONS 52 | --help - Show this message 53 | --[no-]pretty - Output "pretty" JSON (default: enabled) 54 | --version - Display the program version 55 | -y, --[no-]yaml - Output YAML instead of json 56 | 57 | COMMANDS 58 | execute - Execute JavaScript on a URL 59 | extract - Extract contents between two regular expressions 60 | headlinks - Return all links on URL's page 61 | help - Shows a list of commands or help for one command 62 | html, curl - Curl URL and output its elements, multiple URLs allowed 63 | images - Extract all images from a URL 64 | json - Get a JSON response from a URL, multiple URLs allowed 65 | links - Return all links on a URL's page 66 | scrape - Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed. 67 | screenshot - Save a screenshot of a URL 68 | tags - Extract all instances of a tag 69 | ``` 70 | 71 | ### Query and Search syntax 72 | 73 | You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands. 74 | 75 | A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `
` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendents. You can also use XPaths, but I hate those so I'm not going to document them. 76 | 77 | > I've tried to make the query function useful, but if you want to do any kind of advanced shaping, you're better off piping the JSON output to [jq] or [yq]. 78 | 79 | 80 | Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `images[rel=me]'` to target only images with a `rel` attribute of `me`. 81 | 82 | The comparisons for the query flag are: 83 | 84 | - `<` less than 85 | - `>` greater than 86 | - `<=` less than or equal to 87 | - `>=` greater than or equal to 88 | - `=` or `==` is equal to 89 | - `*=` contains text 90 | - `^=` starts with text 91 | - `$=` ends with text 92 | 93 | Comparisons can be numeric or string comparisons. A numeric comparison like `curlyq images -q '[width>500]' URL` would return all of the images on the page with a width attribute greater than 500. 94 | 95 | You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison. 96 | 97 | If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string. 
If there are multiple matches, output will be an array: 98 | 99 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/' 100 | 101 |

What’s Next

102 | 103 | #### Commands 104 | 105 | curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible. 106 | 107 | ##### extract 108 | 109 | Example: 110 | 111 | curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 112 | 113 | [ 114 | "Adding time.sleep(10) in various places in case the page had not fully loaded when I was accessing the source." 115 | ] 116 | 117 | This specifies a before and after string and includes them (`-i`) in the result. 118 | 119 | ``` 120 | NAME 121 | extract - Extract contents between two regular expressions 122 | 123 | SYNOPSIS 124 | 125 | curlyq [global options] extract [command options] URL... 126 | 127 | COMMAND OPTIONS 128 | -a, --after=arg - Text after extraction (default: none) 129 | -b, --before=arg - Text before extraction (default: none) 130 | -c, --[no-]compressed - Expect compressed results 131 | --[no-]clean - Remove extra whitespace from results 132 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 133 | -i, --[no-]include - Include the before/after matches in the result 134 | -r, --[no-]regex - Process before/after strings as regular expressions 135 | --[no-]strip - Strip HTML tags from results 136 | ``` 137 | 138 | 139 | ##### execute 140 | 141 | You can execute JavaScript on a given web page using the `execute` subcommand. 142 | 143 | Example: 144 | 145 | curlyq execute -s "NiftyAPI.find('file/save').arrow().shoot('file-save')" file:///Users/ttscoff/Desktop/Code/niftymenu/dist/MultiMarkdown-Composer.html 146 | 147 | You can specify an element id to wait for using `--id`, and define a pause to wait after executing a script with `--wait` (defaults to 2 seconds). 
Scripts can be read from the command line arguments with `--script "SCRIPT"`, from STDIN with `--script -`, or from a file using `--script PATH`. 148 | 149 | If you expect a return value, be sure to include a `return` statement in your executed script. Results will be output to STDOUT. 150 | 151 | ``` 152 | NAME 153 | execute - Execute JavaScript on a URL 154 | 155 | SYNOPSIS 156 | 157 | curlyq [global options] execute [command options] URL... 158 | 159 | COMMAND OPTIONS 160 | -b, --browser=arg - Browser to use (firefox, chrome) (default: chrome) 161 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 162 | -i, --id=arg - Element ID to wait for before executing (default: none) 163 | -s, --script=arg - Script to execute, use - to read from STDIN (may be used more than once, default: none) 164 | -w, --wait=arg - Seconds to wait after executing JS (default: 2) 165 | ``` 166 | 167 | ##### headlinks 168 | 169 | Example: 170 | 171 | curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com 172 | 173 | { 174 | "rel": "stylesheet", 175 | "href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css", 176 | "type": "text/css", 177 | "title": null 178 | } 179 | 180 | This pulls all `` from the `` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`. 181 | 182 | ``` 183 | NAME 184 | headlinks - Return all links on URL's page 185 | 186 | SYNOPSIS 187 | 188 | curlyq [global options] headlinks [command options] URL... 
189 | 190 | COMMAND OPTIONS 191 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 192 | ``` 193 | 194 | ##### html 195 | 196 | The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of: 197 | 198 | - URL, after any redirects 199 | - Response code 200 | - Response headers as a keyed hash 201 | - Meta elements for the page as a keyed hash 202 | - All meta links in the head as an array of objects containing (as available): 203 | - rel 204 | - href 205 | - type 206 | - title 207 | - source of `` 208 | - source of `` 209 | - the page title (determined first by og:title, then by a title tag) 210 | - description (using og:description first) 211 | - All links on the page as an array of objects with: 212 | - href 213 | - title 214 | - rel 215 | - text content 216 | - classes as array 217 | - All images on the page as an array of objects containing: 218 | - class 219 | - all attributes as key/value pairs 220 | - width and height (if specified) 221 | - src 222 | - alt and title 223 | 224 | You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`. 225 | 226 | Example: 227 | 228 | curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com' 229 | 230 | [ 231 | { 232 | "class": "aligncenter", 233 | "original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg", 234 | "at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg", 235 | "width": "800", 236 | "height": "226", 237 | "src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg", 238 | "alt": "Giveaway Robot with Keyboard Maestro icon", 239 | "title": "Giveaway Robot with Keyboard Maestro icon" 240 | } 241 | ] 242 | 243 | The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results. 
244 | 245 | curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 246 | 247 | Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com 248 | 249 | The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`). 250 | 251 | ``` 252 | NAME 253 | html - Curl URL and output its elements, multiple URLs allowed 254 | 255 | SYNOPSIS 256 | 257 | curlyq [global options] html [command options] URL... 258 | 259 | COMMAND OPTIONS 260 | -I, --info - Only retrieve headers/info 261 | -b, --browser=arg - Use a browser to retrieve a dynamic web page (firefox, chrome) (default: none) 262 | -c, --compressed - Expect compressed results 263 | --[no-]clean - Remove extra whitespace from results 264 | -f, --fallback=arg - If curl doesn't work, use a fallback browser (firefox, chrome) (default: none) 265 | -h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none) 266 | --[no-]ignore_fragments - Ignore fragment hrefs when gathering content links 267 | --[no-]ignore_relative - Ignore relative hrefs when gathering content links 268 | -l, --local_links_only - Only gather internal (same-site) links 269 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 270 | -r, --raw=arg - Output a raw value for a key (default: none) 271 | -s, --search=arg - Regurn an array of matches to a CSS or XPath query (default: none) 272 | -x, --external_links_only - Only gather external links 273 | ``` 274 | 275 | ##### images 276 | 277 | The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above). 278 | 279 | The base command will return all images on the page, including OpenGraph images from the head, `` tags from the body, and `` tags along with their child images. 
280 | 281 | OpenGraph images will be returned with the structure: 282 | 283 | { 284 | "type": "opengraph", 285 | "attrs": null, 286 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg" 287 | } 288 | 289 | `img` tags will be returned with the structure: 290 | 291 | { 292 | "type": "img", 293 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg", 294 | "width": "800", 295 | "height": "226", 296 | "alt": "Banner image for CurlyQ", 297 | "title": "CurlyQ, curl better", 298 | "attrs": [ 299 | { 300 | "class": [ 301 | "aligncenter" 302 | ], // all attributes included 303 | } 304 | ] 305 | } 306 | 307 | 308 | 309 | `srcset` images will be returned with the structure: 310 | 311 | { 312 | "type": "srcset", 313 | "attrs": [ 314 | { 315 | "key": "srcset", 316 | "value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x" 317 | } 318 | ], 319 | "images": [ 320 | { 321 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg", 322 | "media": "1x" 323 | }, 324 | { 325 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg", 326 | "media": "2x" 327 | } 328 | ] 329 | } 330 | } 331 | 332 | Example: 333 | 334 | curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com 335 | 336 | This will return an array of images that are `` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`. 337 | 338 | curlyq images -q '[width>750]' https://brettterpstra.com 339 | 340 | This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source. 341 | 342 | ``` 343 | NAME 344 | images - Extract all images from a URL 345 | 346 | SYNOPSIS 347 | 348 | curlyq [global options] images [command options] URL... 
349 | 350 | COMMAND OPTIONS 351 | -c, --[no-]compressed - Expect compressed results 352 | --[no-]clean - Remove extra whitespace from results 353 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 354 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 355 | -t, --type=arg - Type of images to return (img, srcset, opengraph, all) (may be used more than once, default: ["all"]) 356 | ``` 357 | 358 | ##### json 359 | 360 | The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code. 361 | 362 | ``` 363 | NAME 364 | json - Get a JSON response from a URL, multiple URLs allowed 365 | 366 | SYNOPSIS 367 | 368 | curlyq [global options] json [command options] URL... 369 | 370 | COMMAND OPTIONS 371 | -c, --[no-]compressed - Expect compressed results 372 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 373 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 374 | ``` 375 | 376 | ##### links 377 | 378 | Returns all the links on the page, which can be queried on any attribute. 379 | 380 | Example: 381 | 382 | curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 383 | 384 | [ 385 | { 386 | "href": "https://twitter.com/stackoverflow", 387 | "title": null, 388 | "rel": null, 389 | "content": "Twitter", 390 | "class": [ 391 | "-link", 392 | "js-gps-track" 393 | ] 394 | } 395 | ] 396 | 397 | This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`). 
398 | 399 | ``` 400 | NAME 401 | links - Return all links on a URL's page 402 | 403 | SYNOPSIS 404 | 405 | curlyq [global options] links [command options] URL... 406 | 407 | COMMAND OPTIONS 408 | -d, --[no-]dedup - Filter out duplicate links, preserving only first one 409 | --[no-]ignore_fragments - Ignore fragment hrefs when gathering content links 410 | --[no-]ignore_relative - Ignore relative hrefs when gathering content links 411 | -l, --local_links_only - Only gather internal (same-site) links 412 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 413 | -x, --external_links_only - Only gather external links 414 | ``` 415 | 416 | ##### scrape 417 | 418 | Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system. 419 | 420 | Example: 421 | 422 | curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 423 | 424 | { 425 | "href": "https://nojack.easydns.ca/@ttscoff", 426 | "title": null, 427 | "rel": [ 428 | "me" 429 | ], 430 | "content": "Mastodon", 431 | "class": [ 432 | "u-url" 433 | ] 434 | } 435 | 436 | This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'. 437 | 438 | ``` 439 | NAME 440 | scrape - Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed. 441 | 442 | SYNOPSIS 443 | 444 | curlyq [global options] scrape [command options] URL... 
445 | 446 | COMMAND OPTIONS 447 | -b, --browser=arg - Browser to use (firefox, chrome) (required, default: none) 448 | --[no-]clean - Remove extra whitespace from results 449 | -h, --header=arg - Define a header to send as "key=value" (may be used more than once, default: none) 450 | -q, --query, --filter=arg - Filter output using dot-syntax path (default: none) 451 | -r, --raw=arg - Output a raw value for a key (default: none) 452 | --search=arg - Regurn an array of matches to a CSS or XPath query (default: none) 453 | ``` 454 | 455 | ##### screenshot 456 | 457 | Full-page screenshots require Firefox, installed and specified with `--browser firefox`. 458 | 459 | Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'. 460 | 461 | The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name. 462 | 463 | Example: 464 | 465 | curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 466 | 467 | Screenshot saved to /Users/ttscoff/Desktop/test.png 468 | 469 | You can wait for an element ID to be visible using `--id`. This can be any `#ID` on the page. If the ID doesn't exist on the page, though, the screenshot will hang for a timeout of 10 seconds. 470 | 471 | You can execute a script before taking the screenshot with the `--script` flag. If this is set to `-`, it will read the script from STDIN. If it's set to an existing file path, that file will be read for script input. Specify an interval (in seconds) to wait after executing the script with `--wait`. 472 | 473 | ``` 474 | NAME 475 | screenshot - Save a screenshot of a URL 476 | 477 | SYNOPSIS 478 | 479 | curlyq [global options] screenshot [command options] URL... 
480 | 481 | COMMAND OPTIONS 482 | -b, --browser=arg - Browser to use (firefox, chrome) (default: chrome) 483 | -h, --header=arg - Define a header to send as key=value (may be used more than once, default: none) 484 | -i, --id=arg - Element ID to wait for before taking screenshot (default: none) 485 | -o, --out, --file=arg - File destination (required, default: none) 486 | -s, --script=arg - Script to execute before taking screenshot (may be used more than once, default: none) 487 | -t, --type=arg - Type of screenshot to save (full (requires firefox), print, visible) (default: visible) 488 | -w, --wait=arg - Time to wait before taking screenshot (default: 0) 489 | ``` 490 | 491 | ##### tags 492 | 493 | Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag. 494 | 495 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 496 | 497 | [ 498 | { 499 | "tag": "h3", 500 | "source": "

What’s Next

", 501 | "attrs": [ 502 | { 503 | "id": "whats-next" 504 | } 505 | ], 506 | "content": "What???s Next", 507 | "tags": [ 508 | 509 | ] 510 | } 511 | ] 512 | 513 | The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'. 514 | 515 | ``` 516 | NAME 517 | tags - Extract all instances of a tag 518 | 519 | SYNOPSIS 520 | 521 | curlyq [global options] tags [command options] URL... 522 | 523 | COMMAND OPTIONS 524 | -c, --[no-]compressed - Expect compressed results 525 | --[no-]clean - Remove extra whitespace from results 526 | -h, --header=KEY=VAL - Define a header to send as key=value (may be used more than once, default: none) 527 | -q, --query, --filter=DOT_SYNTAX - Dot syntax query to filter results (default: none) 528 | --search=CSS/XPATH - Regurn an array of matches to a CSS or XPath query (default: none) 529 | --[no-]source, --[no-]html - Output the HTML source of the results 530 | -t, --tag=TAG - Specify a tag to collect (may be used more than once, default: none) 531 | ``` 532 | 533 | 534 | PayPal link: [paypal.me/ttscoff](https://paypal.me/ttscoff) 535 | 536 | ## Changelog 537 | 538 | See [CHANGELOG.md](https://github.com/ttscoff/curlyq/blob/main/CHANGELOG.md) 539 | -------------------------------------------------------------------------------- /README.rdoc: -------------------------------------------------------------------------------- 1 | = curly 2 | 3 | A CLI helper for curl and web scraping 4 | 5 | :include:curlyq.rdoc 6 | 7 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake/clean' 2 | require 'rake/testtask' 3 | require 'rubygems' 4 | require 'rubygems/package_task' 5 | require 'rdoc/task' 6 | require 'yard' 7 | require 'parallel_tests' 8 | require 'parallel_tests/tasks' 9 | require 'tty-spinner' 10 | 11 | YARD::Rake::YardocTask.new do |t| 12 | 
t.files = ['lib/curly/*.rb'] 13 | t.options = ['--markup=markdown', '--no-private', '-p', 'yard_templates'] 14 | # t.stats_options = ['--list-undoc'] 15 | end 16 | 17 | task :doc, [*Rake.application[:yard].arg_names] => [:yard] 18 | 19 | Rake::RDocTask.new do |rd| 20 | rd.main = "README.rdoc" 21 | rd.rdoc_files.include("README.rdoc","lib/**/*.rb","bin/**/*") 22 | rd.title = 'curlyq' 23 | end 24 | 25 | spec = eval(File.read('curlyq.gemspec')) 26 | 27 | Gem::PackageTask.new(spec) do |pkg| 28 | end 29 | 30 | namespace :test do 31 | FileList['test/*_test.rb'].each do |rakefile| 32 | test_name = File.basename(rakefile, '.rb').sub(/^.*?_(.*?)_.*?$/, '\1') 33 | 34 | Rake::TestTask.new(:"#{test_name}") do |t| 35 | t.libs << ['test', 'test/helpers'] 36 | t.pattern = rakefile 37 | t.verbose = ENV['VERBOSE'] =~ /(true|1)/i ? true : false 38 | end 39 | # Define default task for :test 40 | task default: test_name 41 | end 42 | end 43 | 44 | desc 'Run one test verbosely' 45 | task :test_one, :test do |_, args| 46 | args.with_defaults(test: '*') 47 | puts `bundle exec rake test TESTOPTS="-v" TEST="test/curlyq_#{args[:test]}_test.rb"` 48 | end 49 | 50 | desc 'Run all tests, threaded' 51 | task :test, :pattern, :threads, :max_tests do |_, args| 52 | args.with_defaults(pattern: '*', threads: 8, max_tests: 0) 53 | pattern = args[:pattern] =~ /(n[iu]ll?|0|\.)/i ? 
'*' : args[:pattern] 54 | 55 | require_relative 'test/helpers/threaded_tests' 56 | ThreadedTests.new.run(pattern: pattern, max_threads: args[:threads].to_i, max_tests: args[:max_tests]) 57 | end 58 | 59 | desc 'Install current gem in all versions of asdf-controlled ruby' 60 | task :install do 61 | Rake::Task['clobber'].invoke 62 | Rake::Task['package'].invoke 63 | Dir.chdir 'pkg' 64 | file = Dir.glob('*.gem').last 65 | 66 | current_ruby = `asdf current ruby`.match(/(\d.\d+.\d+)/)[1] 67 | 68 | `asdf list ruby`.split.map { |ruby| ruby.strip.sub(/^*/, '') }.each do |ruby| 69 | `asdf shell ruby #{ruby}` 70 | puts `gem install #{file}` 71 | end 72 | 73 | `asdf shell ruby #{current_ruby}` 74 | end 75 | 76 | desc 'Development version check' 77 | task :ver do 78 | gver = `git ver` 79 | cver = IO.read(File.join(File.dirname(__FILE__), 'CHANGELOG.md')).match(/^#+ (\d+\.\d+\.\d+(\w+)?)/)[1] 80 | res = `grep VERSION lib/curly/version.rb` 81 | version = res.match(/VERSION *= *['"](\d+\.\d+\.\d+(\w+)?)/)[1] 82 | puts "git tag: #{gver}" 83 | puts "version.rb: #{version}" 84 | puts "changelog: #{cver}" 85 | end 86 | 87 | desc 'Changelog version check' 88 | task :cver do 89 | puts IO.read(File.join(File.dirname(__FILE__), 'CHANGELOG.md')).match(/^#+ (\d+\.\d+\.\d+(\w+)?)/)[1] 90 | end 91 | 92 | desc 'Bump incremental version number' 93 | task :bump, :type do |_, args| 94 | args.with_defaults(type: 'inc') 95 | version_file = 'lib/curly/version.rb' 96 | content = IO.read(version_file) 97 | content.sub!(/VERSION = '(?\d+)\.(?\d+)\.(?\d+)(?
\S+)?'/) do
 98 |     m = Regexp.last_match
 99 |     major = m['major'].to_i
100 |     minor = m['minor'].to_i
101 |     inc = m['inc'].to_i
102 |     pre = m['pre']
103 | 
104 |     case args[:type]
105 |     when /^maj/
106 |       major += 1
107 |       minor = 0
108 |       inc = 0
109 |     when /^min/
110 |       minor += 1
111 |       inc = 0
112 |     else
113 |       inc += 1
114 |     end
115 | 
116 |     $stdout.puts "At version #{major}.#{minor}.#{inc}#{pre}"
117 |     "VERSION = '#{major}.#{minor}.#{inc}#{pre}'"
118 |   end
119 |   File.open(version_file, 'w+') { |f| f.puts content }
120 | end
121 | 
122 | task default: %i[test clobber package]
123 | 


--------------------------------------------------------------------------------
/bin/curlyq:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
# curlyq entry point: a scriptable interface to curl for web scraping.
require 'gli'
require 'curly'
require 'curly/curl'

include GLI::App

program_desc 'A scriptable interface to curl'

version Curly::VERSION

subcommand_option_handling :normal
arguments :strict

# Custom GLI argument types. Each `accept` block normalizes the raw CLI
# string into a Symbol via normalize_* helpers (presumably String
# extensions provided by the curly library — defined outside this file).
ImageType = Class.new(Symbol)
accept ImageType do |value|
  value.normalize_image_type(:all)
end

BrowserType = Class.new(Symbol)
accept BrowserType do |value|
  value.normalize_browser_type(:none)
end

ScreenshotType = Class.new(Symbol)
accept ScreenshotType do |value|
  value.normalize_screenshot_type(:full_page)
end

# Global output-format switches, available to every command
desc 'Output YAML instead of json'
switch %i[y yaml]

desc 'Output "pretty" JSON'
switch %i[pretty], default_value: true, negatable: true

# TODO: Post method, html and json with --data flags, accept key=value and files
# TODO: Handle binary responses, deal gracefully with compressed data
# TODO: File uploads?
 40 | def self.break_headers(headers)
 41 |   out = {}
 42 |   headers.each do |h|
 43 |     m = h.match(/(?[^=]+)=(?.*?)$/)
 44 |     out[m['key'].strip] = m['value'].strip
 45 |   end
 46 |   out
 47 | end
 48 | 
 49 | def self.print_out(output, yaml, raw: false, pretty: true)
 50 |   output = output.to_data if output.respond_to?(:to_data)
 51 |   # Was intended to flatten single responses, but not getting an array back is unpredictable
 52 |   output = output.clean_output
 53 |   if output.is_a?(String)
 54 |     print output
 55 |   elsif raw
 56 |     output = output.join("\n") if output.is_a?(Array)
 57 |     print output
 58 |   else
 59 |     if yaml
 60 |       print YAML.dump(output)
 61 |     else
 62 |       print pretty ? JSON.pretty_generate(output) : JSON.generate(output)
 63 |     end
 64 |   end
 65 | end
 66 | 
 67 | desc 'Curl URL and output its elements, multiple URLs allowed'
 68 | arg_name 'URL', multiple: true
 69 | command %i[html curl] do |c|
 70 |   c.desc 'Only retrieve headers/info'
 71 |   c.switch %i[I info], negatable: false
 72 | 
 73 |   c.desc 'Regurn an array of matches to a CSS or XPath query'
 74 |   c.flag %i[s search]
 75 | 
 76 |   c.desc 'Define a header to send as "key=value"'
 77 |   c.flag %i[h header], multiple: true
 78 | 
 79 |   c.desc 'Use a browser to retrieve a dynamic web page (firefox, chrome)'
 80 |   c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/
 81 | 
 82 |   c.desc %(If curl doesn't work, use a fallback browser (firefox, chrome))
 83 |   c.flag %i[f fallback], type: BrowserType, must_match: /^[fc].*?$/
 84 | 
 85 |   c.desc 'Expect compressed results'
 86 |   c.switch %i[c compressed], negatable: false
 87 | 
 88 |   c.desc 'Remove extra whitespace from results'
 89 |   c.switch %i[clean]
 90 | 
 91 |   c.desc 'Filter output using dot-syntax path'
 92 |   c.flag %i[q query filter]
 93 | 
 94 |   c.desc 'Output a raw value for a key'
 95 |   c.flag %i[r raw]
 96 | 
 97 |   c.desc 'Ignore relative hrefs when gathering content links'
 98 |   c.switch %i[ignore_relative], negatable: true
 99 | 
100 |   c.desc 'Ignore fragment hrefs when gathering content links'
101 |   c.switch %i[ignore_fragments], negatable: true
102 | 
103 |   c.desc 'Only gather external links'
104 |   c.switch %i[x external_links_only], default_value: false, negatable: false
105 | 
106 |   c.desc 'Only gather internal (same-site) links'
107 |   c.switch %i[l local_links_only], default_value: false, negatable: false
108 | 
109 |   c.action do |global_options, options, args|
110 |     urls = args.join(' ').split(/[, ]+/)
111 |     headers = break_headers(options[:header])
112 | 
113 |     output = []
114 | 
115 |     urls.each do |url|
116 |       curl_settings = { browser: options[:browser], fallback: options[:fallback],
117 |                         headers: headers, headers_only: options[:info],
118 |                         compressed: options[:compressed], clean: options[:clean],
119 |                         ignore_local_links: options[:ignore_relative],
120 |                         ignore_fragment_links: options[:ignore_fragments],
121 |                         external_links_only: options[:external_links_only],
122 |                         local_links_only: options[:local_links_only] }
123 |       res = Curl::Html.new(url, curl_settings)
124 |       res.curl
125 | 
126 |       if options[:info]
127 |         output.push(res.headers)
128 |         next
129 |       end
130 | 
131 |       if options[:search]
132 |         source = res.search(options[:search], return_source: true)
133 | 
134 |         out = res.parse(source)
135 | 
136 |         if options[:query]
137 |           out = out.to_data(url: url, clean: options[:clean]).dot_query(options[:query], full_tag: false)
138 |         else
139 |           out = out.to_data
140 |         end
141 |         output.push([out])
142 |       elsif options[:query]
143 |         queried = res.to_data.dot_query(options[:query], full_tag: false)
144 |         output.push(queried) if queried
145 |       else
146 |         output.push(res.to_data(url: url))
147 |       end
148 |     end
149 |     output.delete_if(&:nil?)
150 |     output.delete_if(&:empty?)
151 | 
152 |     exit_now!('No results') if output.nil? || output.empty?
153 | 
154 |     output.map! { |o| o[options[:raw].to_sym] } if options[:raw]
155 | 
156 |     output = output.clean_output
157 | 
158 |     print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
159 |   end
160 | end
161 | 
desc 'Execute JavaScript on a URL'
arg_name 'URL', multiple: true
command :execute do |c|
  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Script to execute, use - to read from STDIN'
  c.flag %i[s script], multiple: true

  c.desc 'Element ID to wait for before executing'
  c.flag %i[i id]

  c.desc 'Seconds to wait after executing JS'
  c.flag %i[w wait], default_value: 2

  c.action do |_, options, args|
    urls = args.join(' ').split(/[, ]+/)

    # FIX: --script is a `multiple` flag, so options[:script] is always an
    # Array (truthy even when empty) and the original guard — which also
    # referenced a nonexistent options[:file] — could never raise. Require
    # at least one script source explicitly.
    raise 'Script input required' if options[:script].nil? || options[:script].empty?

    # Each entry may be '-' (read STDIN), an existing file path (read the
    # file), or literal JavaScript
    compiled_script = options[:script].map do |scr|
      scr = scr.strip
      if scr == '-'
        $stdin.read
      elsif File.exist?(File.expand_path(scr))
        IO.read(File.expand_path(scr))
      else
        scr
      end
    end

    script = compiled_script.empty? ? nil : compiled_script.join(';')

    headers = break_headers(options[:header])

    # The browser flag may arrive pre-normalized (Symbol) or as a raw string
    browser = options[:browser]
    browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type

    urls.each do |url|
      c = Curl::Html.new(url)
      c.headers = headers
      c.browser = browser
      $stdout.puts c.execute(script, options[:wait], options[:id])
    end
  end
end
216 | 
desc 'Save a screenshot of a URL'
arg_name 'URL', multiple: true
command :screenshot do |c|
  c.desc 'Type of screenshot to save (full (requires firefox), print, visible)'
  c.flag %i[t type], type: ScreenshotType, must_match: /^[fpv].*?$/, default_value: 'visible'

  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, must_match: /^[fc].*?$/, default_value: 'chrome'

  c.desc 'File destination'
  c.flag %i[o out file], required: true

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Script to execute before taking screenshot'
  c.flag %i[s script], multiple: true

  c.desc 'Element ID to wait for before taking screenshot'
  c.flag %i[i id]

  c.desc 'Time to wait before taking screenshot'
  c.flag %i[w wait], default_value: 0, type: Integer

  c.action do |_, options, args|
    # URLs may arrive as multiple args or one comma/space-delimited string
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    type = options[:type]
    browser = options[:browser]

    # Flags may arrive pre-normalized (Symbol) or as raw strings
    type = type.is_a?(Symbol) ? type : type.normalize_screenshot_type
    browser = browser.is_a?(Symbol) ? browser : browser.normalize_browser_type

    compiled_script = []

    # Gather script sources: '-' reads STDIN, an existing path reads the
    # file, anything else is treated as literal JavaScript
    if options[:script].count.positive?
      options[:script].each do |scr|
        scr.strip!
        if scr == '-'
          compiled_script << $stdin.read
        elsif File.exist?(File.expand_path(scr))
          compiled_script << IO.read(File.expand_path(scr))
        else
          compiled_script << scr
        end
      end
    end

    # Multiple scripts are joined into one statement sequence
    script = compiled_script.count.positive? ? compiled_script.join(';') : nil

    raise 'Full page screen shots only available with Firefox' if type == :full_page && browser != :firefox

    urls.each do |url|
      c = Curl::Html.new(url)
      c.headers = headers
      c.browser = browser
      c.screenshot(options[:out], type: type, script: script, id: options[:id], wait: options[:wait])
    end
  end
end
278 | 
desc 'Get a JSON response from a URL, multiple URLs allowed'
arg_name 'URL', multiple: true
command :json do |c|
  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    output = []

    urls.each do |url|
      res = Curl::Json.new(url)
      res.request_headers = headers
      # FIX: the original line ended with a comma, turning this into a
      # multiple assignment that set `compressed` to an Array (always
      # truthy, even with --no-compressed). Plain assignment restores the
      # intended boolean.
      res.compressed = options[:compressed]
      res.symbolize_names = false
      res.curl

      json = res.json

      if json.nil?
        # Surface a structured error entry rather than crashing
        output.push({
          status: 'error parsing JSON',
          url: res.url,
          code: res.code,
          headers: res.headers
        })
      else
        if options[:query]
          if options[:query] =~ /^json$/
            # A bare 'json' query means "return the whole parsed payload".
            # FIX: the original fell through and clobbered this with
            # dot_query(query) where `query` was never assigned.
            res = json
          else
            # Allow an optional leading 'json.' prefix on the query path
            query = options[:query].sub(/^json\./, '')
            res = json.dot_query(query)
          end
        else
          res = res.to_data
        end

        output.push(res)
      end
    end

    output = output.clean_output

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
337 | 
desc 'Extract contents between two regular expressions'
arg_name 'URL', multiple: true
command :extract do |c|
  c.desc 'Text before extraction'
  c.flag %i[b before]

  c.desc 'Text after extraction'
  c.flag %i[a after]

  c.desc 'Process before/after strings as regular expressions'
  c.switch %i[r regex]

  c.desc 'Include the before/after matches in the result'
  c.switch %i[i include]

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Strip HTML tags from results'
  c.switch %i[strip]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])

    # FIX: both delimiters are required; fail with a clear message instead
    # of an opaque TypeError from Regexp.escape(nil) below
    exit_now!('--before and --after are required') if options[:before].nil? || options[:after].nil?

    output = []

    urls.each do |url|
      res = Curl::Html.new(url, { headers: headers, headers_only: false,
                                  compressed: options[:compressed], clean: options[:clean] })
      res.curl

      # Interpret delimiters as regex only with --regex; otherwise treat
      # them as literal text
      if options[:regex]
        before = Regexp.new(options[:before])
        after = Regexp.new(options[:after])
      else
        before = /#{Regexp.escape(options[:before])}/
        after = /#{Regexp.escape(options[:after])}/
      end

      extracted = res.extract(before, after, inclusive: options[:include])
      extracted.strip_tags! if options[:strip]
      output.concat(extracted)
    end

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
391 | 
desc 'Extract all instances of a tag'
arg_name 'URL', multiple: true
command :tags do |c|
  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true, arg_name: 'KEY=VAL'

  c.desc 'Specify a tag to collect'
  c.flag %i[t tag], multiple: true, arg_name: 'TAG'

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Output the HTML source of the results'
  c.switch %i[source html]

  c.desc 'Dot syntax query to filter results'
  c.flag %i[q query filter], arg_name: 'DOT_SYNTAX'

  # FIX: "Regurn" typo in user-visible help text
  c.desc 'Return an array of matches to a CSS or XPath query'
  c.flag %i[search], arg_name: 'CSS/XPATH'

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    headers = break_headers(options[:header])
    tags = options[:tag].join(' ').split(/[, ]+/)
    output = []

    urls.each do |url|
      # FIX: original passed headers_only: options[:headers], an option this
      # command never defines (always nil) — use false explicitly
      res = Curl::Html.new(url, { headers: headers, headers_only: false,
                                  compressed: options[:compressed], clean: options[:clean] })
      res.curl

      # FIX: the original reset `output = []` inside this loop, discarding
      # results from all but the last URL
      if options[:search]
        out = res.search(options[:search])

        out = out.dot_query(options[:query]) if options[:query]
        output.push(out)
      elsif options[:query]
        # Accumulate across URLs; dot_query may yield an Array or one object
        queried = res.to_data.dot_query(options[:query])
        queried.is_a?(Array) ? output.concat(queried) : output.push(queried) if queried
      elsif tags.count.positive?
        tags.each { |tag| output.concat(res.tags(tag)) }
      else
        output.concat(res.tags)
      end
    end

    output = output.clean_output

    if options[:source]
      puts output.to_html
    else
      print_out(output, global_options[:yaml], pretty: global_options[:pretty])
    end
  end
end
451 | 
desc 'Extract all images from a URL'
arg_name 'URL', multiple: true
command :images do |c|
  c.desc 'Type of images to return (img, srcset, opengraph, all)'
  c.flag %i[t type], multiple: true, type: ImageType, default_value: ['all']

  c.desc 'Expect compressed results'
  c.switch %i[c compressed]

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Define a header to send as key=value'
  c.flag %i[h header], multiple: true

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)
    # NOTE(review): headers are parsed but never passed to Curl::Html below —
    # confirm whether --header should have an effect on this command
    headers = break_headers(options[:header])

    output = []

    # --type may be given multiple times and/or comma-delimited; normalize
    # each entry to a Symbol
    types = options[:type].join(' ').split(/[ ,]+/).map(&:normalize_image_type)

    urls.each do |url|
      res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
      res.curl

      res = res.images(types: types)
      # Wrap in a hash so the dot query can address the 'images' collection
      res = { images: res }.dot_query(options[:query], 'images', full_tag: false) if options[:query]

      # Flatten array results; keep single objects as one entry
      if res.is_a?(Array)
        output.concat(res)
      else
        output.push(res)
      end
    end

    exit_now!('No results') if output.nil? || output.empty?

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
497 | 
desc %(Return all links on a URL's page)
arg_name 'URL', multiple: true
command :links do |c|
  c.desc 'Ignore relative hrefs when gathering content links'
  c.switch %i[ignore_relative], negatable: true

  c.desc 'Ignore fragment hrefs when gathering content links'
  c.switch %i[ignore_fragments], negatable: true

  c.desc 'Only gather external links'
  c.switch %i[x external_links_only], default_value: false, negatable: false

  c.desc 'Only gather internal (same-site) links'
  c.switch %i[l local_links_only], default_value: false, negatable: false

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Filter out duplicate links, preserving only first one'
  c.switch %i[d dedup], negatable: true

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      # NOTE(review): :compressed and :clean are read here but this command
      # defines no such switches, so both are always nil — confirm intent
      res = Curl::Html.new(url, {
                             compressed: options[:compressed], clean: options[:clean],
                             ignore_local_links: options[:ignore_relative],
                             ignore_fragment_links: options[:ignore_fragments],
                             external_links_only: options[:external_links_only],
                             local_links_only: options[:local_links_only]
                           })
      res.curl

      if options[:query]
        # Scope the dot query to the 'links' collection of the page data
        queried = res.to_data.dot_query(options[:query], 'links', full_tag: false)

        # dot_query may return an Array or a single object; flatten arrays
        queried.is_a?(Array) ? output.concat(queried) : output.push(queried) if queried
      else
        output.concat(res.body_links)
      end
    end

    output.dedup_links! if options[:dedup]

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
548 | 
# NOTE(review): the description below reads "all  links" — a word (likely
# "<head>") appears to have been lost; this command returns <link> elements
# from the page head (meta_links). Confirm and restore the original wording.
desc %(Return all  links on URL's page)
arg_name 'URL', multiple: true
command :headlinks do |c|
  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      # NOTE(review): :compressed and :clean are read here but this command
      # defines no such switches, so both are always nil — confirm intent
      res = Curl::Html.new(url, { compressed: options[:compressed], clean: options[:clean] })
      res.curl

      if options[:query]
        # Wrap meta_links so the dot query can address the 'links' collection
        queried = { links: res.to_data[:meta_links] }.dot_query(options[:query], 'links', full_tag: false)
        output.push(queried) if queried
      else
        output.push(res.to_data[:meta_links])
      end
    end

    output = output.clean_output

    print_out(output, global_options[:yaml], pretty: global_options[:pretty])
  end
end
577 | 
desc %(Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.)
arg_name 'URL', multiple: true
command :scrape do |c|
  c.desc 'Browser to use (firefox, chrome)'
  c.flag %i[b browser], type: BrowserType, required: true

  # FIX: "Regurn" typo in user-visible help text
  c.desc 'Return an array of matches to a CSS or XPath query'
  c.flag %i[search]

  c.desc 'Define a header to send as "key=value"'
  c.flag %i[h header], multiple: true

  c.desc 'Remove extra whitespace from results'
  c.switch %i[clean]

  c.desc 'Filter output using dot-syntax path'
  c.flag %i[q query filter]

  c.desc 'Output a raw value for a key'
  c.flag %i[r raw]

  c.action do |global_options, options, args|
    urls = args.join(' ').split(/[, ]+/)

    output = []

    urls.each do |url|
      res = Curl::Html.new(url, { browser: options[:browser], clean: options[:clean] })
      res.curl

      if options[:search]
        out = res.search(options[:search])

        # dot_query may return nil on no match; the nil is purged below
        out = out.dot_query(options[:query], full_tag: false) if options[:query]
        output.push(out)
      elsif options[:query]
        queried = res.to_data(url: url).dot_query(options[:query], full_tag: false)
        output.push(queried) if queried
      else
        output.push(res.to_data(url: url))
      end
    end

    # FIX: purge nils (possible via the search branch above) before calling
    # :empty? on every element, which would raise NoMethodError on nil —
    # mirrors the html command's handling
    output.delete_if(&:nil?)
    output.delete_if(&:empty?)

    output = output.clean_output

    # --raw plucks a single key from each result
    output.map! { |o| o[options[:raw].to_sym] } if options[:raw]

    print_out(output, global_options[:yaml], raw: options[:raw], pretty: global_options[:pretty])
  end
end
632 | 
# GLI lifecycle hooks. All three are currently pass-throughs.
pre do |global, command, options, args|
  # Pre logic here
  # Return true to proceed; false to abort and not call the
  # chosen command
  # Use skips_pre before a command to skip this block
  # on that command only
  true
end

post do |global, command, options, args|
  # Post logic here
  # Use skips_post before a command to skip this
  # block on that command only
end

on_error do |exception|
  # Error logic here
  # return false to skip default error handling
  true
end

# Hand control to GLI; its return value becomes the process exit status
exit run(ARGV)
655 | 


--------------------------------------------------------------------------------
/curlyq.gemspec:
--------------------------------------------------------------------------------
# Gem specification for curlyq.
# Ensure we require the local version and not one we might have installed already
require File.join([File.dirname(__FILE__),'lib','curly','version.rb'])
spec = Gem::Specification.new do |s|
  s.name = 'curlyq'
  s.version = Curly::VERSION
  s.author = 'Brett Terpstra'
  s.email = 'me@brettterpstra.com'
  s.homepage = 'https://brettterpstra.com'
  s.platform = Gem::Platform::RUBY
  s.licenses = 'MIT'
  s.summary = 'A CLI helper for curl and web scraping'
  # Package exactly what git tracks; the string below contains a literal
  # newline used as the split delimiter
  s.files = `git ls-files`.split("
")
  s.require_paths << 'lib'
  s.extra_rdoc_files = ['README.rdoc','curlyq.rdoc']
  s.rdoc_options << '--title' << 'curlyq' << '--main' << 'README.rdoc' << '-ri'
  s.bindir = 'bin'
  s.executables << 'curlyq'
  # Development-only dependencies (tests, docs, progress UI)
  s.add_development_dependency('rake','~> 13.0', '>= 13.0.1')
  s.add_development_dependency('rdoc', '~> 6.3.1')
  s.add_development_dependency('test-unit', '~> 3.4.4')
  s.add_development_dependency('yard', '~> 0.9', '>= 0.9.26')
  s.add_development_dependency('tty-spinner', '~> 0.9', '>= 0.9.3')
  s.add_development_dependency('tty-progressbar', '~> 0.18', '>= 0.18.2')
  s.add_development_dependency('pastel', '~> 0.8.0')
  s.add_development_dependency('parallel_tests', '~> 3.7', '>= 3.7.3')
  # Runtime dependencies: CLI framework, executable lookup, HTML parsing,
  # and browser automation for dynamic pages
  s.add_runtime_dependency('gli','~> 2.21.0')
  s.add_runtime_dependency('tty-which','~> 0.5.0')
  s.add_runtime_dependency('nokogiri','~> 1.16.0')
  s.add_runtime_dependency('selenium-webdriver', '~> 4.16.0')
end
32 | 


--------------------------------------------------------------------------------
/curlyq.rdoc:
--------------------------------------------------------------------------------
  1 | == curlyq - A scriptable interface to curl
  2 | 
  3 | v0.0.1
  4 | 
  5 | === Global Options
  6 | === --help
  7 | Show this message
  8 | 
  9 | 
 10 | 
 11 | === --[no-]pretty
 12 | Output "pretty" JSON
 13 | 
 14 | 
 15 | 
 16 | === --version
 17 | Display the program version
 18 | 
 19 | 
 20 | 
 21 | === -y|--[no-]yaml
 22 | Output YAML instead of json
 23 | 
 24 | 
 25 | 
 26 | === Commands
 27 | ==== Command: extract  URL...
 28 | Extract contents between two regular expressions
 29 | 
 30 | 
 31 | ===== Options
 32 | ===== -a|--after arg
 33 | 
 34 | Text after extraction, parsed as regex
 35 | 
 36 | [Default Value] None
 37 | 
 38 | 
 39 | ===== -b|--before arg
 40 | 
 41 | Text before extraction, parsed as regex
 42 | 
 43 | [Default Value] None
 44 | 
 45 | 
 46 | ===== -h|--header arg
 47 | 
 48 | Define a header to send as key=value
 49 | 
 50 | [Default Value] None
 51 | 
 52 | 
 53 | ===== -c|--[no-]compressed
 54 | Expect compressed results
 55 | 
 56 | 
 57 | 
 58 | ===== --[no-]clean
 59 | Remove extra whitespace from results
 60 | 
 61 | 
 62 | 
 63 | ===== --[no-]strip
 64 | Strip HTML tags from results
 65 | 
 66 | 
 67 | 
 68 | ==== Command: headlinks  URL...
 69 | Return all <head> links on URL's page
 70 | 
 71 | 
 72 | ===== Options
 73 | ===== -q|--query|--filter arg
 74 | 
 75 | Filter output using dot-syntax path
 76 | 
 77 | [Default Value] None
 78 | 
 79 | 
 80 | ==== Command: help  command
 81 | Shows a list of commands or help for one command
 82 | 
 83 | Gets help for the application or its commands. Can also list the commands in a way helpful to creating a bash-style completion function
 84 | ===== Options
 85 | ===== -c
 86 | List commands one per line, to assist with shell completion
 87 | 
 88 | 
 89 | 
 90 | ==== Command: html|curl  URL...
 91 | Curl URL and output its elements, multiple URLs allowed
 92 | 
 93 | 
 94 | ===== Options
 95 | ===== -b|--browser arg
 96 | 
 97 | Use a browser to retrieve a dynamic web page (firefox, chrome)
 98 | 
 99 | [Default Value] None
100 | [Must Match] (?-mix:^[fc].*?$)
101 | 
102 | 
103 | ===== -f|--fallback arg
104 | 
105 | If curl doesn't work, use a fallback browser (firefox, chrome)
106 | 
107 | [Default Value] None
108 | [Must Match] (?-mix:^[fc].*?$)
109 | 
110 | 
111 | ===== -h|--header arg
112 | 
113 | Define a header to send as "key=value"
114 | 
115 | [Default Value] None
116 | 
117 | 
118 | ===== -q|--query|--filter arg
119 | 
120 | Filter output using dot-syntax path
121 | 
122 | [Default Value] None
123 | 
124 | 
125 | ===== -r|--raw arg
126 | 
127 | Output a raw value for a key
128 | 
129 | [Default Value] None
130 | 
131 | 
132 | ===== --search arg
133 | 
134 | Return an array of matches to a CSS or XPath query
135 | 
136 | [Default Value] None
137 | 
138 | 
139 | ===== -I|--info
140 | Only retrieve headers/info
141 | 
142 | 
143 | 
144 | ===== -c|--compressed
145 | Expect compressed results
146 | 
147 | 
148 | 
149 | ===== --[no-]clean
150 | Remove extra whitespace from results
151 | 
152 | 
153 | 
154 | ===== --[no-]ignore_fragments
155 | Ignore fragment hrefs when gathering content links
156 | 
157 | 
158 | 
159 | ===== --[no-]ignore_relative
160 | Ignore relative hrefs when gathering content links
161 | 
162 | 
163 | 
164 | ===== -x|--external_links_only
165 | Only gather external links
166 | 
167 | 
168 | 
169 | ==== Command: images  URL...
170 | Extract all images from a URL
171 | 
172 | 
173 | ===== Options
174 | ===== -t|--type arg
175 | 
176 | Type of images to return (img, srcset, opengraph, all)
177 | 
178 | [Default Value] ["all"]
179 | 
180 | 
181 | ===== -c|--[no-]compressed
182 | Expect compressed results
183 | 
184 | 
185 | 
186 | ===== --[no-]clean
187 | Remove extra whitespace from results
188 | 
189 | 
190 | 
191 | ==== Command: json  URL...
192 | Get a JSON response from a URL, multiple URLs allowed
193 | 
194 | 
195 | ===== Options
196 | ===== -h|--header arg
197 | 
198 | Define a header to send as key=value
199 | 
200 | [Default Value] None
201 | 
202 | 
203 | ===== -q|--query|--filter arg
204 | 
205 | Filter output using dot-syntax path
206 | 
207 | [Default Value] None
208 | 
209 | 
210 | ===== -c|--[no-]compressed
211 | Expect compressed results
212 | 
213 | 
214 | 
215 | ==== Command: links  URL...
216 | Return all links on a URL's page
217 | 
218 | 
219 | ===== Options
220 | ===== -q|--query|--filter arg
221 | 
222 | Filter output using dot-syntax path
223 | 
224 | [Default Value] None
225 | 
226 | 
227 | ===== -d|--[no-]dedup
228 | Filter out duplicate links, preserving only first one
229 | 
230 | 
231 | 
232 | ===== --[no-]ignore_fragments
233 | Ignore fragment hrefs when gathering content links
234 | 
235 | 
236 | 
237 | ===== --[no-]ignore_relative
238 | Ignore relative hrefs when gathering content links
239 | 
240 | 
241 | 
242 | ===== -x|--external_links_only
243 | Only gather external links
244 | 
245 | 
246 | 
247 | ==== Command: scrape  URL...
248 | Scrape a page using a web browser, for dynamic (JS) pages. Be sure to have the selected --browser installed.
249 | 
250 | 
251 | ===== Options
252 | ===== -b|--browser arg
253 | 
254 | Browser to use (firefox, chrome)
255 | 
256 | [Default Value] None
257 | 
258 | 
259 | ===== -h|--header arg
260 | 
261 | Define a header to send as "key=value"
262 | 
263 | [Default Value] None
264 | 
265 | 
266 | ===== -q|--query|--filter arg
267 | 
268 | Filter output using dot-syntax path
269 | 
270 | [Default Value] None
271 | 
272 | 
273 | ===== -r|--raw arg
274 | 
275 | Output a raw value for a key
276 | 
277 | [Default Value] None
278 | 
279 | 
280 | ===== --search arg
281 | 
282 | Return an array of matches to a CSS or XPath query
283 | 
284 | [Default Value] None
285 | 
286 | 
287 | ===== --[no-]clean
288 | Remove extra whitespace from results
289 | 
290 | 
291 | 
292 | ==== Command: screenshot  URL...
293 | Save a screenshot of the URL
294 | 
295 | 
296 | ===== Options
297 | ===== -b|--browser arg
298 | 
299 | Browser to use (firefox, chrome)
300 | 
301 | [Default Value] chrome
302 | [Must Match] (?-mix:^[fc].*?$)
303 | 
304 | 
305 | ===== -o|--out|--file arg
306 | 
307 | File destination
308 | 
309 | [Default Value] None
310 | 
311 | 
312 | ===== -t|--type arg
313 | 
314 | Type of screenshot to save (full (requires firefox), print, visible)
315 | 
316 | [Default Value] full
317 | [Must Match] (?-mix:^[fpv].*?$)
318 | 
319 | 
320 | ==== Command: tags  URL...
321 | Extract all instances of a tag
322 | 
323 | 
324 | ===== Options
325 | ===== -h|--header arg
326 | 
327 | Define a header to send as key=value
328 | 
329 | [Default Value] None
330 | 
331 | 
332 | ===== -q|--query|--search arg
333 | 
334 | CSS/XPath query
335 | 
336 | [Default Value] None
337 | 
338 | 
339 | ===== -t|--tag arg
340 | 
341 | Specify a tag to collect
342 | 
343 | [Default Value] None
344 | 
345 | 
346 | ===== -c|--[no-]compressed
347 | Expect compressed results
348 | 
349 | 
350 | 
351 | ===== --[no-]clean
352 | Remove extra whitespace from results
353 | 
354 | 
355 | 
356 | 


--------------------------------------------------------------------------------
/lib/curly.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require 'curly/version'
 4 | require 'curly/hash'
 5 | require 'curly/string'
 6 | require 'curly/array'
 7 | require 'curly/numeric'
 8 | require 'json'
 9 | require 'yaml'
10 | require 'uri'
11 | require 'tty-which'
12 | require 'nokogiri'
13 | require 'selenium-webdriver'
14 | 


--------------------------------------------------------------------------------
/lib/curly/array.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | # Array helpers
  4 | class ::Array
  5 |   ##
  6 |   ## Remove extra spaces from each element of an array of
  7 |   ## strings
  8 |   ##
  9 |   ## @return     [Array] cleaned array
 10 |   ##
 11 |   def clean
 12 |     map(&:clean)
 13 |   end
 14 | 
 15 |   ##
 16 |   ## @see #clean
 17 |   ##
 18 |   def clean!
 19 |     replace clean
 20 |   end
 21 | 
 22 |   ##
 23 |   ## Strip HTML tags from each element of an array of
 24 |   ## strings
 25 |   ##
 26 |   ## @return     [Array] array of strings with HTML tags removed
 27 |   ##
 28 |   def strip_tags
 29 |     map(&:strip_tags)
 30 |   end
 31 | 
 32 |   ##
 33 |   ## Destructive version of #strip_tags
 34 |   ##
 35 |   ## @see #strip_tags
 36 |   ##
 37 |   def strip_tags!
 38 |     replace strip_tags
 39 |   end
 40 | 
 41 |   ##
 42 |   ## Remove duplicate links from an array of link objects
 43 |   ##
 44 |   ## @return     [Array] deduped array of link objects
 45 |   ##
 46 |   def dedup_links
 47 |     used = []
 48 |     good = []
 49 |     each do |link|
 50 |       href = link[:href].sub(%r{/$}, '')
 51 |       next if used.include?(href)
 52 | 
 53 |       used.push(href)
 54 |       good.push(link)
 55 |     end
 56 | 
 57 |     good
 58 |   end
 59 | 
 60 |   ##
 61 |   ## Destructive version of #dedup_links
 62 |   ##
 63 |   ## @see #dedup_links
 64 |   ##
 65 |   def dedup_links!
 66 |     replace dedup_links
 67 |   end
 68 | 
 69 |   ##
 70 |   ## Run a query on array elements
 71 |   ##
 72 |   ## @param      path [String] dot.syntax path to compare
 73 |   ##
 74 |   ## @return [Array] elements matching dot query
 75 |   ##
 76 |   def dot_query(path)
 77 |     res = map { |el| el.dot_query(path) }
 78 |     res.delete_if { |r| !r }
 79 |     res.delete_if(&:empty?)
 80 |     res
 81 |   end
 82 | 
 83 |   ##
 84 |   ## Gets the value of every item in the array
 85 |   ##
 86 |   ## @param      path  The query path (dot syntax)
 87 |   ##
 88 |   ## @return     [Array] array of values
 89 |   ##
 90 |   def get_value(path)
 91 |     map { |el| el.get_value(path) }
 92 |   end
 93 | 
 94 |   ##
 95 |   ## Convert every item in the array to HTML
 96 |   ##
 97 |   ## @return     [String] Html representation of the object.
 98 |   ##
 99 |   def to_html
100 |     map(&:to_html)
101 |   end
102 | 
103 |   ##
104 |   ## Test if a tag contains an attribute matching filter
105 |   ## queries
106 |   ##
107 |   ## @param      tag_name    [String] The tag name
108 |   ## @param      classes     [String] The classes to match
109 |   ## @param      id          [String] The id attribute to
110 |   ##                         match
111 |   ## @param      attribute   [String] The attribute
112 |   ## @param      operator    [String] The operator, <>= *=
113 |   ##                         $= ^=
114 |   ## @param      value       [String] The value to match
115 |   ## @param      descendant  [Boolean] Check descendant tags
116 |   ##
117 |   ## @return     [Boolean] tag matches
118 |   ##
119 |   def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
120 |     tag = self
121 |     keep = true
122 | 
123 |     keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
124 | 
125 |     if tag.key?('attrs') && tag['attrs']
126 |       if keep && id
127 |         tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
128 |         keep = tag_id && tag_id =~ /#{id}/i
129 |       end
130 | 
131 |       if keep && classes
132 |         cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
133 |         if cls
134 |           all = true
135 |           classes.each { |c| all = cls['value'].include?(c) }
136 |           keep = all
137 |         else
138 |           keep = false
139 |         end
140 |       end
141 | 
142 |       if keep && attribute
143 |         attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
144 |         any = false
145 |         attributes.each do |a|
146 |           break if any
147 | 
148 |           any = case operator
149 |                 when /^*/
150 |                   a['value'] =~ /#{value}/i
151 |                 when /^\^/
152 |                   a['value'] =~ /^#{value}/i
153 |                 when /^\$/
154 |                   a['value'] =~ /#{value}$/i
155 |                 else
156 |                   a['value'] =~ /^#{value}$/i
157 |                 end
158 |         end
159 |         keep = any
160 |       end
161 |     end
162 | 
163 |     return false if descendant && !keep
164 | 
165 |     if !descendant && tag.key?('tags')
166 |       tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
167 |       tags.count.positive?
168 |     else
169 |       keep
170 |     end
171 |   end
172 | 
173 |   ##
174 |   ## Clean up output, shrink single-item arrays, ensure array output
175 |   ##
176 |   ## @return [Array] cleaned up array
177 |   ##
178 |   def clean_output
179 |     output = dup
180 |     while output.is_a?(Array) && output.count == 1
181 |       output = output[0]
182 |     end
183 |     return [] unless output
184 | 
185 |     output.ensure_array
186 |   end
187 | 
188 |   ##
189 |   ## Ensure that an object is an array
190 |   ##
191 |   ## @return     [Array] object as Array
192 |   ##
193 |   def ensure_array
194 |     return self
195 |   end
196 | end
197 | 


--------------------------------------------------------------------------------
/lib/curly/curl.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | # import
4 | require_relative 'curl/html'
5 | 
6 | # import
7 | require_relative 'curl/json'
8 | 


--------------------------------------------------------------------------------
/lib/curly/curl/html.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Curl
  4 |   # String helpers
  class ::String
    ##
    ## Replace non-breaking-space entities with plain spaces.
    ##
    ## FIX: the previous pattern was `gsub(/ /, ' ')` — a space-for-space
    ## no-op; the pattern was evidently meant to be the &nbsp; entity.
    ##
    ## @return [String] text with &nbsp; entities replaced by spaces
    ##
    def remove_entities
      gsub(/&nbsp;/, ' ')
    end
  end
 10 | 
 11 |   # Class for CURLing an HTML page
 12 |   class Html
 13 |     attr_accessor :settings, :browser, :source, :headers, :headers_only, :compressed, :clean, :fallback,
 14 |                   :ignore_local_links, :ignore_fragment_links, :external_links_only, :local_links_only
 15 | 
 16 |     attr_reader :url, :code, :meta, :links, :head, :body,
 17 |                 :title, :description, :body_links, :body_images
 18 | 
 19 |     # Convert self to a hash of data
 20 |     #
 21 |     # @param      url   [String]  A base url to fall back to
 22 |     #
 23 |     # @return     [Hash] a hash of data
 24 |     #
 25 |     def to_data(url: nil)
 26 |       {
 27 |         url: @url || url,
 28 |         code: @code,
 29 |         headers: @headers,
 30 |         meta: @meta,
 31 |         meta_links: @links,
 32 |         head: @clean ? @head&.strip&.clean : @head,
 33 |         body: @clean ? @body&.strip&.clean : @body,
 34 |         source: @clean ? @source&.strip&.clean : @source,
 35 |         title: @title,
 36 |         description: @description,
 37 |         links: @body_links,
 38 |         images: @body_images
 39 |       }
 40 |     end
 41 | 
 42 |     ##
 43 |     ## Create a new page object from a URL
 44 |     ##
 45 |     ## @param      url      [String] The url
 46 |     ## @param      options  [Hash] The options
 47 |     ##
 48 |     ## @option options :browser [Symbol] the browser to use instead of curl (:chrome, :firefox)
 49 |     ## @option options :source [String] source provided instead of curl
 50 |     ## @option options :headers [Hash] headers to send in the request
 51 |     ## @option options :headers_only [Boolean] whether to return just response headers
 52 |     ## @option options :compressed [Boolean] expect compressed response
 53 |     ## @option options :clean [Boolean] clean whitespace from response
 54 |     ## @option options :fallback [Symbol] browser to fall back to if curl doesn't work (:chrome, :firefox)
 55 |     ## @option options :ignore_local_links [Boolean] when collecting links, ignore local/relative links
 56 |     ## @option options :ignore_fragment_links [Boolean] when collecting links, ignore links that are just #fragments
 57 |     ## @option options :external_links_only [Boolean] only collect links outside of current site
 58 |     ##
 59 |     ## @return     [HTMLCurl] new page object
 60 |     ##
 61 |     def initialize(url, options = {})
 62 |       @browser = options[:browser] || :none
 63 |       @source = options[:source]
 64 |       @headers = options[:headers] || {}
 65 |       @headers_only = options[:headers_only]
 66 |       @compressed = options[:compressed]
 67 |       @clean = options[:clean]
 68 |       @fallback = options[:fallback]
 69 |       @ignore_local_links = options[:ignore_local_links]
 70 |       @ignore_fragment_links = options[:ignore_fragment_links]
 71 |       @external_links_only = options[:external_links_only]
 72 |       @local_links_only = options[:local_links_only]
 73 | 
 74 |       @curl = TTY::Which.which('curl')
 75 |       @url = url.nil? ? options[:url] : url
 76 |     end
 77 | 
 78 |     ##
 79 |     # Parse raw HTML source instead of curling
 80 |     #
 81 |     # @param      source  [String] The source
 82 |     #
 83 |     #
 84 |     # @return     [Hash] Hash of data after processing #
 85 |     #
 86 |     def parse(source)
 87 |       @body = source
 88 |       { url: @url, code: @code, headers: @headers, meta: @meta, links: @links, head: @head, body: source,
 89 |         source: source.strip, body_links: content_links, body_images: content_images }
 90 |     end
 91 | 
 92 |     ##
 93 |     ## Curl a url, either with curl or Selenium based on browser settings
 94 |     ##
    def curl
      # Choose the retrieval strategy:
      # 1. a browser was requested -> drive it via Selenium, then parse the source
      # 2. no url but a source string was supplied -> parse the source directly
      # 3. otherwise -> plain curl of @url (with optional browser fallback)
      res = if @url && @browser && @browser != :none
              source = curl_dynamic_html
              curl_html(nil, source: source, headers: @headers)
            elsif url.nil? && !source.nil?
              curl_html(nil, source: @source, headers: @headers, headers_only: @headers_only,
                             compressed: @compressed, fallback: false)
            else
              curl_html(@url, headers: @headers, headers_only: @headers_only,
                              compressed: @compressed, fallback: @fallback)
            end
      # Populate reader attributes from the result hash
      @url = res[:url]
      @code = res[:code]
      @headers = res[:headers]
      @meta = res[:meta]
      @links = res[:links]
      @head = res[:head] unless res[:head].nil?
      # NOTE(review): reencode is defined elsewhere in this class (not visible
      # here) — presumably normalizes the body's character encoding
      @body = reencode(res[:body])
      @source = res[:source]
      # Prefer OpenGraph metadata, falling back to standard tags
      @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
      @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
      @body_links = content_links
      @body_images = content_images
    end
119 | 
120 |     ##
121 |     ## Save a screenshot of the url
122 |     ##
123 |     ## @param      urls         [Array] The urls
124 |     ## @param      destination  The file destination
125 |     ## @param      browser      The browser (:firefox,
126 |     ##                          :chrome)
127 |     ## @param      type         The type of screenshot to
128 |     ##                          save (:full_page,
129 |     ##                          :print_page, :visible)
130 |     ##
    # Thin wrapper around #save_screenshot; maps the public `wait` option
    # onto save_screenshot's `wait_seconds` keyword.
    def screenshot(destination = nil, type: :full_page, script: nil, id: nil, wait: 0)
      # full_page = type.to_sym == :full_page
      # print_page = type.to_sym == :print_page
      save_screenshot(destination, type: type, script: script, id: id, wait_seconds: wait)
    end
136 | 
137 |     ##
138 |     ## @brief      Execute JavaScript
139 |     ##
140 |     ## @param      script  The script to run
141 |     ##
    # Delegates to #run_js: loads @url in a Selenium-driven browser,
    # optionally waits for element_id to appear, executes `script`,
    # then sleeps `wait` seconds before quitting the driver.
    def execute(script, wait, element_id)
      run_js(script, wait, element_id)
    end
145 | 
146 |     ##
147 |     ## Extract text between two regular expressions
148 |     ##
149 |     ## @param      before  [String, Regexp] The before
150 |     ## @param      after   [String, Regexp] The after
151 |     ##
152 |     ## @return     [Array] array of matches
153 |     ##
154 |     def extract(before, after, inclusive: false)
155 |       before = /#{Regexp.escape(before)}/ unless before.is_a?(Regexp)
156 |       after = /#{Regexp.escape(after)}/ unless after.is_a?(Regexp)
157 |       rx = if inclusive
158 |              /(#{before.source}.*?#{after.source})/m
159 |            else
160 |              /(?<=#{before.source})(.*?)(?=#{after.source})/m
161 |            end
162 |       @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
163 |     end
164 | 
165 |     ##
166 |     ## Extract an array of tags or tag attributes
167 |     ##
168 |     ## @param      tag        [String] The tag
169 |     ## @param      attribute  [String] The attribute
170 |     ## @param      source     [Boolean] Return full tag source
171 |     ##                        (negates attribute if true)
172 |     ## @param      content    [Boolean] Return only tag
173 |     ##                        contents
174 |     ##
175 |     ## @return     [Hash, Array] if source, return array of full
176 |     ##             tags, if content, return array of tag contents,
177 |     ##             otherwise, return a hash of tags including
178 |     ##             attributes and content
179 |     ##
180 |     ## If attribute is not given, tag contents will be returned
181 |     ##
182 |     ## @example    page.extract_tag('h1') => [Array of h1 tag
183 |     ## contents]
184 |     ## @example    page.extract_tag('img', 'src') => [Array of img
185 |     ## src attributes]
186 |     ##
    # NOTE(review): this region is corrupted in the dump — the inner line
    # numbering jumps (195 -> 221 -> 325), regex named captures have been
    # stripped, and the bodies of extract_tag/extract_tag_contents are
    # partially elided. Recover the original from the upstream gem source
    # before editing; the code below is NOT valid Ruby as-is.
    def extract_tag(tag, attribute = nil, source: false, content: false)
      res = extract_tag_contents(tag, source: true)

      return res if source

      res.map! do |tag_source|
        m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
        attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] }
        tags = tag_source.match(/<.*?>(?.*?)(?:.*?)?}) if source

      @body.scan(/<#{tag}.*?>(.*?))
      ].join(' ')
    end
327 | 
328 |     ##
329 |     ## Return all headers of given level
330 |     ##
331 |     ## @param      level  [Number] The level (1-6)
332 |     ##
333 |     ## @return [Array] array of headers with text and all tag attributes as symbols
334 |     ##
    # Return all headers (h1-h6) of the given level as hashes with :level,
    # :text, and one symbol key per tag attribute.
    #
    # NOTE(review): the scan regex here is corrupted — the named capture
    # groups (?<level>...), (?<tag>...), (?<text>...) and the literal <h
    # markup were stripped during extraction. The body references m['level'],
    # m['tag'], and m['text'], so the pattern must be restored from upstream.
    def h(level = '\d')
      res = []
      headlines = @body.to_enum(:scan, %r{#{level})(? .*?)?>(?.*?)}i).map do
        Regexp.last_match
      end
      headlines.each do |m|
        headline = { level: m['level'] }
        if m['tag'].nil?
          attrs = nil
        else
          # NOTE(review): attribute-scanning regex also lost its named
          # captures (?<attr>...), (?<quot>...), (?<content>...)
          attrs = m['tag'].to_enum(:scan, /(?\w+)=(?["'])(?.*?)\k/).map { Regexp.last_match }
          attrs.each { |a| headline[a['attr'].to_sym] = a['content'] }
        end
        headline[:text] = m['text'].remove_entities
        res << headline
      end
      res
    end
353 | 
354 |     ##
355 |     ## Convert a nokogiri element to Curl::Html format
356 |     ##
357 |     ## @param      el    [Nokogiri] element to convert
358 |     ##
359 |     def nokogiri_to_tag(el)
360 |       attributes = {}
361 |       attributes = el.attribute_nodes.each_with_object({}) do |a, hsh|
362 |         hsh[a.name] = a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value
363 |       end
364 | 
365 |       {
366 |         tag: el.name,
367 |         source: @clean ? el.to_html&.strip&.clean : el.to_html,
368 |         attrs: attributes,
369 |         content: @clean ? el.text&.strip&.clean : el.text.strip,
370 |         tags: recurse_children(el)
371 |       }
372 |     end
373 | 
374 |     def recurse_children(element)
375 |       children = []
376 |       element.children.each do |child|
377 |         next if child.name == 'text'
378 | 
379 |         children.push(nokogiri_to_tag(child))
380 |       end
381 |       children
382 |     end
383 | 
384 |     #-------------------------------------------------------
385 |     ## Perform a CSS query using Nokogiri
386 |     ##
387 |     ## @param      path  [String]  The CSS path
388 |     ##
389 |     ## @return     [Array] array of matched elements
390 |     ##
391 |     def search(path, source: @source, return_source: false)
392 |       doc = Nokogiri::HTML(source)
393 |       output = []
394 |       if return_source
395 |         output = doc.search(path).to_html
396 |       else
397 |         doc.search(path).each do |el|
398 |           out = nokogiri_to_tag(el)
399 |           output.push(out)
400 |         end
401 |       end
402 |       output
403 |     end
404 | 
405 |     private
406 | 
407 |     ##
408 |     ## Flatten the array of tags
409 |     ##
410 |     ## @param      tags  [Array] Document tags
411 |     ##
412 |     def flatten_tags(tags)
413 |       flattened = []
414 | 
415 |       tags.each do |t|
416 |         flattened << { tag: t[:tag], attrs: t[:attrs],
417 |                        content: @clean ? t[:content]&.strip&.clean : t[:content]&.strip }
418 |         flattened.concat(flatten_tags(t[:tags])) unless t[:tags].nil?
419 |       end
420 | 
421 |       flattened
422 |     end
423 | 
424 |     ##
425 |     ## Return an array of all tags in the content
426 |     ##
427 |     ## @param      content  [String] The content to parse
428 |     ##
    # Recursively parse `content` into an array of tag hashes
    # (:tag, :source, :attrs, :content, :tags).
    #
    # NOTE(review): both regexes here are corrupted — the named captures
    # (?<tag>...), (?<attrs>...), (?<content>...), (?<key>...), (?<value>...)
    # were stripped during extraction, yet the body indexes matches by those
    # names. Restore the patterns from the upstream gem before editing.
    def content_tags(content)
      return nil if content.nil?

      res = content.to_enum(:scan, %r{(?mix)
        <(?(?!\s[^>]+)?
        (?:\s*/>|>(?.*?)>)}).map { Regexp.last_match }
      res.map do |tag|
        if tag['attrs'].nil?
          attrs = nil
        else
          attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
                                             (?[@a-z0-9-]+)(?:=(?["'])
                                             (?[^"']+)\k|[ >])?/i).map { Regexp.last_match }
          # Valueless attributes map to nil; class/rel split into arrays
          attributes = attrs.each_with_object({}) do |a, hsh|
            if a['value'].nil?
              hsh[a['key']] = nil
            else
              hsh[a['key']] = a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value']
            end
          end
        end
        {
          tag: tag['tag'],
          source: tag.to_s,
          attrs: attributes,
          content: @clean ? tag['content']&.clean : tag['content'],
          tags: content_tags(tag['content'])
        }
      end
    end
459 | 
460 |     ##
461 |     ## Extract all meta tags from the document head
462 |     ##
463 |     ## @param      head [String] The head content
464 |     ##
465 |     ## @return     [Hash] hash of meta tags and values
466 |     ##
    # Extract title, refresh URL, and all meta name/property/http-equiv
    # values from the document head.
    #
    # NOTE(review): two patterns are corrupted — the title lookarounds lost
    # their <title>/</title> literals and head.scan(//) lost its <meta...>
    # pattern. Restore from upstream before editing.
    def meta_tags(head)
      meta = {}
      title = head.match(%r{(?<=)(.*?)(?=)})
      meta['title'] = title.nil? ? nil : title[1]
      refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
      url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
      # NOTE(review): this stores a MatchData (or nil), not a URL string —
      # confirm whether callers expect url[1] here
      meta['refresh_url'] = url
      meta_tags = head.scan(//)
      meta_tags.each do |tag|
        meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
        next if meta_name.nil?

        meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
        next if meta_value.nil?

        # Keys are lowercased meta names/properties
        meta[meta_name[2].downcase] = meta_value[2]
      end
      meta
    rescue StandardError => e
      # Parsing failures degrade to an empty hash rather than aborting
      warn e
      {}
    end
489 | 
490 |     ##
491 |     ## Extract all  tags from head
492 |     ##
493 |     ## @param      head  [String] The head content
494 |     ##
495 |     ## @return     [Array] Array of links
496 |     ##
    # Extract <link> tags from the head as hashes of :rel, :href, :type,
    # :title, honoring the instance's link-filtering flags.
    #
    # NOTE(review): head.scan(//) is corrupted — the <link...> pattern was
    # stripped during extraction; restore from upstream before editing.
    def link_tags(head)
      links = []
      link_tags = head.scan(//)
      link_tags.each do |tag|
        link_rel = tag.match(/rel=(['"])(.*?)\1/)
        link_rel = link_rel.nil? ? nil : link_rel[2]

        # Preload hints are never interesting as content links
        next if link_rel =~ /preload/

        link_href = tag.match(/href=(["'])(.*?)\1/)
        next if link_href.nil?

        link_href = link_href[2]

        if @local_links_only
          # Same-origin mode: keep only local links (optionally no fragments)
          next if @ignore_fragment_links && link_href =~ /^#/

          next unless same_origin?(link_href)

        else
          # Default mode: apply fragment/relative/external filters
          next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

          next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

          next if same_origin?(link_href) && @external_links_only

        end

        link_title = tag.match(/title=(['"])(.*?)\1/)
        link_title = link_title.nil? ? nil : link_title[2]

        link_type = tag.match(/type=(['"])(.*?)\1/)
        link_type = link_type.nil? ? nil : link_type[2]

        links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
      end
      links
    end
535 | 
536 |     ##
537 |     ## Get all links in the body of the page
538 |     ##
539 |     ## rel and class are returned as arrays
540 |     ##
541 |     ## @return     [Array] array of links with href, title,
542 |     ##             rel, content and class
543 |     ##
    # Collect all anchor links in the body as hashes of :href, :title,
    # :rel, :content, :class (rel and class as arrays), honoring the
    # instance's link-filtering flags.
    #
    # NOTE(review): the scan pattern is corrupted — the <a literal and the
    # named captures (?<tag>...), (?<text>...) were stripped during
    # extraction, yet the body indexes m['tag'] and m['text']. Restore the
    # pattern from upstream before editing.
    def content_links
      links = []

      link_tags = @body.to_enum(:scan, %r{.*?)>(?.*?)}).map { Regexp.last_match }
      link_tags.each do |m|
        href = m['tag'].match(/href=(["'])(.*?)\1/)
        href = href[2] unless href.nil?
        if @local_links_only
          # Same-origin mode: keep only local links (optionally no fragments)
          next if href =~ /^#/ && @ignore_fragment_links

          next unless same_origin?(href)

        else
          # Default mode: apply fragment/relative/external filters
          next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

          next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

          next if same_origin?(href) && @external_links_only

        end

        title = m['tag'].match(/title=(["'])(.*?)\1/)
        title = title[2] unless title.nil?
        rel = m['tag'].match(/rel=(["'])(.*?)\1/)
        rel = rel[2].split(/ +/) unless rel.nil?
        link_class = m['tag'].match(/class=(["'])(.*?)\1/)
        link_class = link_class[2].split(/ +/) unless link_class.nil?
        text = m['text'].remove_entities
        link = {
          href: href,
          title: title,
          rel: rel,
          content: text,
          class: link_class
        }
        links << link
      end
      links
    end
583 | 
584 |     ##
585 |     ## Get all img tags in the body of the page
586 |     ##
587 |     ## @return     [Array] array of images with src and all attributes
588 |     ##
    # Collect all img tags in the body as hashes of their attributes
    # (attribute names as symbol keys).
    #
    # NOTE(review): both scan patterns are corrupted — the <img literal and
    # the named captures (?<tag>...), (?<attr>...), (?<quot>...),
    # (?<content>...) were stripped during extraction. Restore from
    # upstream before editing.
    def content_images
      images = []
      image_tags = @body.to_enum(:scan, %r{.*?)/?>}).map { Regexp.last_match }
      image_tags.each do |m|
        attrs = m['tag'].to_enum(:scan, /(?\w+)=(?["'])(?.*?)\k/).map { Regexp.last_match }
        image = {}
        attrs.each { |a| image[a['attr'].to_sym] = a['content'] }
        images << image
      end
      images
    end
600 | 
601 |     ##
602 |     ## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
603 |     ##
604 |     ## @param      url   The url
605 |     ##
606 |     ## @return [String] page source
607 |     ##
608 |     def curl_dynamic_html
609 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
610 |       res = nil
611 | 
612 |       driver = Selenium::WebDriver.for browser
613 |       driver.manage.timeouts.implicit_wait = 4
614 |       begin
615 |         driver.get @url
616 |         res = driver.page_source
617 |       ensure
618 |         driver.quit
619 |       end
620 | 
621 |       res
622 |     end
623 | 
624 |     ##
625 |     ## Run JavaScript on a URL
626 |     ##
627 |     ## @param      script      The JavaScript to execute
628 |     ## @param      wait        Seconds to wait after executing JS
629 |     ## @param      element_id  The element identifier
630 |     ##
631 |     def run_js(script, wait_seconds = 2, element_id = nil)
632 |       raise 'No script provided' if script.nil?
633 | 
634 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
635 | 
636 |       driver = Selenium::WebDriver.for browser
637 | 
638 |       driver.manage.timeouts.implicit_wait = 15
639 |       res = nil
640 |       begin
641 |         driver.get @url
642 |         if element_id
643 |           wait = Selenium::WebDriver::Wait.new(timeout: 10) # seconds
644 |           wait.until { driver.find_element(id: element_id) }
645 |         end
646 |         res = driver.execute_script(script)
647 |         sleep wait_seconds.to_i
648 |       ensure
649 |         driver.quit
650 |       end
651 | 
652 |       warn "Executed JS on #{@url}"
653 | 
654 |       res
655 |     end
656 | 
657 |     ##
658 |     ## Save a screenshot of a url
659 |     ##
660 |     ## @param      destination  [String] File path destination
661 |     ## @param      type         [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
662 |     ##
663 |     def save_screenshot(destination = nil, type: :full_page, script: nil, wait_seconds: 0, id: nil)
664 |       raise 'No URL provided' if url.nil?
665 | 
666 |       raise 'No file destination provided' if destination.nil?
667 | 
668 |       destination = File.expand_path(destination)
669 | 
670 |       raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))
671 | 
672 |       browser = @browser.is_a?(String) ? @browser.normalize_browser_type : @browser
673 |       type = type.normalize_screenshot_type if type.is_a?(String)
674 |       raise 'Can not save full screen with Chrome, use Firefox' if type == :full_page && browser == :chrome
675 | 
676 |       destination = case type
677 |                     when :print_page
678 |                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
679 |                     else
680 |                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
681 |                     end
682 | 
683 |       driver = Selenium::WebDriver.for browser
684 |       driver.manage.timeouts.implicit_wait = 4
685 |       begin
686 |         driver.get @url
687 |         if id
688 |           wait = Selenium::WebDriver::Wait.new(timeout: 10) # seconds
689 |           wait.until { driver.find_element(id: id) }
690 |         end
691 | 
692 |         res = driver.execute_script(script) if script
693 | 
694 |         sleep wait_seconds.to_i
695 | 
696 |         case type
697 |         when :print_page
698 |           driver.save_print_page(destination)
699 |         when :full_page
700 |           driver.save_full_page_screenshot(destination)
701 |         else
702 |           driver.save_screenshot(destination)
703 |         end
704 |       ensure
705 |         driver.quit
706 |       end
707 | 
708 |       warn "Screenshot saved to #{destination}"
709 |     end
710 | 
711 |     ##
712 |     ## Curls the html for the page
713 |     ##
714 |     ## @param      url           [String] The url
715 |     ## @param      headers       [Hash] The headers
716 |     ## @param      headers_only  [Boolean] Return headers only
717 |     ## @param      compressed    [Boolean] expect compressed results
718 |     ##
719 |     ## @return     [Hash] hash of url, code, headers, meta, links, head, body, and source
720 |     ##
721 |     def curl_html(url = nil, source: nil, headers: nil,
722 |                   headers_only: false, compressed: false, fallback: false)
723 |       if !url.nil?
724 |         flags = 'SsL'
725 |         flags += @headers_only ? 'I' : 'i'
726 |         agents = [
727 |           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
728 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
729 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
730 |           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
731 |         ]
732 |         headers = @headers.nil? ? '' : @headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
733 |         compress = @compressed ? '--compressed' : ''
734 |         @source = `#{@curl} -#{flags} #{compress} #{headers} '#{@url}' 2>/dev/null`.strip.utf8
735 |         agent = 0
736 | 
737 |         while @source.nil? || @source.empty?
738 |           @source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`.strip.utf8
739 |           break if agent >= agents.count - 1
740 |         end
741 | 
742 |         unless $?.success? || @fallback
743 |           warn "Error curling #{@url}"
744 |           Process.exit 1
745 |         end
746 | 
747 |         headers = { 'location' => @url }
748 |         lines = @source.split(/\r\n/)
749 |         code = lines[0].match(/(\d\d\d)/)[1]
750 |         lines.shift
751 |         lines.each_with_index do |line, idx|
752 |           if line =~ /^([\w-]+): (.*?)$/
753 |             m = Regexp.last_match
754 |             headers[m[1]] = m[2]
755 |           else
756 |             @source = lines[idx..].join("\n")
757 |             break
758 |           end
759 |         end
760 | 
761 |         if headers['content-encoding'] =~ /gzip/i && !compressed
762 |           warn 'Response is gzipped, you may need to try again with --compressed'
763 |         end
764 | 
765 |         if headers['content-type'] =~ /json/
766 |           return { url: @url, code: code, headers: headers, meta: nil, links: nil,
767 |                    head: nil, body: @source.strip, source: @source.strip, body_links: nil, body_images: nil }
768 |         end
769 |       else
770 |         @source = source unless source.nil?
771 |       end
772 | 
773 |       @source = curl_dynamic_html(@url, @fallback, @headers) if @fallback && (@source.nil? || @source.empty?)
774 | 
775 |       return false if @source.nil? || @source.empty?
776 | 
777 |       @source.strip!
778 | 
779 |       head = @source.match(%r{(?<=)(.*?)(?=)}mi)
780 | 
781 |       if head.nil?
782 |         { url: @url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: @source.strip,
783 |           source: @source.strip, body_links: nil, body_images: nil }
784 |       else
785 |         @body = @source.match(%r{(.*?)}mi)[1]
786 |         meta = meta_tags(head[1])
787 |         links = link_tags(head[1])
788 | 
789 |         { url: @url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: @body,
790 |           source: @source.strip, body_links: nil, body_images: nil }
791 |       end
792 |     end
793 | 
794 |     ##
795 |     ## Reencode the content (borrowed from Nokogiri)
796 |     ##
797 |     ## @param      body          [String] The body
798 |     ## @param      content_type  [String] Force content type
799 |     ##
800 |     def reencode(body, content_type = nil)
801 |       if body.encoding == Encoding::ASCII_8BIT
802 |         encoding = nil
803 | 
804 |         # look for a Byte Order Mark (BOM)
805 |         initial_bytes = body[0..2].bytes
806 |         if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
807 |           encoding = Encoding::UTF_8
808 |         elsif initial_bytes[0..1] == [0xFE, 0xFF]
809 |           encoding = Encoding::UTF_16BE
810 |         elsif initial_bytes[0..1] == [0xFF, 0xFE]
811 |           encoding = Encoding::UTF_16LE
812 |         end
813 | 
814 |         # look for a charset in a content-encoding header
815 |         encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type
816 | 
817 |         # look for a charset in a meta tag in the first 1024 bytes
818 |         unless encoding
819 |           data = body[0..1023].gsub(/|\Z)/m, '')
820 |           data.scan(//im).each do |meta|
821 |             encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
822 |           end
823 |         end
824 | 
825 |         # if all else fails, default to the official default encoding for HTML
826 |         encoding ||= Encoding::ISO_8859_1
827 | 
828 |         # change the encoding to match the detected or inferred encoding
829 |         body = body.dup
830 |         begin
831 |           body.force_encoding(encoding)
832 |         rescue ArgumentError
833 |           body.force_encoding(Encoding::ISO_8859_1)
834 |         end
835 |       end
836 | 
837 |       body.encode(Encoding::UTF_8)
838 |     end
839 | 
840 |     ##
841 |     ## Test if a given url has the same hostname as @url
842 |     ##
843 |     ## @param      href  [String] The url to test
844 |     ##
845 |     ## @return     [Boolean] true if hostnames match
846 |     ##
847 |     def same_origin?(href)
848 |       uri = URI(href)
849 |       origin = URI(@url)
850 |       uri.host == origin.host
851 |     rescue StandardError
852 |       false
853 |     end
854 |   end
855 | end
856 | 


--------------------------------------------------------------------------------
/lib/curly/curl/json.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | module Curl
  4 |   # Class for CURLing a JSON response
  5 |   class Json
  6 |     attr_accessor :url
  7 | 
  8 |     attr_writer :compressed, :request_headers, :symbolize_names
  9 | 
 10 |     attr_reader :code, :json, :headers
 11 | 
 12 |     def to_data
 13 |       {
 14 |         url: @url,
 15 |         code: @code,
 16 |         json: @json,
 17 |         headers: @headers
 18 |       }
 19 |     end
 20 | 
 21 |     ##
 22 |     ## Create a new Curl::Json page object
 23 |     ##
 24 |     ## @param      url         [String] The url to curl
 25 |     ## @param      headers     [Hash] The headers to send
 26 |     ## @param      compressed  [Boolean] Expect compressed results
 27 |     ##
 28 |     ## @return     [Curl::Json] Curl::Json object with url, code, parsed json, and response headers
 29 |     ##
 30 |     def initialize(url, options = {})
 31 |       @url = url
 32 |       @request_headers = options[:headers]
 33 |       @compressed = options[:compressed]
 34 |       @symbolize_names = options[:symbolize_names]
 35 | 
 36 |       @curl = TTY::Which.which('curl')
 37 |     end
 38 | 
 39 |     def curl
 40 |       page = curl_json
 41 | 
 42 |       raise "Error retrieving #{url}" if page.nil? || page.empty?
 43 | 
 44 |       @url = page[:url]
 45 |       @code = page[:code]
 46 |       @json = page[:json]
 47 |       @headers = page[:headers]
 48 |     end
 49 | 
 50 |     def path(path, json = @json)
 51 |       parts = path.split(/./)
 52 |       target = json
 53 |       parts.each do |part|
 54 |         if part =~ /(?[^\[]+)\[(?\d+)\]/
 55 |           target = target[key][int.to_i]
 56 |         else
 57 |           target = target[part]
 58 |         end
 59 |       end
 60 | 
 61 |       target
 62 |     end
 63 | 
 64 |     private
 65 | 
 66 |     ##
 67 |     ## Curl the JSON contents
 68 |     ##
 69 |     ## @param      url         [String] The url
 70 |     ## @param      headers     [Hash] The headers to send
 71 |     ## @param      compressed  [Boolean] Expect compressed results
 72 |     ##
 73 |     ## @return     [Hash] hash of url, code, headers, and parsed json
 74 |     ##
 75 |     def curl_json
 76 |       flags = 'SsLi'
 77 |       agents = [
 78 |         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
 79 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
 80 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
 81 |         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
 82 |       ]
 83 | 
 84 |       headers = @headers.nil? ? '' : @headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
 85 |       compress = @compressed ? '--compressed' : ''
 86 |       source = `#{@curl} -#{flags} #{compress} #{headers} '#{@url}' 2>/dev/null`
 87 |       agent = 0
 88 |       while source.nil? || source.empty?
 89 |         source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{@url}' 2>/dev/null`
 90 |         break if agent >= agents.count - 1
 91 |       end
 92 | 
 93 |       return false if source.nil? || source.empty?
 94 | 
 95 |       source.strip!
 96 | 
 97 |       headers = {}
 98 |       lines = source.split(/\r\n/)
 99 |       code = lines[0].match(/(\d\d\d)/)[1]
100 |       lines.shift
101 |       lines.each_with_index do |line, idx|
102 |         if line =~ /^([\w-]+): (.*?)$/
103 |           m = Regexp.last_match
104 |           headers[m[1]] = m[2]
105 |         else
106 |           source = lines[idx..].join("\n")
107 |           break
108 |         end
109 |       end
110 | 
111 |       json = source.strip.force_encoding('utf-8')
112 |       begin
113 |         json.gsub!(/[\u{1F600}-\u{1F6FF}]/, '')
114 |         { url: @url, code: code, headers: headers, json: JSON.parse(json, symbolize_names: @symbolize_names) }
115 |       rescue StandardError
116 |         { url: @url, code: code, headers: headers, json: nil }
117 |       end
118 |     end
119 |   end
120 | end
121 | 


--------------------------------------------------------------------------------
/lib/curly/hash.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | # Hash helpers
  4 | class ::Hash
  ## Convert a Curly object to data hash
  ##
  ## @return     [Hash] a hash with keys renamed and
  ##             cleaned up; hashes without :body_links are
  ##             returned untouched
  ##
  ## @param      url    [String] A url to fall back to
  ## @param      clean  [Boolean] Clean extra spaces and newlines in sources
  ##
  def to_data(url: nil, clean: false)
    return self unless key?(:body_links)

    # Optionally strip and compress whitespace in the source strings
    scrub = ->(str) { clean ? str&.strip&.clean : str }

    {
      url: self[:url] || url,
      code: self[:code],
      headers: self[:headers],
      meta: self[:meta],
      meta_links: self[:links],
      head: scrub.call(self[:head]),
      body: scrub.call(self[:body]),
      source: scrub.call(self[:source]),
      title: self[:title],
      description: self[:description],
      links: self[:body_links],
      images: self[:body_images]
    }
  end
 33 | 
 34 |   ##
 35 |   ## Return the raw HTML of the object
 36 |   ##
 37 |   ## @return    [String] Html representation of the object.
 38 |   ##
 39 |   def to_html
 40 |     if key?(:source)
 41 |       self[:source]
 42 |     end
 43 |   end
 44 | 
 45 |   ##
 46 |   ## Get a value from the hash using a dot-syntax query
 47 |   ##
 48 |   ## @param      query  [String] The query (dot notation)
 49 |   ##
 50 |   ## @return     [Object] result of querying the hash
 51 |   ##
 52 |   def get_value(query)
 53 |     return nil if self.empty?
 54 |     stringify_keys!
 55 | 
 56 |     query.split('.').inject(self) do |v, k|
 57 |       return v.map { |el| el.get_value(k) } if v.is_a? Array
 58 |       # k = k.to_i if v.is_a? Array
 59 |       next v unless v.key?(k)
 60 | 
 61 |       v.fetch(k)
 62 |     end
 63 |   end
 64 | 
  # Extract data using a dot-syntax path
  #
  # NOTE(review): parts of this method's body appear to have been mangled
  # during extraction — the split pattern and the bracket-parsing regexes
  # below have lost their angle-bracket contents, and several lines
  # between the split and the parsing loop are missing. Code is left
  # byte-identical; restore from the original source before relying on it.
  #
  # @param      path  [String] The path
  #
  # @return     [Object] Result of path query
  #
  def dot_query(path, root = nil, full_tag: true)
    # Work on a string-keyed copy; optionally descend into a root key first
    res = stringify_keys
    res = res[root] unless root.nil?

    # No bracket expressions — plain dot traversal
    unless path =~ /\[/
      return res.get_value(path)
    end

    # Protect dots inside bracket expressions from the dot splitter
    # by temporarily replacing them with '%'
    path.gsub!(/\[(.*?)\]/) do
      inter = Regexp.last_match(1).gsub(/\./, '%')
      "[#{inter}]"
    end

    out = []
    q = path.split(/(?]=? *\w+)?/
        m = pth.match(/\[(?[,+&])? *(?[\w.]+)( *(?[\^*$=<>]{1,2}) *(?[^,&\]]+))? */)

        # comp is [key, operator, value]; ',' starts a new OR group,
        # anything else extends the current AND group
        comp = [m['key'], m['op'], m['val']]
        case m['com']
        when ','
          ats.push(comp)
          at = []
        else
          at.push(comp)
        end

        pth.sub!(/\[(?[,&+])? *(?[\w.]+)( *(?[\^*$=<>]{1,2}) *(?[^,&\]]+))?/, '[')
      end
      ats.push(at) unless at.empty?
      pth.sub!(/\[\]/, '')

      # Unwrap single-element arrays before applying the key
      res = res[0] if res.is_a?(Array) && res.count == 1
      if ats.empty? && el.nil? && res.is_a?(Array) && res[0]&.key?(pth)
        res.map! { |r| r[pth] }
        next
      end

      res.map!(&:stringify_keys) if res.is_a?(Array) && res[0].is_a?(Hash)
      # if res.is_a?(String) || (res.is_a?(Array) && res[0].is_a?(String))
      #   out.push(res)
      #   next
      # end

      # if res.is_a?(Array) && !pth.nil?
      #   return res.delete_if { |r| !r.key?(pth) }
      # else
      #   return false if el.nil? && ats.empty? && res.is_a?(Hash) && (res.nil? || !res.key?(pth))
      # end
      tag = res
      res = res[pth] unless pth.nil? || pth.empty?

      pth = ''

      return false if res.nil?

      # Apply each comparison group; matching elements are collected,
      # either as the full tag or just the matching value
      if ats.count.positive?
        while ats.count.positive?
          atr = ats.shift
          res = [res] if res.is_a?(Hash)
          res.each do |r|
            out.push(full_tag ? tag : r) if evaluate_comp(r, atr)
          end
        end
      else
        out = res
      end

      out = out.get_value(pth) unless pth.nil?

      # Normalize hash results to string keys; a numeric/range element
      # selector indexes into the array (el comes from the loop above)
      if el.nil? && out.is_a?(Array) && out[0].is_a?(Hash)
        out.map! { |o|
          o.stringify_keys
          # o.key?(pth) ? o[pth] : o
        }
      elsif out.is_a?(Array) && el =~ /^[\d.,]+$/
        out = out[eval(el)]
      end
      res = out
    end

    # Single matches are returned bare rather than wrapped in an array
    out = out[0] if out&.count == 1
    out
  end
171 | 
172 |   ##
173 |   ## Test if values in an array match an operator
174 |   ##
175 |   ## @param      array [Array] The array
176 |   ## @param      key   [String] The key
177 |   ## @param      comp  [String] The comparison, e.g. *= or $=
178 |   ##
179 |   ## @return [Boolean] true if array contains match
180 |   def array_match(array, key, comp)
181 |     keep = false
182 |     array.each do |el|
183 |       keep = case comp
184 |              when /^\^/
185 |                key =~ /^#{el}/i ? true : false
186 |              when /^\$/
187 |                key =~ /#{el}$/i ? true : false
188 |              when /^\*/
189 |                key =~ /#{el}/i ? true : false
190 |              else
191 |                key =~ /^#{el}$/i ? true : false
192 |              end
193 |       break if keep
194 |     end
195 |     keep
196 |   end
197 | 
  ##
  ## Evaluate a comparison
  ##
  ## @param      r     [Hash] hash of source elements and
  ##                   comparison operators
  ## @param      atr   [Array] Array of arrays containing [attribute,comparator,value]
  ##
  ## @return     [Boolean] whether the comparison passes or fails
  ##
  def evaluate_comp(r, atr)
    # Every comparison in atr must pass (AND); fail fast on the first miss
    keep = true

    r = r.symbolize_keys

    atr.each do |a|
      key = a[0].to_sym
      # Coerce the comparison value to Integer/Float when it looks numeric
      val = if a[2] =~ /^\d+$/
              a[2].to_i
            elsif a[2] =~ /^\d+\.\d+$/
              a[2].to_f
            else
              a[2]
            end
      # A dot in the key means a nested lookup
      r = r.get_value(key.to_s) if key.to_s =~ /\./

      # A comparison with no value is a presence test (non-nil, non-empty)
      if val.nil?
        if r.is_a?(Hash)
          return r.key?(key) && !r[key].nil? && !r[key].empty?
        elsif r.is_a?(String)
          return r.nil? ? false : true
        elsif r.is_a?(Array)
          return r.empty? ? false : true
        end
      end

      if r.nil?
        keep = false
      elsif r.is_a?(Array)
        # Array target: keep if any element (or nested array) matches
        valid = r.filter do |k|
          if k.is_a? Array
            array_match(k, a[2], a[1])
          else
            # ^= starts-with, $= ends-with, *= contains, otherwise exact
            case a[1]
            when /^\^/
              k =~ /^#{a[2]}/i ? true : false
            when /^\$/
              k =~ /#{a[2]}$/i ? true : false
            when /^\*/
              k =~ /#{a[2]}/i ? true : false
            else
              k =~ /^#{a[2]}$/i ? true : false
            end
          end
        end

        keep = valid.count.positive?
      elsif val.is_a?(Numeric) && a[1] =~ /^[<>=]{1,2}$/
        # Numeric comparison: build and eval an expression like "5>=3"
        # (operands are locally coerced integers, not user input)
        k = r.to_i
        comp = a[1] =~ /^=$/ ? '==' : a[1]
        keep = eval("#{k}#{comp}#{val}")
      else
        # String comparison against a hash value or the target itself
        v = r.is_a?(Hash) ? r[key] : r
        if v.is_a? Array
          keep = array_match(v, a[2], a[1])
        else
          keep = case a[1]
                 when /^\^/
                   v =~ /^#{a[2]}/i ? true : false
                 when /^\$/
                   v =~ /#{a[2]}$/i ? true : false
                 when /^\*/
                   v =~ /#{a[2]}/i ? true : false
                 else
                   v =~ /^#{a[2]}$/i ? true : false
                 end
        end
      end

      return false unless keep
    end

    keep
  end
281 | 
282 |   ##
283 |   ## Test if a tag contains an attribute matching filter queries
284 |   ##
285 |   ## @param      tag_name    [String] The tag name
286 |   ## @param      classes     [String] The classes to match
287 |   ## @param      id          [String] The id attribute to
288 |   ##                         match
289 |   ## @param      attribute   [String] The attribute
290 |   ## @param      operator    [String] The operator, <>= *=
291 |   ##                         $= ^=
292 |   ## @param      value       [String] The value to match
293 |   ## @param      descendant  [Boolean] Check descendant tags
294 |   ##
295 |   def tag_match(tag_name, classes, id, attribute, operator, value, descendant: false)
296 |     tag = self
297 |     keep = true
298 | 
299 |     keep = false if tag_name && !tag['tag'] =~ /^#{tag_name}$/i
300 | 
301 |     if tag.key?('attrs') && tag['attrs']
302 |       if keep && id
303 |         tag_id = tag['attrs'].filter { |a| a['key'] == 'id' }.first['value']
304 |         keep = tag_id && tag_id =~ /#{id}/i
305 |       end
306 | 
307 |       if keep && classes
308 |         cls = tag['attrs'].filter { |a| a['key'] == 'class' }.first
309 |         if cls
310 |           all = true
311 |           classes.each { |c| all = cls['value'].include?(c) }
312 |           keep = all
313 |         else
314 |           keep = false
315 |         end
316 |       end
317 | 
318 |       if keep && attribute
319 |         attributes = tag['attrs'].filter { |a| a['key'] =~ /^#{attribute}$/i }
320 |         any = false
321 |         attributes.each do |a|
322 |           break if any
323 | 
324 |           any = case operator
325 |                 when /^*/
326 |                   a['value'] =~ /#{value}/i
327 |                 when /^\^/
328 |                   a['value'] =~ /^#{value}/i
329 |                 when /^\$/
330 |                   a['value'] =~ /#{value}$/i
331 |                 else
332 |                   a['value'] =~ /^#{value}$/i
333 |                 end
334 |         end
335 |         keep = any
336 |       end
337 |     end
338 | 
339 |     return false if descendant && !keep
340 | 
341 |     if !descendant && tag.key?('tags')
342 |       tags = tag['tags'].filter { |t| t.tag_match(tag_name, classes, id, attribute, operator, value) }
343 |       tags.count.positive?
344 |     else
345 |       keep
346 |     end
347 |   end
348 | 
349 |   # Turn all keys into symbols
350 |   #
351 |   # If the hash has both a string and a symbol for key,
352 |   # keep the symbol value, discarding the string value
353 |   #
354 |   # @return     [Hash] a copy of the hash where all its
355 |   #             keys are strings
356 |   #
357 |   def symbolize_keys
358 |     each_with_object({}) do |(k, v), hsh|
359 |       next if k.is_a?(String) && key?(k.to_sym)
360 | 
361 |       hsh[k.to_sym] = v.is_a?(Hash) ? v.symbolize_keys : v
362 |     end
363 |   end
364 | 
365 |   # Turn all keys into strings
366 |   #
367 |   # If the hash has both a string and a symbol for key,
368 |   # keep the string value, discarding the symbol value
369 |   #
370 |   # @return     [Hash] a copy of the hash where all its
371 |   #             keys are strings
372 |   #
373 |   def stringify_keys
374 |     each_with_object({}) do |(k, v), hsh|
375 |       next if k.is_a?(Symbol) && key?(k.to_s)
376 | 
377 |       hsh[k.to_s] = v.is_a?(Hash) ? v.stringify_keys : v
378 |     end
379 |   end
380 | 
381 |   ##
382 |   ## Destructive version of #stringify_keys
383 |   ##
384 |   ## @see        #stringify_keys
385 |   ##
386 |   def stringify_keys!
387 |     replace stringify_keys
388 |   end
389 | 
390 |   ##
391 |   ## Clean up empty arrays and return an array with one or
392 |   ## more elements
393 |   ##
394 |   ## @return     [Array] output array
395 |   ##
396 |   def clean_output
397 |     output = ensure_array
398 |     output.clean_output
399 |   end
400 | 
401 |   ##
402 |   ## Ensure that an object is an array
403 |   ##
404 |   ## @return     [Array] object as Array
405 |   ##
406 |   def ensure_array
407 |     return [self]
408 |   end
409 | end
410 | 


--------------------------------------------------------------------------------
/lib/curly/numeric.rb:
--------------------------------------------------------------------------------
 1 | # Numeric helpers
 2 | class ::Numeric
 3 |   ##
 4 |   ## Return an array version of self
 5 |   ##
 6 |   ## @return     [Array] self enclosed in an array
 7 |   ##
 8 |   def ensure_array
 9 |     [self]
10 |   end
11 | end
12 | 


--------------------------------------------------------------------------------
/lib/curly/string.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | ##
  4 | ## Remove extra spaces and newlines from a string
  5 | ##
  6 | ## @return     [String] cleaned string
  7 | ##
  8 | class ::String
  9 |    ##
 10 |   ## Discard invalid characters and output a UTF-8 String
 11 |   ##
 12 |   ## @return     [String] UTF-8 encoded string
 13 |   ##
 14 |   def utf8
 15 |     encode('utf-16', invalid: :replace).encode('utf-8')
 16 |   end
 17 | 
 18 |   ##
 19 |   ## Destructive version of #utf8
 20 |   ##
 21 |   ## @return     [String] UTF-8 encoded string, in place
 22 |   ##
 23 |   def utf8!
 24 |     replace utf8
 25 |   end
 26 | 
 27 |   ## Remove extra spaces and newlines, compress space
 28 |   ## between tags
 29 |   ##
 30 |   ## @return     [String] cleaned string
 31 |   ##
 32 |   def clean
 33 |     gsub(/[\t\n ]+/m, ' ').gsub(/> +<')
 34 |   end
 35 | 
 36 |   ##
 37 |   ## Remove HTML tags from a string
 38 |   ##
 39 |   ## @return     [String] stripped string
 40 |   ##
 41 |   def strip_tags
 42 |     gsub(%r{}, '')
 43 |   end
 44 | 
 45 |   ##
 46 |   ## Destructive version of #clean
 47 |   ##
 48 |   ## @see #clean
 49 |   ##
 50 |   def clean!
 51 |     replace clean
 52 |   end
 53 | 
 54 |   ##
 55 |   ## Destructive version of #strip_tags
 56 |   ##
 57 |   ## @see #strip_tags
 58 |   ##
 59 |   def strip_tags!
 60 |     replace strip_tags
 61 |   end
 62 | 
 63 |   ##
 64 |   ## Convert an image type string to a symbol
 65 |   ##
 66 |   ## @return     [Symbol] :srcset, :img, :opengraph, :all
 67 |   ##
 68 |   def normalize_image_type(default = :all)
 69 |     case self.to_s
 70 |     when /^[sp]/i
 71 |       :srcset
 72 |     when /^i/i
 73 |       :img
 74 |     when /^o/i
 75 |       :opengraph
 76 |     else
 77 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_image_type
 78 |     end
 79 |   end
 80 | 
 81 |   ##
 82 |   ## Convert a browser type string to a symbol
 83 |   ##
 84 |   ## @return     [Symbol] :chrome, :firefox
 85 |   ##
 86 |   def normalize_browser_type(default = :none)
 87 |     case self.to_s
 88 |     when /^c/i
 89 |       :chrome
 90 |     when /^f/i
 91 |       :firefox
 92 |     else
 93 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_browser_type
 94 |     end
 95 |   end
 96 | 
 97 |   ##
 98 |   ## Convert a screenshot type string to a symbol
 99 |   ##
100 |   ## @return     [Symbol] :full_page, :print_page, :visible
101 |   ##
102 |   def normalize_screenshot_type(default = :none)
103 |     case self.to_s
104 |     when /^f/i
105 |       :full_page
106 |     when /^p/i
107 |       :print_page
108 |     when /^v/i
109 |       :visible
110 |     else
111 |       default.is_a?(Symbol) ? default.to_sym : default.normalize_browser_type
112 |     end
113 |   end
114 | 
115 |   ##
116 |   ## Clean up output and return a single-item array
117 |   ##
118 |   ## @return     [Array] output array
119 |   ##
120 |   def clean_output
121 |     output = ensure_array
122 |     output.clean_output
123 |   end
124 | 
125 |   ##
126 |   ## Ensure that an object is an array
127 |   ##
128 |   ## @return     [Array] object as Array
129 |   ##
130 |   def ensure_array
131 |     return [self]
132 |   end
133 | end
134 | 


--------------------------------------------------------------------------------
/lib/curly/version.rb:
--------------------------------------------------------------------------------
# Top level module for CurlyQ
module Curly
  # Current version number (frozen: mutable string constants are an
  # accidental-mutation hazard and this file has no frozen_string_literal)
  VERSION = '0.0.16'.freeze
end
6 | 


--------------------------------------------------------------------------------
/src/_README.md:
--------------------------------------------------------------------------------
  1 | # CurlyQ
  2 | 
[![Gem](https://img.shields.io/gem/v/curlyq.svg)](https://rubygems.org/gems/curlyq)
  4 | [![GitHub license](https://img.shields.io/github/license/ttscoff/curlyq.svg)](./LICENSE.txt)
  5 | 
  6 | **A command line helper for curl and web scraping**
  7 | 
  8 | _If you find this useful, feel free to [buy me some coffee][donate]._
  9 | 
 10 | [donate]: https://brettterpstra.com/donate
 11 | 
 12 | 
 13 | [jq]: https://github.com/jqlang/jq "Command-line JSON processor"
 14 | [yq]: https://github.com/mikefarah/yq "yq is a portable command-line YAML, JSON, XML, CSV, TOML and properties processor"
 15 | 
The current version of `curlyq` is 0.0.16.
 17 | 
 18 | CurlyQ is a utility that provides a simple interface for curl, with additional features for things like extracting images and links, finding elements by CSS selector or XPath, getting detailed header info, and more. It's designed to be part of a scripting pipeline, outputting everything as structured data (JSON or YAML). It also has rudimentary support for making calls to JSON endpoints easier, but it's expected that you'll use something like [jq] to parse the output.
 19 | 
 20 | [github]: https://github.com/ttscoff/curlyq/
 21 | 
 22 | ### Installation
 23 | 
 24 | Assuming you have Ruby and RubyGems installed, you can just run `gem install curlyq`. If you run into errors, try `gem install --user-install curlyq`, or use `sudo gem install curlyq`.
 25 | 
 26 | If you're using Homebrew, you have the option to install via [brew-gem](https://github.com/sportngin/brew-gem):
 27 | 
 28 |     brew install brew-gem
 29 |     brew gem install curlyq
 30 | 
 31 | If you don't have Ruby/RubyGems, you can install them pretty easily with [Homebrew], [rvm], or [asdf].
 32 | 
 33 | [Homebrew]: https://brew.sh/ "Homebrew—The Missing Package Manager for macOS (or Linux)"
 34 | [rvm]: https://rvm.io/ "Ruby Version Manager (RVM)"
 35 | [asdf]: https://github.com/asdf-vm/asdf "asdf-vm/asdf:Extendable version manager with support for ..."
 36 | 
 37 | ### Usage
 38 | 
 39 | Run `curlyq help` for a list of subcommands. Run `curlyq help SUBCOMMAND` for details on a particular subcommand and its options.
 40 | 
 41 | ```
 42 | @cli(bundle exec bin/curlyq help)
 43 | ```
 44 | 
 45 | ### Query and Search syntax
 46 | 
 47 | You can shape the results using `--search` (`-s`) and `--query` (`-q`) on some commands.
 48 | 
 49 | A search uses either CSS or XPath syntax to locate elements. For example, if you wanted to locate all of the `
` elements with a class of `post` inside of the div with an id of `main`, you would run `--search '#main article.post'`. Searches can target tags, ids, and classes, and can accept `>` to target direct descendents. You can also use XPaths, but I hate those so I'm not going to document them. 50 | 51 | > I've tried to make the query function useful, but if you want to do any kind of advanced shaping, you're better off piping the JSON output to [jq] or [yq]. 52 | 53 | 54 | Queries are specifically for shaping CurlyQ output. If you're using the `html` command, it returns a key called `images`, so you can target just the images in the response with `-q 'images'`. The queries accept array syntax, so to get the first image, you would use `-q 'images[0]'`. Ranges are accepted as well, so `-q 'images[1..4]'` will return the 2nd through 5th images found on the page. You can also do comparisons, e.g. `images[rel=me]'` to target only images with a `rel` attribute of `me`. 55 | 56 | The comparisons for the query flag are: 57 | 58 | - `<` less than 59 | - `>` greater than 60 | - `<=` less than or equal to 61 | - `>=` greater than or equal to 62 | - `=` or `==` is equal to 63 | - `*=` contains text 64 | - `^=` starts with text 65 | - `$=` ends with text 66 | 67 | Comparisons can be numeric or string comparisons. A numeric comparison like `curlyq images -q '[width>500]' URL` would return all of the images on the page with a width attribute greater than 500. 68 | 69 | You can also use dot syntax inside of comparisons, e.g. `[links.rel*=me]` to target the links object (`html` command), and return only the links with a `rel=me` attribute. If the comparison is to an array object (like `class` or `rel`), it will match if any of the elements of the array match your comparison. 70 | 71 | If you end the query with a specific key, only that key will be output. If there's only one match, it will be output as a raw string. 
If there are multiple matches, output will be an array: 72 | 73 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what].source' 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/' 74 | 75 |

What’s Next

76 | 77 | #### Commands 78 | 79 | curlyq makes use of subcommands, e.g. `curlyq html [options] URL` or `curlyq extract [options] URL`. Each subcommand takes its own options, but I've made an effort to standardize the choices between each command as much as possible. 80 | 81 | ##### extract 82 | 83 | Example: 84 | 85 | curlyq extract -i -b 'Adding' -a 'accessing the source.' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 86 | 87 | [ 88 | "Adding time.sleep(10) in various places in case the page had not fully loaded when I was accessing the source." 89 | ] 90 | 91 | This specifies a before and after string and includes them (`-i`) in the result. 92 | 93 | ``` 94 | @cli(bundle exec bin/curlyq help extract) 95 | ``` 96 | 97 | 98 | ##### execute 99 | 100 | You can execute JavaScript on a given web page using the `execute` subcommand. 101 | 102 | Example: 103 | 104 | curlyq execute -s "NiftyAPI.find('file/save').arrow().shoot('file-save')" file:///Users/ttscoff/Desktop/Code/niftymenu/dist/MultiMarkdown-Composer.html 105 | 106 | You can specify an element id to wait for using `--id`, and define a pause to wait after executing a script with `--wait` (defaults to 2 seconds). Scripts can be read from the command line arguments with `--script "SCRIPT"`, from STDIN with `--script -`, or from a file using `--script PATH`. 107 | 108 | If you expect a return value, be sure to include a `return` statement in your executed script. Results will be output to STDOUT. 
109 | 110 | ``` 111 | @cli(bundle exec bin/curlyq help execute) 112 | ``` 113 | 114 | ##### headlinks 115 | 116 | Example: 117 | 118 | curlyq headlinks -q '[rel=stylesheet]' https://brettterpstra.com 119 | 120 | { 121 | "rel": "stylesheet", 122 | "href": "https://cdn3.brettterpstra.com/stylesheets/screen.7261.css", 123 | "type": "text/css", 124 | "title": null 125 | } 126 | 127 | This pulls all `` from the `` of the page, and uses a query `-q` to only show links with `rel="stylesheet"`. 128 | 129 | ``` 130 | @cli(bundle exec bin/curlyq help headlinks) 131 | ``` 132 | 133 | ##### html 134 | 135 | The html command (aliased as `curl`) gets the entire text of the web page and provides a JSON response with a breakdown of: 136 | 137 | - URL, after any redirects 138 | - Response code 139 | - Response headers as a keyed hash 140 | - Meta elements for the page as a keyed hash 141 | - All meta links in the head as an array of objects containing (as available): 142 | - rel 143 | - href 144 | - type 145 | - title 146 | - source of `` 147 | - source of `` 148 | - the page title (determined first by og:title, then by a title tag) 149 | - description (using og:description first) 150 | - All links on the page as an array of objects with: 151 | - href 152 | - title 153 | - rel 154 | - text content 155 | - classes as array 156 | - All images on the page as an array of objects containing: 157 | - class 158 | - all attributes as key/value pairs 159 | - width and height (if specified) 160 | - src 161 | - alt and title 162 | 163 | You can add a query (`-q`) to only get the information needed, e.g. `-q images[width>600]`. 
164 | 165 | Example: 166 | 167 | curlyq html -s '#main article .aligncenter' -q 'images[1]' 'https://brettterpstra.com' 168 | 169 | [ 170 | { 171 | "class": "aligncenter", 172 | "original": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb_tw.jpg", 173 | "at2x": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb@2x.jpg", 174 | "width": "800", 175 | "height": "226", 176 | "src": "https://cdn3.brettterpstra.com/uploads/2023/09/giveaway-keyboardmaestro2024-rb.jpg", 177 | "alt": "Giveaway Robot with Keyboard Maestro icon", 178 | "title": "Giveaway Robot with Keyboard Maestro icon" 179 | } 180 | ] 181 | 182 | The above example queries the full html of the page, but narrows the elements using `--search` and then takes the 2nd image from the results. 183 | 184 | curlyq html -q 'meta.title' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 185 | 186 | Introducing CurlyQ, a pipeline-oriented curl helper - BrettTerpstra.com 187 | 188 | The above example curls the page and returns the title attribute found in the meta (`-q 'meta.title'`). 189 | 190 | ``` 191 | @cli(bundle exec bin/curlyq help html) 192 | ``` 193 | 194 | ##### images 195 | 196 | The images command returns only the images on the page as an array of objects. It can be queried to match certain requirements (see Query and Search syntax above). 197 | 198 | The base command will return all images on the page, including OpenGraph images from the head, `` tags from the body, and `` tags along with their child images. 
199 | 200 | OpenGraph images will be returned with the structure: 201 | 202 | { 203 | "type": "opengraph", 204 | "attrs": null, 205 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg" 206 | } 207 | 208 | `img` tags will be returned with the structure: 209 | 210 | { 211 | "type": "img", 212 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb.jpg", 213 | "width": "800", 214 | "height": "226", 215 | "alt": "Banner image for CurlyQ", 216 | "title": "CurlyQ, curl better", 217 | "attrs": [ 218 | { 219 | "class": [ 220 | "aligncenter" 221 | ], // all attributes included 222 | } 223 | ] 224 | } 225 | 226 | 227 | 228 | `srcset` images will be returned with the structure: 229 | 230 | { 231 | "type": "srcset", 232 | "attrs": [ 233 | { 234 | "key": "srcset", 235 | "value": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg 1x, https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg 2x" 236 | } 237 | ], 238 | "images": [ 239 | { 240 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb_tw.jpg", 241 | "media": "1x" 242 | }, 243 | { 244 | "src": "https://cdn3.brettterpstra.com/uploads/2024/01/curlyq_header-rb@2x.jpg", 245 | "media": "2x" 246 | } 247 | ] 248 | } 249 | } 250 | 251 | Example: 252 | 253 | curlyq images -t img -q '[alt$=screenshot]' https://brettterpstra.com 254 | 255 | This will return an array of images that are `` tags, and only show the ones that have an `alt` attribute that ends with `screenshot`. 256 | 257 | curlyq images -q '[width>750]' https://brettterpstra.com 258 | 259 | This example will only return images that have a width greater than 750 pixels. This query depends on the images having proper `width` attributes set on them in the source. 
260 | 261 | ``` 262 | @cli(bundle exec bin/curlyq help images) 263 | ``` 264 | 265 | ##### json 266 | 267 | The `json` command just returns an object with header/response info, and the contents of the JSON response after it's been read by the Ruby JSON library and output. If there are fetching or parsing errors it will fail gracefully with an error code. 268 | 269 | ``` 270 | @cli(bundle exec bin/curlyq help json) 271 | ``` 272 | 273 | ##### links 274 | 275 | Returns all the links on the page, which can be queried on any attribute. 276 | 277 | Example: 278 | 279 | curlyq links -q '[content*=twitter]' 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python' 280 | 281 | [ 282 | { 283 | "href": "https://twitter.com/stackoverflow", 284 | "title": null, 285 | "rel": null, 286 | "content": "Twitter", 287 | "class": [ 288 | "-link", 289 | "js-gps-track" 290 | ] 291 | } 292 | ] 293 | 294 | This example gets all links from the page but only returns ones with link content containing 'twitter' (`-q '[content*=twitter]'`). 295 | 296 | ``` 297 | @cli(bundle exec bin/curlyq help links) 298 | ``` 299 | 300 | ##### scrape 301 | 302 | Loads the page in a web browser, allowing scraping of dynamically loaded pages that return nothing but scripts when `curl`ed. The `-b` (`--browser`) option is required and should be 'chrome' or 'firefox' (or just 'c' or 'f'). The selected browser must be installed on your system. 303 | 304 | Example: 305 | 306 | curlyq scrape -b firefox -q 'links[rel=me&content*=mastodon][0]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 307 | 308 | { 309 | "href": "https://nojack.easydns.ca/@ttscoff", 310 | "title": null, 311 | "rel": [ 312 | "me" 313 | ], 314 | "content": "Mastodon", 315 | "class": [ 316 | "u-url" 317 | ] 318 | } 319 | 320 | This example scrapes the page using firefox and finds the first link with a rel of 'me' and text containing 'mastodon'. 
321 | 322 | ``` 323 | @cli(bundle exec bin/curlyq help scrape) 324 | ``` 325 | 326 | ##### screenshot 327 | 328 | Full-page screenshots require Firefox, installed and specified with `--browser firefox`. 329 | 330 | Type defaults to `full`, but will only work if `-b` is Firefox. If you want to use Chrome, you must specify a `--type` as 'visible' or 'print'. 331 | 332 | The `-o` (`--output`) flag is required. It should be a path to a target PNG file (or PDF for `-t print` output). Extension will be modified automatically, all you need is the base name. 333 | 334 | Example: 335 | 336 | curlyq screenshot -b f -o ~/Desktop/test https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 337 | 338 | Screenshot saved to /Users/ttscoff/Desktop/test.png 339 | 340 | You can wait for an element ID to be visible using `--id`. This can be any `#ID` on the page. If the ID doesn't exist on the page, though, the screenshot will hang for a timeout of 10 seconds. 341 | 342 | You can execute a script before taking the screenshot with the `--script` flag. If this is set to `-`, it will read the script from STDIN. If it's set to an existing file path, that file will be read for script input. Specify an interval (in seconds) to wait after executing the script with `--wait`. 343 | 344 | ``` 345 | @cli(bundle exec bin/curlyq help screenshot) 346 | ``` 347 | 348 | ##### tags 349 | 350 | Return a hierarchy of all tags in a page. Use `-t` to limit to a specific tag. 351 | 352 | curlyq tags --search '#main .post h3' -q '[attrs.id*=what]' https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/ 353 | 354 | [ 355 | { 356 | "tag": "h3", 357 | "source": "

What’s Next

", 358 | "attrs": [ 359 | { 360 | "id": "whats-next" 361 | } 362 | ], 363 | "content": "What’s Next", 364 | "tags": [ 365 | 366 | ] 367 | } 368 | ] 369 | 370 | The above command filters the tags based on a CSS query, then further filters them to just tags with an id containing 'what'. 371 | 372 | ``` 373 | @cli(bundle exec bin/curlyq help tags) 374 | ``` 375 | 376 | 377 | PayPal link: [paypal.me/ttscoff](https://paypal.me/ttscoff) 378 | 379 | ## Changelog 380 | 381 | See [CHANGELOG.md](https://github.com/ttscoff/curlyq/blob/main/CHANGELOG.md) 382 | 383 | -------------------------------------------------------------------------------- /test/curlyq_extract_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQExtractTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_extract_inclusive 17 | result = curlyq('extract', '-i', '-b', 'Adding', '-a', 'accessing the source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 18 | json = JSON.parse(result) 19 | 20 | assert_match(/^Adding time.sleep\(10\)<\/code>.*?accessing the source.$/, json[0], 'Match should be found and include the before and after strings') 21 | end 22 | 23 | def test_extract_exclusive 24 | result = curlyq('extract', '-b', 'Adding', '-a', 'accessing the source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 25 | json = JSON.parse(result) 26 | 27 | assert_match(/^ time.sleep\(10\)<\/code>.*?when I was $/, json[0], 'Match should be found and not include the before and after strings') 28 | end 29 | 30 | def test_extract_regex_inclusive 31 | result = curlyq('extract', '-ri', '-b', '.dding <', '-a', 
'accessing.*?source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 32 | json = JSON.parse(result) 33 | 34 | assert_match(/^Adding time.sleep\(10\)<\/code>.*?accessing the source.$/, json[0], 'Match should be found and include the before and after strings') 35 | end 36 | 37 | def test_extract_regex_exclusive 38 | result = curlyq('extract', '-r', '-b', '.dding <', '-a', 'accessing.*?source.', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 39 | json = JSON.parse(result) 40 | 41 | assert_match(/^code>time.sleep\(10\)<\/code>.*?when I was $/, json[0], 'Match should be found and not include the before and after strings') 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/curlyq_headlinks_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQHeadlinksTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_headlinks_query 17 | result = curlyq('headlinks', '-q', '[rel=stylesheet]', 'https://brettterpstra.com') 18 | json = JSON.parse(result) 19 | 20 | assert_equal(Array, json.class, 'Result should be an array') 21 | assert_match(/stylesheet/, json[0]['rel'], 'Should have retrieved a single result with rel stylesheet') 22 | assert_match(/screen\.\d+\.css$/, json[0]['href'], 'Stylesheet should be correct primary stylesheet') 23 | end 24 | 25 | def test_headlinks 26 | result = curlyq('headlinks', 'https://brettterpstra.com') 27 | json = JSON.parse(result) 28 | 29 | assert_equal(Array, json.class, 'Should have an array of results') 30 | assert(json.count > 1, 'Should have more than one link') 31 | # 
assert(json[0].count.positive?) 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/curlyq_html_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQHtmlTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_html_search_query 14 | result = curlyq('html', '-s', '#main article .aligncenter', '-q', 'images[0]', 'https://brettterpstra.com/2024/10/19/web-excursions-for-october-19-2024/') 15 | json = JSON.parse(result) 16 | 17 | assert_match(/aligncenter/, json[0]['class'], 'Should have found an image with class "aligncenter"') 18 | end 19 | 20 | def test_html_query 21 | result = curlyq('html', '-q', 'meta.title', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 22 | json = JSON.parse(result) 23 | assert_match(/Introducing CurlyQ/, json[0], 'Should have retrived the page title') 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /test/curlyq_images_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQImagesTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_images_query 14 | result = curlyq('images', '-t', 'img', '-q', '[alt$=screenshot]', 'https://brettterpstra.com/2024/01/08/keyboard-maestro-giveaway/') 15 | json = JSON.parse(result) 16 | 17 | assert(json.count == 1, 'Should have found 1 image') 18 | assert_match(/Keyboard Maestro screenshot/, json[0]['alt'], 'Should match Keyboard Meastro screenshot') 19 | end 20 | 21 | def 
test_images_type 22 | result = curlyq('images', '-t', 'srcset', 'https://brettterpstra.com/') 23 | json = JSON.parse(result) 24 | 25 | assert(json.count.positive?, 'Should have found at least 1 image') 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /test/curlyq_json_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQJsonTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_json 17 | result = curlyq('json', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 18 | json = JSON.parse(result)[0] 19 | 20 | assert_equal(json.class, Hash, 'Single result should be a hash') 21 | assert_equal(286, json['json']['total'], 'json.total should match 286') 22 | end 23 | 24 | def test_query 25 | result1 = curlyq('json', '-q', 'total', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 26 | result2 = curlyq('json', '-q', 'json.total', 'https://brettterpstra.com/scripts/giveaways_wrapper.cgi?v=203495&giveaway=hazel2023&action=count') 27 | json1 = JSON.parse(result1)[0] 28 | json2 = JSON.parse(result2)[0] 29 | 30 | assert_equal(286, json1, 'Should be 286') 31 | assert_equal(286, json2, 'Including json in dot path should yeild same result') 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/curlyq_links_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQLinksTest < 
Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def test_links 14 | result = curlyq('links', '-q', '[content*=twitter]', 'https://stackoverflow.com/questions/52428409/get-fully-rendered-html-using-selenium-webdriver-and-python') 15 | json = JSON.parse(result) 16 | 17 | assert(json.count.positive?, 'Should be at least 1 match') 18 | assert_match(/twitter.com/, json[0]['href'], 'Should be a link to Twitter') 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /test/curlyq_scrape_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQScrapeTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | @screenshot = File.join(File.dirname(__FILE__), 'screenshot_test') 15 | FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf") 16 | FileUtils.rm_f("#{@screenshot}.png") if File.exist?("#{@screenshot}.png") 17 | FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png") 18 | end 19 | 20 | def teardown 21 | FileUtils.rm_f("#{@screenshot}.pdf") if File.exist?("#{@screenshot}.pdf") 22 | FileUtils.rm_f("#{@screenshot}.png") if File.exist?("#{@screenshot}.png") 23 | FileUtils.rm_f("#{@screenshot}_full.png") if File.exist?("#{@screenshot}_full.png") 24 | end 25 | 26 | def test_scrape_firefox 27 | result = curlyq('scrape', '-b', 'firefox', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 28 | json = JSON.parse(result) 29 | 30 | assert_equal(Array, json.class, 'Result should be an Array') 31 | assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link') 32 | end 33 | 34 | def test_scrape_chrome 35 | result = curlyq('scrape', 
'-b', 'chrome', '-q', 'links[rel=me&content*=mastodon][0]', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 36 | json = JSON.parse(result) 37 | 38 | assert_equal(Array, json.class, 'Result should be an Array') 39 | assert_match(/Mastodon/, json[0]['content'], 'Should have retrieved a Mastodon link') 40 | end 41 | 42 | def test_screenshot 43 | curlyq('screenshot', '-b', 'firefox', '-o', @screenshot, '-t', 'print', 'https://brettterpstra.com') 44 | assert(File.exist?("#{@screenshot}.pdf"), 'PDF Screenshot should exist') 45 | 46 | curlyq('screenshot', '-b', 'chrome', '-o', @screenshot, '-t', 'visible', 'https://brettterpstra.com') 47 | assert(File.exist?("#{@screenshot}.png"), 'PNG Screenshot should exist') 48 | 49 | curlyq('screenshot', '-b', 'firefox', '-o', "#{@screenshot}_full", '-t', 'full', 'https://brettterpstra.com') 50 | assert(File.exist?("#{@screenshot}_full.png"), 'PNG Screenshot should exist') 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /test/curlyq_tags_test.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'json' 4 | require 'yaml' 5 | 6 | require 'helpers/curlyq-helpers' 7 | require 'test_helper' 8 | 9 | # Tests for tags command 10 | class CurlyQTagsTest < Test::Unit::TestCase 11 | include CurlyQHelpers 12 | 13 | def setup 14 | end 15 | 16 | def test_tags 17 | result = curlyq('tags', '--search', '#main .post h3', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 18 | json = JSON.parse(result) 19 | 20 | assert_equal(Array, json.class, 'Should be an array of matches') 21 | assert_equal(6, json.count, 'Should be six results') 22 | end 23 | 24 | def test_clean 25 | result = curlyq('tags', '--search', '#main section.related', '--clean', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 26 
| json = JSON.parse(result) 27 | 28 | assert_equal(Array, json.class, 'Should be a single Array') 29 | assert_equal(1, json.count, 'Should be one element') 30 | assert_match(%r{Last.fm}, json[0]['source'], 'Should have matched #whats-next') 31 | end 32 | 33 | def test_query 34 | result = curlyq('tags', '--search', '#main .post h3', '-q', '[attrs.id*=what].source', 'https://brettterpstra.com/2024/01/10/introducing-curlyq-a-pipeline-oriented-curl-helper/') 35 | json = JSON.parse(result) 36 | assert_equal(Array, json.class, 'Should be an array') 37 | assert_match(%r{^

What’s Next

$}, json[0], 'Should have returned just source') 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /test/default_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class DefaultTest < Minitest::Test 4 | 5 | def setup 6 | end 7 | 8 | def teardown 9 | end 10 | 11 | def test_the_truth 12 | assert true 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /test/helpers/curlyq-helpers.rb: -------------------------------------------------------------------------------- 1 | require 'open3' 2 | require 'time' 3 | require 'fileutils' 4 | $LOAD_PATH.unshift File.join(__dir__, '..', '..', 'lib') 5 | require 'curly' 6 | 7 | module CurlyQHelpers 8 | CURLYQ_EXEC = File.join(File.dirname(__FILE__), '..', '..', 'bin', 'curlyq') 9 | BUNDLE = '/Users/ttscoff/.asdf/shims/bundle' 10 | 11 | def curlyq_with_env(env, *args, stdin: nil) 12 | Dir.chdir(File.expand_path('~/Desktop/Code/curlyq')) 13 | pread(env, BUNDLE, 'exec', 'bin/curlyq', *args, stdin: stdin) 14 | end 15 | 16 | def curlyq(*args) 17 | curlyq_with_env({ 'GLI_DEBUG' => 'true' }, *args) 18 | end 19 | 20 | def pread(env, *cmd, stdin: nil) 21 | out, err, status = Open3.capture3(env, *cmd, stdin_data: stdin) 22 | unless status.success? 23 | raise [ 24 | "Error (#{status}): #{cmd.inspect} failed", "STDOUT:", out.inspect, "STDERR:", err.inspect 25 | ].join("\n") 26 | end 27 | 28 | out 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/helpers/fake_std_out.rb: -------------------------------------------------------------------------------- 1 | class FakeStdOut 2 | attr_reader :strings 3 | 4 | def initialize 5 | @strings = [] 6 | end 7 | 8 | def puts(string=nil) 9 | @strings << string unless string.nil? 
10 | end 11 | 12 | def write(x) 13 | puts(x) 14 | end 15 | 16 | def printf(*args) 17 | puts(Kernel.printf(*args)) 18 | end 19 | 20 | # Returns true if the regexp matches anything in the output 21 | def contained?(regexp) 22 | strings.find{ |x| x =~ regexp } 23 | end 24 | 25 | def flush; end 26 | 27 | def to_s 28 | @strings.join("\n") 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /test/helpers/threaded_tests.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'tty-spinner' 4 | require 'tty-progressbar' 5 | require 'open3' 6 | require 'shellwords' 7 | require 'fileutils' 8 | require 'pastel' 9 | 10 | class ThreadedTests 11 | def run(pattern: '*', max_threads: 8, max_tests: 0) 12 | pastel = Pastel.new 13 | 14 | start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 15 | @results = File.expand_path('results.log') 16 | 17 | max_threads = 1000 if max_threads.to_i == 0 18 | 19 | shuffle = false 20 | 21 | unless pattern =~ /shuffle/i 22 | pattern = "test/curlyq_*#{pattern}*_test.rb" 23 | else 24 | pattern = "test/curlyq_*_test.rb" 25 | shuffle = true 26 | end 27 | 28 | tests = Dir.glob(pattern) 29 | 30 | tests.shuffle! 
if shuffle 31 | 32 | if max_tests.to_i > 0 33 | tests = tests.slice(0, max_tests.to_i - 1) 34 | end 35 | 36 | puts pastel.cyan("#{tests.count} test files") 37 | 38 | banner = "Running tests [:bar] T/A (#{max_threads.to_s} threads)" 39 | 40 | progress = TTY::ProgressBar::Multi.new(banner, 41 | width: 12, 42 | clear: true, 43 | hide_cursor: true) 44 | @children = [] 45 | tests.each do |t| 46 | test_name = File.basename(t, '.rb').sub(/curlyq_(.*?)_test/, '\1') 47 | new_sp = progress.register("[:bar] #{test_name}:status", 48 | total: tests.count + 8, 49 | width: 1, 50 | head: ' ', 51 | unknown: ' ', 52 | hide_cursor: true, 53 | clear: true) 54 | status = ': waiting' 55 | @children.push([test_name, new_sp, status]) 56 | end 57 | 58 | @elapsed = 0.0 59 | @test_total = 0 60 | @assrt_total = 0 61 | @error_out = [] 62 | @threads = [] 63 | @running_tests = [] 64 | 65 | begin 66 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 67 | while @children.count.positive? 68 | 69 | slices = @children.slice!(0, max_threads) 70 | slices.each { |c| c[1].start } 71 | slices.each do |s| 72 | @threads << Thread.new do 73 | run_test(s) 74 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 75 | end 76 | end 77 | 78 | @threads.each { |t| t.join } 79 | end 80 | 81 | finish_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) 82 | 83 | progress.finish 84 | rescue 85 | progress.stop 86 | ensure 87 | msg = @running_tests.map { |t| t[1].format.sub(/^\[:bar\] (.*?):status/, "\\1#{t[2]}") }.join("\n") 88 | 89 | output = [] 90 | output << if @error_out.count.positive? 91 | pastel.red("#{@error_out.count} Issues") 92 | else 93 | pastel.green('Success') 94 | end 95 | output << pastel.green("#{@test_total} tests") 96 | output << pastel.cyan("#{@assrt_total} assertions") 97 | output << pastel.yellow("#{(finish_time - start_time).round(3)}s") 98 | puts output.join(', ') 99 | 100 | if @error_out.count.positive? 
101 | puts @error_out.join(pastel.white("\n----\n")) 102 | Process.exit 1 103 | end 104 | end 105 | end 106 | 107 | def run_test(s) 108 | pastel = Pastel.new 109 | 110 | bar = s[1] 111 | s[2] = ": #{pastel.green('running')}" 112 | bar.advance(status: s[2]) 113 | 114 | if @running_tests.count.positive? 115 | @running_tests.each do |b| 116 | prev_bar = b[1] 117 | if prev_bar.complete? 118 | prev_bar.reset 119 | prev_bar.advance(status: b[2]) 120 | prev_bar.finish 121 | else 122 | prev_bar.update(head: ' ', unfinished: ' ') 123 | prev_bar.advance(status: b[2]) 124 | end 125 | end 126 | end 127 | 128 | @running_tests.push(s) 129 | out, _err, status = Open3.capture3(ENV, 'rake', "test:#{s[0]}", stdin_data: nil) 130 | time = out.match(/^Finished in (?