├── index.js ├── package.json ├── LICENSE ├── .gitignore ├── cli.js ├── .github └── hred.svg └── README.md /index.js: -------------------------------------------------------------------------------- 1 | let { JSDOM } = require('jsdom'); 2 | let qsx = require('qsx'); 3 | 4 | module.exports = function(content, query, url, contentType) { 5 | let doc = new JSDOM(content, { url, contentType }).window.document; 6 | return qsx(doc, query); 7 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "hred", 3 | "version": "1.5.1", 4 | "main": "index.js", 5 | "repository": "git@github.com:danburzo/hred.git", 6 | "author": "Dan Burzo ", 7 | "license": "MIT", 8 | "dependencies": { 9 | "jsdom": "^16.6.0", 10 | "opsh": "^1.0.0", 11 | "qsx": "^3.3.0" 12 | }, 13 | "bin": { 14 | "hred": "./cli.js" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dan Burzo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # 
#!/usr/bin/env node
const opsh = require('opsh');
const hred = require('./index.js');
const pkg = require('./package.json');
const fs = require('fs');

const { stdin, stdout } = process;

// Set of options accepting an argument
const accepts_optarg = new Set(['u', 'url', 'f', 'file']);

// Accumulate options and their arguments on the one hand,
// and operands, on the other
const opts = {};
const operands = [];
opsh(process.argv.slice(2), {
	option(option, value) {
		// An option seen without an inline value is recorded as `true`.
		opts[option] = value !== undefined ? value : true;
	},
	operand(operand, opt) {
		// NOTE(review): `opt` is presumably the option immediately preceding
		// this operand; when that option takes an argument, the operand is
		// its value rather than a query. Confirm against the opsh docs.
		if (opt !== undefined && accepts_optarg.has(opt)) {
			opts[opt] = operand;
		} else {
			operands.push(operand);
		}
	}
});

if (opts.version || opts.V) {
	console.log(pkg.version);
	process.exit(0);
}

if (opts.help || opts.h) {
	console.log(
		`hred version ${pkg.version}

Reduce HTML (and XML) to JSON from the command line.
Details at: https://github.com/danburzo/hred

Usage: hred [options...] [query]

General options:

  -h, --help                 Print this help message
  -V, --version              Print hred version

Input options:

  -u <url>, --url=<url>      Specify base URL for relative HTML attributes
  -x, --xml                  Parse input as XML rather than HTML
  -f <file>, --file=<file>   Read the query from an external file instead of
                             passing it as an operand.

Output options:

  -c, --concat               Output array as concatenated JSON records
  -r, --raw                  Output raw (unquoted) strings

Examples:

  Get the "alt" and "src" HTML attributes of images on a Wikipedia page:

    curl https://en.wikipedia.org/wiki/Banana | hred "img { @alt, @src }"

  Read the titles and definitions of HTTP response codes from a MDN page:

    curl https://developer.mozilla.org/en-US/docs/Web/HTTP/Status | hred "
    dt {
      a ...{
        @href,
        ^ :scope > code @.textContent => title
      },
      :scope + dd @.textContent
    }
    "
`);
	process.exit(0);
}

// An option from `accepts_optarg` that is still `true` never received a value.
// Fail with a readable message instead of handing `true` to fs / jsdom.
for (const name of accepts_optarg) {
	if (opts[name] === true) {
		const flag = name.length === 1 ? `-${name}` : `--${name}`;
		console.error(`hred: option ${flag} requires an argument`);
		process.exit(1);
	}
}

// The query comes from the file given with -f / --file, if any,
// otherwise from the first operand.
const query_file = opts.f || opts.file;
const query = query_file ? fs.readFileSync(query_file, 'utf8') : operands[0];

/**
 * Serialize a query result according to the output options.
 *
 * With -c / --concat, an array result is written as one record per array
 * item, joined by newlines. With -r / --raw, string values are emitted as-is
 * instead of being JSON-quoted. Everything else is pretty-printed JSON.
 *
 * @param {*} res - value returned by hred()
 * @returns {string|undefined} text to write; `undefined` when `res` is
 *   `undefined`, since JSON.stringify(undefined) yields `undefined`
 */
function format_output(res) {
	const raw = Boolean(opts.raw || opts.r);
	const serialize = it =>
		raw && typeof it === 'string' ? it : JSON.stringify(it, null, 2);
	if ((opts.concat || opts.c) && Array.isArray(res)) {
		return res.map(serialize).join('\n');
	}
	return serialize(res);
}

let content = '';

stdin
	.setEncoding('utf8')
	.on('readable', () => {
		// Drain everything currently buffered on stdin.
		let chunk;
		while ((chunk = stdin.read()) !== null) {
			content += chunk;
		}
	})
	.on('end', () => {
		const res = hred(
			content,
			query || '^', // fall back to the query `^` when none was supplied
			opts.url || opts.u,
			opts.x || opts.xml ? 'application/xml' : 'text/html'
		);
		const out = format_output(res);
		// stdout.write() rejects `undefined`, so write nothing in that case.
		if (out !== undefined) {
			stdout.write(out);
		}
	});
You can find hred in the npm registry: 31 | 32 | ```bash 33 | # install hred globally with npm: 34 | npm install -g hred 35 | 36 | # install hred globally with yarn: 37 | yarn global add hred 38 | 39 | # run hred without installing it: 40 | npx hred 41 | ``` 42 | 43 | ## Usage 44 | 45 | hred accepts a qsx query string: 46 | 47 | ```bash 48 | curl https://en.wikipedia.org/wiki/Banana | hred "img { @alt, @src }" 49 | 50 | [ 51 | { 52 | "alt": "Page semi-protected", 53 | "src": "//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" 54 | }, 55 | { 56 | "alt": "Banana and cross section.jpg", 57 | "src": "//upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Banana_and_cross_section.jpg/250px-Banana_and_cross_section.jpg" 58 | }, 59 | ... 60 | ] 61 | ``` 62 | 63 | hred has the single, modest purpose of extracting parts of HTML as JSON. Because the qsx query language is a lightweight extension to the CSS selector syntax used by the `Element.querySelectorAll()` DOM method, hred offers only limited reshaping of the resulting JSON via aliases. The tool is designed to be piped to something like [`jq`](https://stedolan.github.io/jq/) if further JSON processing is necessary. 
64 | 65 | hred has a few options available: 66 | 67 | Option | Description 68 | ------ | ----------- 69 | `-c`, `--concat` | if the result is an array, return it as [concatenated JSON records](https://en.wikipedia.org/wiki/JSON_streaming#Concatenated_JSON), to make it easier to collate several results together 70 | `-f <file>`, `--file=<file>` | read the query from an external file instead of passing it as an operand 71 | `-h`, `--help` | print help message 72 | `-r`, `--raw` | a complement to `-c` that returns raw (unquoted) strings when the result is an array of strings 73 | `-u <url>`, `--url=<url>` | add the base URL against which the HTML should be evaluated; influences the value of the DOM properties `@.href`, `@.src` when the HTML attributes are relative 74 | `-V`, `--version` | display the current version 75 | `-x`, `--xml` | parse the input as XML rather than HTML 76 | 77 | ## A real-life example 78 | 79 | Let's take a web page that uses atomic, presentational CSS rather than semantic CSS classes (and thus makes it more challenging to extract data), such as [my starred repos page](https://github.com/danburzo?tab=stars). To extract info about the repositories, at the time of writing: 80 | 81 | ```bash 82 | curl https://github.com/danburzo\?tab\=stars | hred " 83 | .mb-1 { 84 | h3 a ...{ 85 | @href => url , 86 | @.textContent => title 87 | }, 88 | ^ :scope ~ .py-1 @.textContent => description 89 | }" 90 | ``` 91 | 92 | Let's break the query apart: 93 | 94 | > For each element with the class `mb-1`: 95 | > 1. on the one hand, find `<a>` elements nested into `<h3>`s: 96 | > 1. read their `href` HTML attribute as `url` and their `textContent` DOM property as `title`; 97 | > 2. merge the resulting object into the current scope with the spread operator (`...`); 98 | > 1. on the other hand, find the first (`^`) subsequent element (`:scope ~`) that matches the class `py-1` 99 | > 1. extract its `textContent` as `description`. 100 | 101 | The resulting JSON, abridged: 102 | 103 | ```json 104 | [ 105 | { 106 | "url": "/urfave/cli", 107 | "title": "\n urfave / cli\n ", 108 | "description": "\n \n A simple, fast, and fun package for building command line apps in Go\n \n " 109 | }, 110 | ``` 111 | 112 | ## A note on security 113 | 114 | hred uses [jsdom](https://github.com/jsdom/jsdom) as its DOM environment, which has the ability to run the JavaScript included in web pages. Because scripts specially crafted to attack jsdom may potentially evade the sandbox to which their execution is confined and access your machine through Node.js APIs, [script execution is disabled](https://github.com/jsdom/jsdom#executing-scripts); furthermore, external resources (scripts, images, stylesheets, iframes) are not fetched. Even with these precautions, be careful with what web pages you process with hred; when in doubt, inspect the page's source code beforehand. 
115 | 116 | ## Related projects 117 | 118 | You might be interested in these: 119 | 120 | * [pup](https://github.com/ericchiang/pup/) was the original _`jq` for HTML_; 121 | * [x-ray](https://github.com/matthewmueller/x-ray) has the concept of including HTML attributes in the query string; 122 | * [gdom](https://github.com/syrusakbary/gdom) — `qsx` looks a bit like GraphQL, so maybe GraphQL for DOM can be a thing; 123 | * [tq](https://github.com/plainas/tq) — another popular CLI tool for extracting data from HTML; 124 | * [htmlq](https://github.com/mgdm/htmlq) — like `jq`, but for HTML; 125 | * [xidel](https://github.com/benibela/xidel) supports a variety of query languages (CSS, XQuery, XPath, etc.); 126 | * [wikipedia_ql](https://github.com/zverok/wikipedia_ql) — a query language for efficient data extraction from Wikipedia; 127 | * [dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools/) maintains a comprehensive list of command-line tools for manipulating structured text data. --------------------------------------------------------------------------------