├── index.js
├── package.json
├── LICENSE
├── .gitignore
├── cli.js
├── .github
    └── hred.svg
└── README.md


/index.js:
--------------------------------------------------------------------------------
1 | let { JSDOM } = require('jsdom');
2 | let qsx = require('qsx');
3 | 
4 | module.exports = function(content, query, url, contentType) {
5 | 	let doc = new JSDOM(content, { url, contentType }).window.document;
6 | 	return qsx(doc, query);
7 | }


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "hred",
 3 |   "version": "1.5.1",
 4 |   "main": "index.js",
 5 |   "repository": "git@github.com:danburzo/hred.git",
 6 |   "author": "Dan Burzo <dan@danburzo.ro>",
 7 |   "license": "MIT",
 8 |   "dependencies": {
 9 |     "jsdom": "^16.6.0",
10 |     "opsh": "^1.0.0",
11 |     "qsx": "^3.3.0"
12 |   },
13 |   "bin": {
14 |     "hred": "./cli.js"
15 |   }
16 | }
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Dan Burzo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | 
  9 | # Diagnostic reports (https://nodejs.org/api/report.html)
 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 11 | 
 12 | # Runtime data
 13 | pids
 14 | *.pid
 15 | *.seed
 16 | *.pid.lock
 17 | 
 18 | # Directory for instrumented libs generated by jscoverage/JSCover
 19 | lib-cov
 20 | 
 21 | # Coverage directory used by tools like istanbul
 22 | coverage
 23 | *.lcov
 24 | 
 25 | # nyc test coverage
 26 | .nyc_output
 27 | 
 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 29 | .grunt
 30 | 
 31 | # Bower dependency directory (https://bower.io/)
 32 | bower_components
 33 | 
 34 | # node-waf configuration
 35 | .lock-wscript
 36 | 
 37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 38 | build/Release
 39 | 
 40 | # Dependency directories
 41 | node_modules/
 42 | jspm_packages/
 43 | 
 44 | # TypeScript v1 declaration files
 45 | typings/
 46 | 
 47 | # TypeScript cache
 48 | *.tsbuildinfo
 49 | 
 50 | # Optional npm cache directory
 51 | .npm
 52 | 
 53 | # Optional eslint cache
 54 | .eslintcache
 55 | 
 56 | # Microbundle cache
 57 | .rpt2_cache/
 58 | .rts2_cache_cjs/
 59 | .rts2_cache_es/
 60 | .rts2_cache_umd/
 61 | 
 62 | # Optional REPL history
 63 | .node_repl_history
 64 | 
 65 | # Output of 'npm pack'
 66 | *.tgz
 67 | 
 68 | # Yarn Integrity file
 69 | .yarn-integrity
 70 | 
 71 | # dotenv environment variables file
 72 | .env
 73 | .env.test
 74 | 
 75 | # parcel-bundler cache (https://parceljs.org/)
 76 | .cache
 77 | 
 78 | # Next.js build output
 79 | .next
 80 | 
 81 | # Nuxt.js build / generate output
 82 | .nuxt
 83 | dist
 84 | 
 85 | # Gatsby files
 86 | .cache/
 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
 88 | # https://nextjs.org/blog/next-9-1#public-directory-support
 89 | # public
 90 | 
 91 | # vuepress build output
 92 | .vuepress/dist
 93 | 
 94 | # Serverless directories
 95 | .serverless/
 96 | 
 97 | # FuseBox cache
 98 | .fusebox/
 99 | 
100 | # DynamoDB Local files
101 | .dynamodb/
102 | 
103 | # TernJS port file
104 | .tern-port
105 | 


--------------------------------------------------------------------------------
/cli.js:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | let opsh = require('opsh');
  3 | let hred = require('./index.js');
  4 | let pkg = require('./package.json');
  5 | let fs = require('fs');
  6 | 
  7 | let { stdin, stdout } = process;
  8 | 
  9 | // Set of options accepting an argument
 10 | let accepts_optarg = new Set(['u', 'url', 'f', 'file']);
 11 | 
 12 | // Accumulate options and their arguments on the one hand,
 13 | // and operands, on the other
 14 | let opts = {}; 
 15 | let operands = [];
 16 | opsh(process.argv.slice(2), {
 17 | 	option(option, value) {
 18 | 		opts[option] = value !== undefined ? value : true;
 19 | 	},
 20 | 	operand(operand, opt) {
 21 | 		if (opt !== undefined && accepts_optarg.has(opt)) {
 22 | 			opts[opt] = operand;
 23 | 		} else {
 24 | 			operands.push(operand);
 25 | 		}
 26 | 	}
 27 | });
 28 | 
 29 | if (opts.version || opts.V) {
 30 | 	console.log(pkg.version);
 31 | 	process.exit(0);
 32 | }
 33 | 
 34 | if (opts.help || opts.h) {
 35 | 	console.log(
 36 | `hred version ${pkg.version}
 37 | 
 38 | Reduce HTML (and XML) to JSON from the command line.
 39 | Details at: https://github.com/danburzo/hred
 40 | 
 41 | Usage: hred [options...]
 42 | 
 43 | General options:
 44 | 
 45 | -h, --help                Print this help message
 46 | -V, --version             Print hred version
 47 | 
 48 | Input options:
 49 | 
 50 | -u <url>, --url=<url>     Specify base URL for relative HTML attributes
 51 | -x, --xml                 Parse input as XML rather than HTML
 52 | -f <file>, --file=<file>  Read the query from an external file instead of
 53 |                           passing it as an operand.
 54 | 
 55 | Output options:
 56 | 
 57 | -c, --concat              Output array as concatenated JSON records
 58 | -r, --raw                 Output raw (unquoted) strings
 59 | 
 60 | Examples:
 61 | 
 62 | Get the "alt" and "src" HTML attributes of images on a Wikipedia page:
 63 | 	
 64 | 	curl https://en.wikipedia.org/wiki/Banana | hred "img { @alt, @src }"
 65 | 
 66 | Read the titles and definitions of HTTP response codes from a MDN page:
 67 | 
 68 | 	curl https://developer.mozilla.org/en-US/docs/Web/HTTP/Status | hred "
 69 | 		dt { 
 70 | 			a { 
 71 | 				@href, 
 72 | 				^ :scope > code @.textContent >> title
 73 | 			} >> .,
 74 | 			:scope + dd @.textContent
 75 | 		}
 76 | 	"
 77 | `);
 78 | 	process.exit(0);
 79 | }
 80 | 
 81 | let query = opts.f || opts.file ? fs.readFileSync(opts.f || opts.file, 'utf8') : operands[0];
 82 | 
 83 | let content = '';
 84 | 
 85 | stdin
 86 | 	.setEncoding('utf8')
 87 | 	.on('readable', () => {
 88 | 		let chunk;
 89 | 		while ((chunk = stdin.read()) !== null) {
 90 | 			content += chunk;
 91 | 		}
 92 | 	}).on('end', () => {
 93 | 		let res = hred(content, query || '^', opts.url || opts.u, (opts.x || opts.xml) ? 'application/xml' : 'text/html'), out;
 94 | 		if ((opts.concat || opts.c) && Array.isArray(res)) {
 95 | 			out = res.map(it => {
 96 | 				if ((opts.raw || opts.r) && typeof it === 'string') {
 97 | 					return it;
 98 | 				}
 99 | 				return JSON.stringify(it, null, 2);
100 | 			}).join('\n');
101 | 		} else {
102 | 			out = (opts.raw || opts.r) && typeof res === 'string' ? res : JSON.stringify(res, null, 2);
103 | 		}
104 | 		stdout.write(out);
105 | 	});


--------------------------------------------------------------------------------
/.github/hred.svg:
--------------------------------------------------------------------------------
 1 | <svg version="1.1" xmlns="http://www.w3.org/2000/svg" width="158.763px" height="50px" viewBox="0 0 158.763 50">
 2 | 	<path fill="#CC9252" d="M5.89,31.977l15.827,7.429c-0.517,1.809-1.744,3.876-2.584,4.974L0.141,34.238
 3 | 		c-0.258-1.227-0.129-2.003,0.129-3.424l19.703-9.561c0.904,1.098,1.227,2.196,1.615,3.488L5.89,31.977z M27.854,49.031
 4 | 		c0.194-1.744,2.326-11.305,2.519-13.889C30.632,25,30.826,14.535,31.019,4.07c-1.034-0.129-2.842-0.323-3.747-0.517
 5 | 		c-0.129-0.517-0.194-1.163-0.129-1.809c1.68-0.452,5.749-1.551,7.364-1.744c0.84,0.388,1.938,1.357,2.261,2.003l-2.196,29.134
 6 | 		l0.388,0.129c2.39-3.941,7.235-11.563,10.401-15.31c1.034-0.323,3.359-0.194,4.522,0c1.034,0.646,2.132,1.486,3.101,2.778
 7 | 		c-1.034,7.041-2.455,23.966-2.455,26.938l3.747-0.452c0.129,0.258,0.517,1.357,0.517,1.68c-1.098,0.969-4.845,3.036-5.491,3.036
 8 | 		c-1.227,0-3.682-2.261-3.682-3.295c0.646-7.364,1.55-18.799,2.584-25.904c-0.452-0.581-1.163-0.969-1.68-1.034
 9 | 		c-2.972,3.036-10.982,14.018-12.08,17.119c-0.452,4.134-0.84,10.53-0.775,11.434c-0.452,0.517-3.036,1.68-3.876,1.68
10 | 		C29.404,49.935,28.306,49.548,27.854,49.031z M60.09,49.031c0.194-1.615,2.261-11.434,2.455-14.018l0.581-15.245l-3.941,0.323
11 | 		c-0.194-0.517-0.258-1.357-0.258-1.744c1.938-1.098,5.685-2.584,6.331-2.584c1.163,0,2.713,1.551,2.842,2.326l-1.615,13.824
12 | 		l0.388,0.065c1.615-4.328,6.073-15.568,7.494-16.279c2.261-0.065,4.328,0.581,6.008,1.938c-0.258,1.357-1.486,4.457-2.584,5.814
13 | 		c-1.809-0.258-2.713-0.711-3.553-1.034c-2.067,3.23-6.396,11.757-7.429,14.858c-0.646,4.328-1.034,9.884-0.969,10.982
14 | 		C65.258,48.773,62.738,50,61.899,50C61.511,50,60.542,49.483,60.09,49.031z M101.047,43.217c-2.455,3.036-7.106,6.718-11.046,6.718
15 | 		c-2.519,0-6.848-4.457-7.687-7.364c-0.517-1.744-1.034-3.941-1.034-7.881c0.065-6.654,1.809-11.434,4.587-15.375
16 | 		c1.421-2.067,5.168-3.618,7.235-3.618c5.62,0,8.269,3.553,8.463,6.072c0.194,2.713-0.84,8.01-2.261,9.173
17 | 		c-3.165,2.649-8.204,3.811-13.114,4.587c0.194,7.171,3.295,9.819,5.878,10.659c2.649-0.129,6.589-3.682,7.687-4.716
18 | 		C100.207,41.796,100.918,42.7,101.047,43.217z M86.189,33.333c1.744-0.452,6.395-1.55,8.462-2.778
19 | 		c1.938-1.357,3.036-8.075,2.39-10.853c-1.357-0.969-4.134-1.68-5.491-1.486C89.096,21.447,86.512,26.486,86.189,33.333z
20 | 		 M107.378,48.643c-0.711-0.646-1.68-3.165-1.68-4.522c-0.065-5.556,2.39-19.768,6.202-25.84c1.163-1.421,3.165-2.52,4.393-2.52
21 | 		c1.55,0,6.266,0.84,8.204,1.809l0.775-13.501c-1.034-0.129-3.101-0.323-3.941-0.517c-0.065-0.452-0.194-1.227-0.129-1.809
22 | 		c1.68-0.452,6.072-1.551,7.687-1.744c0.84,0.388,1.938,1.357,2.196,2.003l-3.036,25.194c-0.452,6.072-0.904,12.855-1.098,18.476
23 | 		l3.747-0.452c0.194,0.323,0.452,1.227,0.452,1.68c-1.163,0.969-4.587,3.036-5.555,3.036c-1.228,0-3.811-2.326-3.682-3.489
24 | 		l1.809-13.178l-0.387-0.129c-2.713,5.362-10.724,16.796-12.855,16.796C109.832,49.935,108.217,49.419,107.378,48.643z
25 | 		 M123.98,27.39l0.452-7.041c-2.067-0.969-6.783-1.68-8.527-1.551c-3.101,6.46-5.491,20.22-5.491,25.452
26 | 		c0.129,0.388,1.034,0.582,1.357,0.582C114.548,42.313,122.365,31.137,123.98,27.39z M158.67,30.814
27 | 		c0.194,1.227,0.065,1.938-0.194,3.424l-19.638,9.561c-0.904-1.098-1.227-2.196-1.615-3.553l15.633-7.235l-15.827-7.429
28 | 		c0.582-1.809,1.744-3.811,2.584-4.91L158.67,30.814z"/>
29 | </svg>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![hred](./.github/hred.svg)
  2 | 
  3 | <a href="https://www.npmjs.org/package/hred"><img src="https://img.shields.io/npm/v/hred.svg?style=flat-square&labelColor=CC9252&color=black" alt="npm version"></a>
  4 | 
  5 | hred (**h**tml **red**uce) is a command-line tool to extract data from HTML. It reads HTML from the standard input and outputs the JSON produced by a [qsx query](https://github.com/danburzo/qsx):
  6 | 
  7 | ```bash
  8 | > curl https://danburzo.ro/rolodex/ | hred "article a { @href, @.textContent }"
  9 | [
 10 |   {
 11 |     "href": "http://www.3quarksdaily.com/",
 12 |     ".textContent": "3 Quarks Daily"
 13 |   },
 14 |   {
 15 |     "href": "http://50watts.com",
 16 |     ".textContent": "50 Watts"
 17 |   },
 18 |   {
 19 |     "href": "http://aworkinglibrary.com/",
 20 |     ".textContent": "A Working Library"
 21 |   },
 22 |   ...
 23 | ]
 24 | ``` 
 25 | 
 26 | [The qsx documentation](https://github.com/danburzo/qsx) describes the kinds of queries you can make with hred, but if you're familiar with CSS selectors you're mostly good to go.
 27 | 
 28 | ## Installation
 29 | 
 30 | hred runs on Node.js. You can find hred in the npm registry:
 31 | 
 32 | ```bash
 33 | # install hred globally with npm:
 34 | npm install -g hred
 35 | 
 36 | # install hred globally with yarn:
 37 | yarn global add hred
 38 | 
 39 | # run hred without installing it:
 40 | npx hred 
 41 | ```
 42 | 
 43 | ## Usage
 44 | 
 45 | hred accepts a qsx query string:
 46 | 
 47 | ```bash
 48 | curl https://en.wikipedia.org/wiki/Banana | hred "img { @alt, @src }"
 49 | 
 50 | [
 51 |   {
 52 |     "alt": "Page semi-protected",
 53 |     "src": "//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png"
 54 |   },
 55 |   {
 56 |     "alt": "Banana and cross section.jpg",
 57 |     "src": "//upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Banana_and_cross_section.jpg/250px-Banana_and_cross_section.jpg"
 58 |   },
 59 |   ...
 60 | ]
 61 | ```
 62 | 
 63 | hred has the single, modest purpose of extracting parts of HTML as JSON. Because the qsx query language is a lightweight extension to the CSS selector syntax used by the `Element.querySelectorAll()` DOM method, hred offers only limited reshaping of the resulting JSON via aliases. The tool is designed to be piped to something like [`jq`](https://stedolan.github.io/jq/) if further JSON processing is necessary.
 64 | 
 65 | hred has a few options available:
 66 | 
 67 | Option | Description
 68 | ------ | -----------
 69 | `-c`, `--concat` | if the result is an array, return it as [concatenated JSON records](https://en.wikipedia.org/wiki/JSON_streaming#Concatenated_JSON), to make it easier to collate several results together
 70 | `-f <queryfile>`, `--file=<queryfile>` | read the query from an external file instead of passing it as an operand
 71 | `-h`, `--help` | print help message
 72 | `-r`, `--raw` | output strings raw (unquoted) rather than as JSON; applies when the result is a single string or, in combination with `-c`, to each string item of an array result
 73 | `-u <url>`, `--url=<url>` | add the base URL against which the HTML should be evaluated; influences the value of the DOM properties `@.href`, `@.src` when the HTML attributes are relative
 74 | `-V`, `--version` | display the current version
 75 | `-x`, `--xml` | parse the input as XML rather than HTML
 76 | 
 77 | ## A real-life example
 78 | 
 79 | Let's take a web page that uses atomic, presentational CSS rather than semantic CSS classes (and thus makes it more challenging to extract data), such as [my starred repos page](https://github.com/danburzo?tab=stars). To extract info about the repositories, at the time of writing:
 80 | 
 81 | ```bash
 82 | curl https://github.com/danburzo\?tab\=stars | hred "
 83 | .mb-1 {
 84 | 	h3 a ...{ 
 85 | 		@href => url , 
 86 | 		@.textContent => title 
 87 | 	}, 
 88 | 	^ :scope ~ .py-1 @.textContent => description 
 89 | }"
 90 | ```
 91 | 
 92 | Let's break the query apart:
 93 | 
 94 | > For each element with the class `mb-1`:
 95 | > 1. on the one hand, find `<a>` elements nested into `<h3>`s: 
 96 | >    1. read their `href` HTML attribute as `url` and their `textContent` DOM property as `title`;
 97 | >    2. merge the resulting object into the current scope with `...`;
 98 | > 1. on the other hand, find the first (`^`) subsequent element (`:scope ~`) that matches the class `py-1`
 99 | >    1. extract its `textContent` as `description`. 
100 | 
101 | The resulting JSON, abridged:
102 | 
103 | ```json
104 | [
105 |   {
106 |     "url": "/urfave/cli",
107 |     "title": "\n        urfave / cli\n      ",
108 |     "description": "\n      \n        A simple, fast, and fun package for building command line apps in Go\n      \n  "
109 |   },
110 | ```
111 | 
112 | ## A note on security
113 | 
114 | hred uses [jsdom](https://github.com/jsdom/jsdom) as its DOM environment. jsdom has the ability to run the JavaScript included in web pages; because scripts specially crafted to attack jsdom may potentially evade the sandbox to which their execution is confined and access your machine through Node.js APIs, [script execution is disabled](https://github.com/jsdom/jsdom#executing-scripts); furthermore, external resources (scripts, images, stylesheets, iframes) are not fetched. Even with these precautions, be careful about which web pages you process with hred; when in doubt, inspect the page's source code beforehand.
115 | 
116 | ## Related projects
117 | 
118 | You might be interested in these:
119 | 
120 | * [pup](https://github.com/ericchiang/pup/) was the original _`jq` for HTML_;
121 | * [x-ray](https://github.com/matthewmueller/x-ray) has the concept of including HTML attributes in the query string; 
122 | * [gdom](https://github.com/syrusakbary/gdom) — `qsx` looks a bit like GraphQL, so maybe GraphQL for DOM can be a thing;
123 | * [tq](https://github.com/plainas/tq) — another popular CLI tool for extracting data from HTML;
124 | * [htmlq](https://github.com/mgdm/htmlq) — like `jq`, but for HTML;
125 | * [xidel](https://github.com/benibela/xidel) supports a variety of query languages (CSS, XQuery, XPath, etc.);
126 | * [wikipedia_ql](https://github.com/zverok/wikipedia_ql) — a query language for efficient data extraction from Wikipedia;
127 | * [dbohdan/structured-text-tools](https://github.com/dbohdan/structured-text-tools/) maintains a comprehensive list of command-line tools for manipulating structured text data.


--------------------------------------------------------------------------------