├── .github └── workflows │ ├── install_from_github_repo.yml │ └── pythonapp.yml ├── .gitignore ├── .pylintrc ├── .travis.yml ├── LICENSE ├── README.md ├── bin └── tq ├── doc ├── compile_manpage.sh ├── tq.1 └── tq.1.md ├── setup.py ├── tests ├── no-selector.args ├── no-selector.expected-output ├── no-selector.html ├── not-first-of-type.args ├── not-first-of-type.expected-output ├── not-first-of-type.html ├── utf8.args ├── utf8.expected-output ├── utf8.html ├── version.args ├── version.expected-output └── version.html ├── tq └── __init__.py ├── tq_bin └── tqtest.sh /.github/workflows/install_from_github_repo.yml: -------------------------------------------------------------------------------- 1 | name: Install from github zipball (stable branch) 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Set up Python 3.7 10 | uses: actions/setup-python@v1 11 | with: 12 | python-version: 3.7 13 | - name: Install tq 14 | run: | 15 | sudo apt-get install python3-setuptools 16 | sudo -H pip3 install --upgrade pip 17 | sudo -H pip3 install https://github.com/plainas/tq/zipball/stable 18 | - name: Run tq 19 | run: | 20 | echo "ok, which version are we at...." 21 | tq -v 22 | echo "Let's check what's on hacker news today" 23 | curl https://news.ycombinator.com/news | tq -tj ".title a" 24 | -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: Install and test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v1 10 | - name: Set up Python 3.7 11 | uses: actions/setup-python@v1 12 | with: 13 | python-version: 3.7 14 | - name: Install 15 | run: | 16 | pip install . 17 | - name: Run tests 18 | run: ./tqtest.sh 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | MANIFEST 3 | build/ 4 | dist/ 5 | *.pyc 6 | *.egg-info/ 7 | *.actual-output 8 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable = bad-whitespace, fixme, invalid-name, missing-docstring, wrong-import-order 3 | max-line-length = 150 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.7" 4 | cache: pip 5 | install: pip install pylint -e . 6 | script: 7 | - pylint tq/ 8 | - ./tqtest.sh 9 | notifications: 10 | email: false 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright Pedro (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tq 2 | 3 | tq is command line utility that performs an HTML element selection on HTML content passed to the stdin. Using css selectors that everybody knows. 4 | 5 | Since input comes from stdin and output is sent to stdout, it can easily be used inside traditional UNIX pipelines to extract content from webpages and html files. 6 | 7 | tq provides extra formating options such as json-encoding or newlines squashing, so it can play nicely with everyones favourite command line tooling. 8 | 9 | 10 | ## Installation 11 | 12 | sudo pip3 install https://github.com/plainas/tq/zipball/stable 13 | 14 | ## Example usage 15 | 16 | Get headlines from hacker news 17 | 18 | curl https://news.ycombinator.com/news | tq -tj ".title a" 19 | 20 | Get the title of an html document stored in a file 21 | 22 | cat mydocument.html | tq -t title 23 | 24 | Get all the images from a webpage 25 | 26 | curl -s 'http://example.com/' | tq "img" -a src | wget -i - 27 | 28 | 29 | Notice that tq doesn't provide a way to make http requests or read files. You can use your favorite HTTP client, or provide the html source from any source you want. 30 | 31 | For a modern, user friendly http client, check httpie. Or you can just use curl, wget, netcat, etc. 32 | 33 | ## Command options 34 | 35 | * `SELECTOR` 36 | A css selector 37 | 38 | * `-a ATTRIBUTE --attr=ATTRIBUTE` 39 | Outputs only the contents of the html ATTRIBUTE. 40 | 41 | * `-t, --text` 42 | Outputs only the inner text of the selected elements. 43 | 44 | * `-p, --parent` 45 | Select the parent elements instead. 46 | 47 | * `-q, --squash` 48 | Squash lines. 49 | 50 | * `-s, --squash-space` 51 | Squash spaces. 52 | 53 | * `-j, --json-lines` 54 | JSON encode each match. 55 | 56 | * `-J, --json` 57 | Output as json array of strings. 58 | 59 | * `-v, --version` 60 | Prints tq version 61 | -------------------------------------------------------------------------------- /bin/tq: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import tq 3 | 4 | if __name__ == '__main__': 5 | tq.main() -------------------------------------------------------------------------------- /doc/compile_manpage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # ronn is used to turn the markdown into a manpage. 4 | # Get ronn at https://github.com/rtomayko/ronn 5 | # Alternately, since ronn is a Ruby gem, you can just 6 | # `gem install ronn` 7 | 8 | ronn --roff tq.1.md 9 | -------------------------------------------------------------------------------- /doc/tq.1: -------------------------------------------------------------------------------- 1 | .\" generated with Ronn/v0.7.3 2 | .\" http://github.com/rtomayko/ronn/tree/0.7.3 3 | . 4 | .TH "TQ" "1" "November 2015" "" "" 5 | . 6 | .SH "NAME" 7 | \fBtq\fR \- Terminal based HTML query tool 8 | . 9 | .SH "SYNOPSIS" 10 | cat file\.html | \fBtq\fR [\fIoptions\fR] SELECTOR 11 | . 12 | .SH "DESCRIPTION" 13 | Perform a css query with SELECTOR on an html document passed to the standard input\. 14 | . 15 | .SH "OPTIONS" 16 | . 17 | .IP "\(bu" 4 18 | \fISELECTOR\fR A css selector 19 | . 20 | .IP "\(bu" 4 21 | \fB\-a\fR\fIATTRIBUTE\fR\fB\-\-attr=\fR\fIATTRIBUTE\fR Outputs only the contents of the html ATTRIBUTE\. 22 | . 23 | .IP "\(bu" 4 24 | \fB\-t, \-\-text\fR Outputs only the inner text of the selected elements\. 25 | . 26 | .IP "\(bu" 4 27 | \fB\-p, \-\-parrent\fR Select the parent elements instead\. 28 | . 29 | .IP "\(bu" 4 30 | \fB\-q, \-\-squash\fR Squash lines\. 31 | . 32 | .IP "\(bu" 4 33 | \fB\-s, \-\-squash\-space\fR Squash spaces\. 34 | . 35 | .IP "\(bu" 4 36 | \fB\-j, \-\-json\-lines\fR JSON encode each match\. 37 | . 38 | .IP "\(bu" 4 39 | \fB\-J, \-\-json\fR Output as json array of strings\. 40 | . 41 | .IP "\(bu" 4 42 | \fB\-v, \-\-version\fR Prints tq version 43 | . 44 | .IP "" 0 45 | . 46 | .SH "EXAMPLES" 47 | . 48 | .SS "Get headlines from hacker news" 49 | curl https://news\.ycombinator\.com/news | tq \-tj "\.title a" 50 | . 51 | .SS "Download a gallery of nice forest pictures from flickr" 52 | . 53 | .nf 54 | 55 | curl \-s \'https://www\.flickr\.com/photos/tgerus/galleries/72157622468645106/\' \e 56 | | tq " \.pc_img" \-a src \e 57 | | wget \-i 58 | . 59 | .fi 60 | . 61 | .SH "AUTHORS" 62 | \fBtq\fR was written by Pedro \fIpedroghcode@gmail\.com\fR\. 63 | . 64 | .SH "DISTRIBUTION" 65 | The latest version of tq may be downloaded from https://github\.com/plainas/tq 66 | . 67 | .SH "SEE ALSO" 68 | curl(1), wget(1), jq(1) 69 | -------------------------------------------------------------------------------- /doc/tq.1.md: -------------------------------------------------------------------------------- 1 | tq(1) -- Terminal based HTML query tool 2 | ============================================= 3 | 4 | ## SYNOPSIS 5 | 6 | cat file.html | `tq` [] SELECTOR 7 | 8 | ## DESCRIPTION 9 | 10 | Perform a css query with SELECTOR on an html document passed to the standard input. 11 | 12 | ## OPTIONS 13 | 14 | * _SELECTOR_ 15 | A css selector 16 | 17 | * `-a `_ATTRIBUTE_` --attr=`_ATTRIBUTE_ 18 | Outputs only the contents of the html ATTRIBUTE. 19 | 20 | * `-t, --text` 21 | Outputs only the inner text of the selected elements. 22 | 23 | * `-p, --parent` 24 | Select the elements instead. 25 | 26 | * `-q, --squash` 27 | Squash lines. 28 | 29 | * `-s, --squash-space` 30 | Squash spaces. 31 | 32 | * `-j, --json-lines` 33 | JSON encode each match. 34 | 35 | * `-J, --json` 36 | Output as json array of strings. 37 | 38 | * `-v, --version` 39 | Prints tq version 40 | 41 | 42 | ## EXAMPLES 43 | 44 | 45 | ### Get headlines from hacker news 46 | 47 | curl https://news.ycombinator.com/news | tq -tj ".title a" 48 | 49 | ### Download a gallery of nice forest pictures from flickr 50 | 51 | curl -s 'https://www.flickr.com/photos/tgerus/galleries/72157622468645106/' \ 52 | | tq " .pc_img" -a src \ 53 | | wget -i 54 | 55 | 56 | ## AUTHORS 57 | 58 | `tq` was written by Pedro . 59 | 60 | ## DISTRIBUTION 61 | The latest version of tq may be downloaded from https://github.com/plainas/tq 62 | 63 | ## SEE ALSO 64 | 65 | curl(1), wget(1), jq(1) 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from distutils.core import setup 4 | 5 | setup( 6 | name='tq', 7 | version='0.2.1', 8 | description='Comand line css selector over HTML', 9 | author='Pedro', 10 | author_email='pedroghcode@gmail.com', 11 | url='https://github.com/plainas/tq', 12 | packages= ['tq'], 13 | scripts=['bin/tq'], 14 | install_requires=[ 15 | 'beautifulsoup4==4.8.1', 16 | 'setuptools==39.0.1' 17 | ], 18 | classifiers=[ 19 | 'License :: OSI Approved :: MIT License', 20 | 'Programming Language :: Python', 21 | 'Programming Language :: Python :: 3', 22 | 'Programming Language :: Python :: 3.7', 23 | 'Programming Language :: Python :: Implementation :: CPython', 24 | 'Programming Language :: Python :: Implementation :: PyPy', 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /tests/no-selector.args: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/plainas/tq/3f5ec2805569c3766ea3ae0b765ea96997c1747d/tests/no-selector.args -------------------------------------------------------------------------------- /tests/no-selector.expected-output: -------------------------------------------------------------------------------- 1 | usage: tq [-h] [-t] [-q] [-s] [-j] [-J] [-v] [-a ATTR] [-p] [selector] 2 | tq: error: the following arguments are required: selector 3 | -------------------------------------------------------------------------------- /tests/no-selector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | tqtest 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/not-first-of-type.args: -------------------------------------------------------------------------------- 1 | --text .a-class:not(:first-of-type) -------------------------------------------------------------------------------- /tests/not-first-of-type.expected-output: -------------------------------------------------------------------------------- 1 | B 2 | C 3 | -------------------------------------------------------------------------------- /tests/not-first-of-type.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | tqtest 5 | 6 | 7 |
A
8 |
B
9 |
C
10 | 11 | 12 | -------------------------------------------------------------------------------- /tests/utf8.args: -------------------------------------------------------------------------------- 1 | --text .Ça -------------------------------------------------------------------------------- /tests/utf8.expected-output: -------------------------------------------------------------------------------- 1 | LÀ 2 | -------------------------------------------------------------------------------- /tests/utf8.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | tqtest 5 | 6 | 7 |
8 |
RATÉ
9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/version.args: -------------------------------------------------------------------------------- 1 | --version -------------------------------------------------------------------------------- /tests/version.expected-output: -------------------------------------------------------------------------------- 1 | 0.2.1 2 | -------------------------------------------------------------------------------- /tests/version.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tq/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test non unicode input with: 3 | curl https://www.flashback.org/| ./tq.py -Jt ".td_forum" 4 | 5 | Test unicode input 6 | curl https://news.ycombinator.com/news | ./tq.py -Jt ".title a" 7 | 8 | curl https://www.flashback.org/t2494391 | ./tq.py -j ".post_message" 9 | 10 | """ 11 | 12 | #TODO: use add_mutually_exclusive_group() 13 | 14 | import sys 15 | from bs4 import BeautifulSoup 16 | import argparse 17 | import json 18 | import codecs 19 | import io 20 | import pkg_resources 21 | 22 | 23 | VERSION = pkg_resources.get_distribution('tq').version 24 | 25 | 26 | def main(argv=None): 27 | parser = argparse.ArgumentParser(description="Performs a css selection on an HTML document.", prog="tq") 28 | parser.add_argument("selector", nargs="?", help="A css selector") 29 | parser.add_argument("-t", "--text", action="store_true", help="Outputs only the inner text of the selected elements.") 30 | parser.add_argument("-q", "--squash", action="store_true", help="Squash lines.") 31 | parser.add_argument("-s", "--squash-space", action="store_true", help="Squash spaces.") 32 | parser.add_argument("-j", "--json-lines", action="store_true", help="JSON encode each match.") 33 | parser.add_argument("-J", "--json", action="store_true", help="Output as json array of strings.") 34 | parser.add_argument("-v", "--version", action="store_true", help="Ouputs tq version") 35 | parser.add_argument("-a", "--attr", help="Ouputs only te contents of given HTML attribute of selected elements") 36 | parser.add_argument("-p", "--parent", action="store_true", help="Select the parents of the elements matching the selector") 37 | 38 | args = parser.parse_args(argv) 39 | 40 | if args.version: 41 | print(VERSION) 42 | return 43 | 44 | if not args.selector: 45 | parser.error("the following arguments are required: selector") 46 | 47 | if args.json and args.json_lines: 48 | parser.error("--json and --json-lines options cannot be used simultaniously") 49 | 50 | def get_els(css_selector): 51 | input_stream = io.TextIOWrapper(sys.stdin.buffer, errors='ignore') 52 | soup = BeautifulSoup(input_stream, "html.parser") 53 | return soup.select(css_selector) 54 | 55 | selected_els = get_els(args.selector) 56 | 57 | if args.parent: 58 | selected_els = [el.parent for el in selected_els] 59 | 60 | if args.attr: 61 | selected_els = [el.get(args.attr) for el in selected_els if args.attr in el.attrs] 62 | 63 | if args.text: 64 | selected_els = [el.get_text() for el in selected_els] 65 | 66 | if args.squash: 67 | selected_els = [el.replace('\n', ' ').replace('\r', '') for el in selected_els] 68 | 69 | if args.squash_space: 70 | selected_els = [el.replace('\t', ' ') for el in selected_els] 71 | selected_els = [' '.join(el.split(' ')) for el in selected_els] 72 | 73 | if args.json or args.json_lines: 74 | selected_els = [json.dumps(str(el_text)) for el_text in selected_els] 75 | 76 | if args.json: 77 | sys.stdout.write(json.dumps(selected_els, indent=1)) 78 | sys.stdout.write("\n") 79 | else: 80 | for el_text in selected_els: 81 | sys.stdout.write(str(el_text) + "\n") 82 | -------------------------------------------------------------------------------- /tq_bin: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import tq 3 | 4 | if __name__ == '__main__': 5 | tq.main() -------------------------------------------------------------------------------- /tqtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script executes all tests f the tests/ directory. 4 | 5 | # For every .html file in it, it is passed as input to the `tq` command, 6 | # the options in the corresponding .args file are provided as arguments, 7 | # and the resulting output is compared to the matching .expected-output file. 8 | 9 | set -o pipefail -o errexit -o nounset 10 | 11 | for input_file in tests/*.html; do 12 | echo $input_file... 13 | basename=${input_file%.*} 14 | tq $(cat $basename.args) <${input_file} >$basename.actual-output 2>&1 || true 15 | diff $basename.expected-output $basename.actual-output 16 | echo OK 17 | done --------------------------------------------------------------------------------