├── .gitignore
├── screenshots
│   └── stackoverflow.png
├── engines
│   ├── amazon.json
│   ├── mdn.json
│   ├── ddg.json
│   ├── google.json
│   ├── github.json
│   ├── jira.json
│   ├── gmail.json
│   ├── stackoverflow.json
│   ├── npm.json
│   └── reddit.json
├── open.sh
├── open_in_graphene.sh
├── scan_page.js
├── preview_full.js
├── graphene
├── preview.js
├── README.md
├── utils.js
└── search.js
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | .cache/
3 | package-lock.json
4 |
--------------------------------------------------------------------------------
/screenshots/stackoverflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atsepkov/Graphene/HEAD/screenshots/stackoverflow.png
--------------------------------------------------------------------------------
/engines/amazon.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=white,fg=yellow}Amazon",
3 | "query": "https://www.amazon.com/s?k=",
4 | "goodQuery": "desk",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "Next →",
8 | "href": "page="
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/engines/mdn.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=white,fg=blue}MDN web docs",
3 | "query": "https://developer.mozilla.org/en-US/search?q=",
4 | "goodQuery": "drag and drop",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "Next",
8 | "href": "page=",
9 | "id": "search-result-next"
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/engines/ddg.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=color166,fg=bright}DuckDuckGo",
3 | "query": "https://www.duckduckgo.com/?q=",
4 | "goodQuery": "food",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "Next",
8 | "href": "&start="
9 | },
10 | "resultsPerPage": 20,
11 | "resultModifier": "&num=",
12 | "minGroupSize": 30000
13 | }
14 |
--------------------------------------------------------------------------------
/engines/google.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=white,fg=blue}G{fg=red}o{fg=yellow}o{fg=blue}g{fg=green}l{fg=red}e",
3 | "query": "https://www.google.com/search?q=",
4 | "goodQuery": "food",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "Next",
8 | "href": "&start="
9 | },
10 | "resultsPerPage": 20,
11 | "resultModifier": "&num=",
12 | "minGroupSize": 30000
13 | }
14 |
--------------------------------------------------------------------------------
/engines/github.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=white,fg=black}GitHub",
3 | "query": "https://github.com/search?q=",
4 | "goodQuery": "react",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "Next",
8 | "href": "p="
9 | },
10 | "categories": {
11 | "language": [
12 | { "find": { "href": "l=" } }
13 | ],
14 | "tag": [
15 | { "find": { "href": "/topics/" } }
16 | ]
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/open.sh:
--------------------------------------------------------------------------------
1 | engine=$1
2 | line=$2
3 |
4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
5 |
6 | url=$(echo $line | sed 's#.*\(https*://\)#\1#')
7 | if [[ "$line" =~ \(pager\)$ ]]; then
8 | bash $DIR/graphene $engine $url
9 | else
10 | OPENCMD=open
11 | [[ $(which xdg-open) ]] && OPENCMD=xdg-open
12 |
13 | $OPENCMD $url
14 | result=$(echo $line | sed 's#\(.*\)https*://#\1#')
15 | node -e "require('$DIR/utils').writeHistory('$url', 'X', { engine: '$engine', result: '$result' })"
16 | fi
17 |
--------------------------------------------------------------------------------
/engines/jira.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=white,fg=red}Jira",
3 | "query": "https://{{URL}}/jira/secure/RapidBoard.jspa?view=planning.nodetail&quickFilter=71424&rapidView=",
4 | "goodQuery": "",
5 | "badQuery": "dlskjadbhlads",
6 | "authentication": {
7 | "loginPage": "https://{{URL}}/jira/login.jsp",
8 | "usernameSelector": "#login-form-username",
9 | "passwordSelector": "#login-form-password",
10 | "submitSelector": "#login-form-submit",
11 | "username": "{{USERNAME}}",
12 | "password": "{{PASSWORD}}"
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/engines/gmail.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=red,fg=white}gmail",
3 | "query": "https://mail.google.com/mail/u/0/#search/",
4 | "goodQuery": "",
5 | "badQuery": "dlskjadbhlads",
6 | "authentication": {
7 | "loginPage": "https://accounts.google.com/signin/v2/identifier",
8 | "usernameSelector": "#identifierId",
9 | "submitUsernameSelector": "#identifierNext",
10 | "passwordSelector": "input[name='password']",
11 | "submitPasswordSelector": "#passwordNext",
12 | "username": "{{USERNAME}}",
13 | "password": "{{PASSWORD}}"
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/engines/stackoverflow.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=color166,fg=white}StackOverflow",
3 | "query": "https://stackoverflow.com/search?q=",
4 | "goodQuery": "regex",
5 | "badQuery": "dlskjadbhlads",
6 | "pager": {
7 | "name": "next",
8 | "href": "page="
9 | },
10 | "minSize": 30000,
11 | "resultsPerPage": 20,
12 | "resultModifier": "&pagesize=",
13 | "categories": {
14 | "user": [
15 | { "find": { "href": "/users/", "name": ".+" } }
16 | ],
17 | "tag": [
18 | { "find": { "href": "/tagged/", "name": ".+" } }
19 | ]
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/engines/npm.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=red,fg=white}NPM",
3 | "query": "https://www.npmjs.com/search?q=",
4 | "goodQuery": "react",
5 | "badQuery": "dlskjadbhlads",
6 | "weights": {
7 | "context": 3,
8 | "coverage": 1,
9 | "area": 1,
10 | "textLength": 1,
11 | "numElements": 0
12 | },
13 | "pager": {
14 | "name": "»",
15 | "href": "page="
16 | },
17 | "categories": {
18 | "user": [
19 | { "find": { "href": "/~" } }
20 | ],
21 | "tag": [
22 | { "find": { "href": "q=keywords:" } }
23 | ]
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/engines/reddit.json:
--------------------------------------------------------------------------------
1 | {
2 | "banner": "{bg=red,fg=white}reddit",
3 | "query": "https://www.reddit.com/search/?q=",
4 | "goodQuery": "dating",
5 | "badQuery": "dlskjadbhlads",
6 | "authentication": {
7 | "loginPage": "https://www.reddit.com/login/",
8 | "usernameSelector": "#loginUsername",
9 | "passwordSelector": "#loginPassword",
10 | "submitSelector": "button.AnimatedForm__submitButton",
11 | "username": "{{USERNAME}}",
12 | "password": "{{PASSWORD}}"
13 | },
14 | "categories": {
15 | "subreddit": [
16 | { "find": { "href": "/r/[^/]+/$" } }
17 | ],
18 | "user": [
19 | { "find": { "href": "/user/" } }
20 | ]
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/open_in_graphene.sh:
--------------------------------------------------------------------------------
1 | engine=$1
2 | line=$2
3 |
4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
5 | url=$(echo $line | sed 's#.*\(https*://\)#\1#')
6 |
7 | # if [[ "$line" =~ \(pager\)$ ]]; then
8 | # bash $DIR/graphene $engine $url
9 | # else
10 | # open $url
11 | # fi
12 |
13 | # --bind "f1:execute(LINES=$LINES node '$DIR/preview_full.js' {} | less -r < /dev/tty > /dev/tty 2>&1)" \
14 | show_result() {
15 | local url
16 | url="$1"
17 | # TODO: fix, f1 is currently broken, the idea is to use it as copy/save mode
18 | node $DIR/scan_page.js $engine "$url" |
19 | fzf --reverse --ansi --tiebreak=begin,index \
20 | --bind "f1:execute(LINES=$LINES node '$DIR/preview_full.js' {} | nvim < /dev/tty > /dev/tty 2>&1)" \
21 | --preview-window=right:80% --preview="node '$DIR/preview_full.js' {}"
22 | }
23 |
24 | show_result "$url"
25 |
--------------------------------------------------------------------------------
/scan_page.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const { color, readCache, writeCache, writeHistory } = require('./utils');
3 |
4 | const engine = process.argv[2];
5 | const url = process.argv[3];
6 |
7 | (async () => {
8 | const browser = await puppeteer.launch();
9 | const page = await browser.newPage();
10 |
11 | // load cookies, if they exist
12 | const cookieData = readCache(engine, 'cookies');
13 | if (cookieData.cookies) {
14 | for (let cookie of cookieData.cookies) {
15 | await page.setCookie(cookie);
16 | }
17 | }
18 |
19 | await page.goto(url);
20 | const pageTitle = await page.title();
21 | writeHistory(url, 'R', { title: pageTitle });
22 | await page.addScriptTag({url: 'https://unpkg.com/turndown/dist/turndown.js'});
23 | await page.addScriptTag({url: 'https://unpkg.com/turndown-plugin-gfm/dist/turndown-plugin-gfm.js'});
24 |
25 | const content = await page.evaluate(() => {
26 | // return document.getElementsByTagName('body')[0].innerText;
27 |
28 | let turndownService = new TurndownService({
29 | headingStyle: 'atx',
30 | codeBlockStyle: 'fenced',
31 | bulletListMarker: '-',
32 | // linkStyle: 'referenced',
33 | // linkReferenceStyle: 'collapsed'
34 | });
35 | let gfm = turndownPluginGfm.gfm;
36 | turndownService.use(gfm);
37 | turndownService.remove('script');
38 | turndownService.remove('style');
39 | /*turndownService.addRule('url', { // force url to be one-line to avoid breaking later regex
40 | filter: ['a'],
41 | replacement: function (content, node) {
42 | let href = node.getAttribute('href');
43 | let title = node.title ? ' "' + node.title.replace('\n', ' ') + '"' : '';
44 | return '[' + content + '](' + href + ')';
45 | }
46 | });*/
47 | let markdown = turndownService.turndown(document.getElementsByTagName('body')[0].innerHTML);
48 | return markdown;
49 | });
50 |
51 | writeCache('_result_', 'preview', {
52 | url: url,
53 | content: content.split('\n')
54 | });
55 | content.split('\n').forEach((line, index) => {
56 | console.log(`${color.bright}\x1b[38;5;237m${('' + index).padEnd(3)}${color.reset}\x1b[38;5;244m${line}${color.reset}`);
57 | })
58 |
59 | await browser.close();
60 | })();
61 |
--------------------------------------------------------------------------------
/preview_full.js:
--------------------------------------------------------------------------------
1 | const { color, readCache, stringToChunks, MarkdownTableFormatter } = require('./utils');
2 | const line = process.argv[2];
3 | const lineNumber = parseInt(line.split(' ')[0]);
4 |
5 | const content = readCache('_result_', 'preview')
6 |
7 | // Computes offset based on line number
8 | function computeOffset(n) {
9 | //let height = process.stdout.rows;
10 | let height = process.env.LINES;
11 | let preferredCursorPos = 1/3;
12 | return Math.max(n - parseInt(height * preferredCursorPos), 0);
13 | }
14 |
15 | // Add color to the line depending on markdown element
16 | const NORMAL = 0;
17 | const CODE = 1;
18 | const TABLE = 2;
19 | let border = 4; // chars
20 | let maxLen = parseInt(process.env.COLUMNS * 0.8) - border; // 80% of the window
21 | let mode = NORMAL;
22 | let buffer = [];
23 | function pretty(line, next, highlight) {
24 |
25 | if (line.slice(0, 3) === '```') {
26 | // codeblock toggle
27 | mode = mode === NORMAL ? CODE : NORMAL;
28 | line = color.green + line + color.reset;
29 | console.log(line);
30 | return;
31 | } else if (line[0] === '|') {
32 | // enter table mode
33 | mode = TABLE;
34 | } else if (mode === TABLE) {
35 | // exit table mode
36 | let table = new MarkdownTableFormatter();
37 | try {
38 | table.format_table(buffer.join('\n'));
39 | console.log(table.output_table);
40 | } catch (e) {
41 | console.log(color.red + "[ Couldn't format table ]" + color.reset);
42 | }
43 | buffer = [];
44 | mode = NORMAL;
45 | }
46 |
47 | if (mode === NORMAL) {
48 | // urls
49 | line = line.replace(/\[(.*?)\]\(.*?\)/g, color.blue + '$1' + color.reset);
50 |
51 | if (line[0] === '#') {
52 | // header
53 | line = color.yellow + color.bright + line + color.reset;
54 | }
55 |
56 | // bold, italic, underline, code
57 | line = line.replace(/(?:^|\W)\*\*([^*]+)\*\*\s/g, ' ' + color.bright + '$1' + color.reset + ' ');
58 | line = line.replace(/(?:^|\W)\*([^*]+)\*\s/g, ' ' + color.italic + '$1' + color.reset + ' ');
59 | line = line.replace(/(?:^|\W)_([^_]+)_\s/g, ' ' + color.underscore + '$1' + color.reset + ' ');
60 | line = line.replace(/(?:^|\W)`([^`]+)`\s/g, ' ' + color.green + '$1' + color.reset + ' ');
61 | } else if (mode === TABLE) {
62 | buffer.push(line);
63 | return;
64 | } else {
65 | line = color.green + line + color.reset;
66 | }
67 |
68 | if (highlight) {
69 | line = color.bright + line + color.reset;
70 | }
71 |
72 | /*let chunks = stringToChunks(line, maxLen);
73 | if (chunks.length) {
74 | chunks.forEach(c => {
75 | console.log(c);
76 | });
77 | } else {
78 | console.log(line);
79 | }*/
80 | console.log(line)
81 | }
82 |
83 | // Create illusion of scrolled text based on selected line number
84 | let offset = computeOffset(lineNumber);
85 | console.log(color.blue + content.url + color.reset);
86 | // console.log(lineNumber, process.env.LINES, process.stdout.rows, offset)
87 | content.content.forEach((line, index) => {
88 | let next = content.content.length > index + 1 ? content.content[index + 1] : '';
89 | if (index > offset) {
90 | pretty(line, next, index === lineNumber);
91 | }
92 | });
93 |
--------------------------------------------------------------------------------
/graphene:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 |
5 | display_usage() {
6 | echo -e "\nUsage: $0 [options] [engine] [query]"
7 | echo -e " Search for [query] using specified search engine."
8 | echo -e "\n $0 [options] url [url]"
9 | echo -e " Parse a specific website/url."
10 | echo -e "Options:\n"
11 | echo -e "\t-h, --help\t\tPrint this help message and exit.\n"
12 | echo -e "\t-r [number]\t\tNumber of results you want the search to return (not all engines will respect this).\n"
13 | echo -e "\t-o\t\t\tShow only the main result group, and omit all other groups.\n"
14 | }
15 |
16 | show_search() {
17 | local engine query controls selection
18 | engine=$1
19 | query="${@:2}"
20 |
21 | controls="f1:execute(bash $DIR/open_in_graphene.sh $engine {}),f2:toggle-preview"
22 | selection="$(node "$DIR/search.js" $engine "$query" |
23 | fzf --reverse --ansi --tiebreak=begin,index --bind "$controls" \
24 | --preview-window=right:40% --header-lines=1 --preview="node '$DIR/preview.js' $engine {}")"
25 | if [ ! -z "$selection" ]; then
26 | bash "$DIR/open.sh" $engine "$selection"
27 | fi
28 | }
29 |
30 | # populates caches for all engines this script is aware of
31 | initialize() {
32 | local engine
33 | engine=$1
34 |
35 | echo "Rebuilding cache for $engine..."
36 | mkdir -p .cache
37 | export CACHING=1
38 | node "$DIR/search.js" $engine
39 | unset CACHING
40 | }
41 |
42 | # open history item
43 | open_history() {
44 | local date time urltype url
45 | date=$1
46 | time=$2
47 | urltype=$3
48 | url=$4
49 |
50 | case $urltype in
51 | U) # direct url
52 | show_search url $url
53 | ;;
54 | S) # search query
55 | show_search url $url
56 | ;;
57 | N) # navigational link
58 | show_search url $url
59 | ;;
60 | R) # result opened in graphene
61 | bash "$DIR/open_in_graphene.sh" $url
62 | ;;
63 | X) # externally opened result
64 | open $url
65 | ;;
66 | esac
67 | }
68 |
69 | if [ "$#" -lt 2 ]; then
70 | display_usage
71 | exit 1
72 | fi
73 |
74 | while getopts ":hr:o" opt; do
75 | case $opt in
76 | h|help) # help
77 | display_usage
78 | exit 0
79 | ;;
80 | r|results) # number of results to prefer
81 | export RESULTS=$OPTARG
82 | ;;
83 | o|only) # show only the main result group
84 | export ONLY_MAIN=1
85 | ;;
86 | \?)
87 | echo "Invalid option: -$OPTARG" >&2
88 | exit 1
89 | ;;
90 | esac
91 | done
92 | shift $((OPTIND-1))
93 | if [ "$1" == "history" ]; then
94 | # figure out if this has GNU date, or fallback to OSX date if not
95 | date="date -d @"
96 | date -d @1550000000 &>/dev/null
97 | if [ $? -eq 1 ]; then
98 | date="date -r "
99 | fi
100 |
101 | legend="\x1b[1m\x1b[30mS: Search page\tU: Direct URL\tN: Navigational page\tR: Graphene-opened search result\tX: Externally-opened search result\x1b[0m"
102 | esc=$(echo -e "\x1b") # sed seems to have trouble generating this sequence
103 | open_history $(cat "$DIR/.cache/history" | awk -v dt="$date" '{
104 | cmd = dt substr( $1, 1, length($1) - 3 ) " +\"%Y-%m-%d %H:%M\""
105 | if ( (cmd | getline dd) > 0 ) {
106 | $1 = dd
107 | }
108 | close(cmd)
109 | print
110 | }' | sed 's/\([0-9-]\{10\} [0-9:]\{5\}\) \([A-Z"]\)\( [a-z>]*>\)\{0,1\}\(.*\)\{0,1\}\( [^[:space:]]*\)$/'$esc'[1m'$esc'[30m\1'$esc'[0m '$esc'[31m\2'$esc'[36m\3'$esc'[0m\4'$esc'[0m '$esc'[34m\5'$esc'[0m/; s/"\(.*\)"/\1/' |
111 | fzf --reverse --ansi --header " Browsing History " --tiebreak=begin,index --tac --preview-window=bottom:1 --preview="echo -e '$legend'")
112 | exit
113 | elif [ "$1" != "url" ]; then
114 | if [ ! -d "$DIR/.cache" ] || [ ! -f "$DIR/.cache/$1-template.json" ]; then
115 | initialize $1
116 | fi
117 | fi
118 | show_search $1 "${@:2}"
119 |
--------------------------------------------------------------------------------
/preview.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const engine = process.argv[2];
3 | const entry = process.argv[3];
4 | const url = entry.match(/\bhttps?:\/\/\S+/gi)[0];
5 | const { color, readCache, stringToChunks } = require('./utils');
6 | const preview_location = '/tmp/_web_preview.png';
7 | const isPager = /\(pager\)$/.test(entry);
8 |
9 | function showShortcuts() {
10 | const k = (key, msg) => { console.log(color.black + color.bright + `\t${key}\t${msg}` + color.reset) };
11 |
12 | console.log('');
13 | k('F1', 'Open result in Graphene');
14 | k('F2', 'Hide/show preview window');
15 | k('Enter', isPager ? 'Fetch next page of results' : 'Open result in GUI browser');
16 | console.log('');
17 | }
18 |
19 | // convert an rgb() color string to an ANSI color escape
20 | function colorToAnsi(rgb, type) {
21 | let code = type === 'fg' ? '38' : '48';
22 | let colors = rgb.slice(4, -1).split(', ').map(n => parseInt(n));
23 | // return `\x1b${code};2;${colors[0]};${colors[1]};${colors[2]}m`;
24 |
25 | let mod = '';
26 | if (colors[0] > 130 || colors[1] > 130 || colors[2] > 130) {
27 | mod = color.bright;
28 | }
29 |
30 | // very crappy simulation of colors because FZF currently only does 8 colors
31 | if (colors[0] < 30 && colors[1] < 30 && colors[2] < 30) {
32 | // most websites are black text on white background, so ignore this setting for now
33 | // return mod + color.black;
34 | return mod + color.white;
35 | } else if (colors[0] > colors[1] * 2 && colors[0] > colors[2] * 2) {
36 | return mod + color.red;
37 | } else if (colors[1] > colors[0] * 2 && colors[1] > colors[2] * 2) {
38 | return mod + color.green;
39 | } else if (colors[2] > colors[1] * 2 && colors[2] > colors[0] * 2) {
40 | return mod + color.blue;
41 | } else if (colors[1] > colors[2] * 2) {
42 | return mod + color.yellow;
43 | } else if (colors[1] > colors[0] * 2) {
44 | return mod + color.cyan;
45 | } else if (colors[0] > colors[1] * 2) {
46 | return mod + color.magenta;
47 | }
48 |
49 | return mod + color.white;
50 | }
51 |
52 | // apply style to the element
53 | function style(element, style, tag) {
54 | let rendered = colorToAnsi(style.color, 'fg') + element + color.reset;
55 | // console.log(tag)
56 | if (/H\d/.test(tag)) {
57 | // heading
58 | return `\n${color.bright}${rendered}`;
59 | // } else if (!/^0px /.test(style.border)) {
60 | // // border
61 | // return `[${rendered}]`;
62 | // } else if (style.background !== 'rgba(0, 0, 0, 0)') {
63 | // // background
64 | // let bg = colorToAnsi(style.background, 'fg');
65 | // return `${bg}[${color.reset}${rendered}${bg}]${color.reset}`
66 | }
67 | return rendered;
68 | }
69 |
70 | // recursive helper for render()
71 | function _render(context) {
72 | let text = '';
73 |
74 | context.children.forEach(child => {
75 | let visible = true;
76 | if (typeof child === 'string') {
77 | if (child.trim()) {
78 | text += style(child, context.css, context.tag);
79 | } else {
80 | visible = false;
81 | }
82 | } else {
83 | text += _render(child);
84 | }
85 | if (visible && context.css.display !== 'inline' && text) {
86 | text += '\n';
87 | }
88 |
89 | })
90 |
91 | return text;
92 | }
93 |
94 | function render(data) {
95 | console.log(color.blue + color.underscore + url + color.reset);
96 | showShortcuts();
97 |
98 | if (data.error) {
99 | console.log(color.red + data.error + color.reset);
100 | return;
101 | }
102 |
103 | let output = _render(data.context);
104 |
105 | let border = 4;
106 | let maxLen = parseInt(process.env.COLUMNS * 0.4) - border; // 40% of the window
107 | let lines = output.split('\n');
108 | lines.forEach(l => {
109 | let chunks = stringToChunks(l.trim(), maxLen);
110 | chunks.forEach(c => {
111 | c && console.log(c);
112 | });
113 | });
114 | }
115 |
116 | let cache = readCache(engine, 'current');
117 | if (Object.keys(cache).length) {
118 | if (cache[url]) {
119 | render(cache[url]);
120 | } else {
121 | render({
122 | error: 'There was a problem fetching preview for this result.',
123 | })
124 | }
125 | } else {
126 | render({
127 | error: 'Preview file could not be loaded.',
128 | })
129 | }
130 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Graphene Text Browser
2 | ========================
3 | [Demo (asciinema)](https://asciinema.org/a/249148)
4 |
5 | A text-based browser / search aggregator. It allows you to search the web from the terminal in a style that suits the terminal. It does not try to
6 | emulate or replace a GUI browser, because the terminal was not meant for that, and the attempt just creates a miserable experience for the user.
7 | Once you find a page of interest, you can preview it as markdown (`[F1]`) or open it
8 | in your browser of choice (`[Enter]`).
9 |
10 | The main use case is to minimize context switching by starting your search in the terminal (the same place you're editing your code), and only jumping to the
11 | browser once you've found the result you want. This also cuts down on the distractions you'd encounter in a regular browser, and formats every
12 | page in a consistent way.
13 |
14 | This is still a work in progress, but it works decently well. The original inspiration for this project was the Ranger file manager, but after realizing that
15 | adapting it from local browsing to the web would require a significant rewrite, I put the project aside. Then, a couple of months later, I stumbled onto
16 | FZF and figured I'd give this tool another try, using FZF instead of Ranger to represent the results. And this is the result.
17 |
18 |
19 | Usage
20 | =====
21 | ```
22 | graphene [engine] [query]
23 | ```
24 | or
25 | ```
26 | graphene url [url]
27 | ```
28 |
29 | Result: an FZF prompt (e.g. from `graphene stackoverflow regex`) with the links from the page classified as follows:
30 | - golden/yellow: main group (probably the results you meant to search for)
31 | - cyan: navigational links that re-run the query instead of opening the page
32 | - green: categories defined for this engine; selecting one re-renders the category URL as a new set of search results
33 | - white: regular group; if the golden group is wrong, your results may be here (you can adjust weights to reclassify)
34 | - black/gray: most likely cruft (irrelevant/generic page links)
35 |
36 | Selecting a result will open it in your browser of choice, unless the result is a navigational (cyan) link, which re-triggers the search with a new
37 | offset. Pressing `F1` will instead load the result as a markdown version of the page inside Graphene (for text-based pages this works well and can often
38 | avoid an unnecessary hop to a browser).
39 |
40 | While previewing a page via `F1`, you can navigate/search it using search patterns or simple scrolling. For example, typing `#` will filter
41 | all page headings as search results, effectively creating a table of contents for the page. Similarly, typing `` ` `` will filter all code blocks
42 | (useful for jumping directly to an example on websites like MDN).
43 |
44 | Other options:
45 | ```
46 | graphene history # browse search/view history
47 | ```
48 |
49 | Installation
50 | ============
51 | Project currently uses the following dependencies:
52 |
53 | - FZF
54 | - node.js
55 | - puppeteer
56 |
57 | To install on OSX/Linux:
58 |
59 | ```
60 | brew/apt-get/dnf install fzf
61 | npm install puppeteer
62 | ```
63 |
66 | Add the project directory to `$PATH`, e.g. by adding this to your `.bash_profile`:
65 |
66 | ```
67 | export PATH="/path/to/graphene:$PATH"
68 | ```
69 |
70 | Roadmap
71 | =======
72 | I've built this mainly for myself, so the initial set of features is driven by my own use case and aesthetics. What I would like to add (when time allows):
73 |
74 | - Identification of categorizing components (tags for GitHub/npm, search subtypes for GitHub/Google/Amazon).
75 | - Authentication encryption
76 | - Ability to trigger a category/subtype search (e.g. search the issue list of a specific GitHub repo).
77 | - Use of `goodQuery` setting to improve initial calibration.
78 | - `graphene-dsl`: a simplified scripting language for customizing portions of webpage loading logic or performing actions such as complex authentication
79 |
80 | Configuration
81 | =============
82 | If you want to add a new engine that I haven't included, look at an example of an existing engine in the `engines` directory and customize it accordingly.
83 | The only required field is `query` (the URL used to formulate a search query). For best results, you should fill in as many parameters for the engine as possible.
84 | If your engine works well, feel free to contribute it back to this repository. Here is an explanation of each field:
85 |
86 | ```
87 | {
88 | "banner": "Banner you want displayed to the user performing the search",
89 | "query": "URL used by the engine as point of entry",
90 | "goodQuery": "Example of a good query that yields a lot of results (not yet used for calibration)",
91 | "badQuery": "Example of a bad query that yields few or no results",
92 | "pager": {
93 | "name": "Name to search for to identify navigational component (i.e. next/prev page of results)",
94 | "href": "Unique field in URL to search for that correlates to navigational offset (i.e. page=, start=, etc.)"
95 | },
96 | "weights": {
97 | "context": 2, // Amount of context per element.
98 | "coverage": 1, // Amount of space your elements seem to cover on screen (how spread out they are).
99 | "area": 1, // Area correlates with things like font size but may break if you stick a large image inside that's not a main group.
100 | "textLength": 1, // Text length is the combined length of all text inside the given group of links.
101 | "numElements": 0, // Number of elements in the group, higher weight means groups with more elements will be preferred.
102 | }
103 | }
104 | ```
105 |
106 | Queries are used to calibrate the caching mechanism. Pager info is optional (there is a well-functioning set of defaults) and is meant for websites where
107 | the defaults fail. Weights are numeric values (integer or floating point) used to calibrate the browser's determination of search result significance
108 | for a specific website. For a regular search engine like Google, results tend to have longer text and more contextual text summarizing each result.
109 | For an image-based search engine like Amazon, the area taken up by results may be more significant. If your engine is misclassifying the main group, play with
110 | the weights to adjust it.
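
To get a feel for how the weights interact, here is a rough sketch of the kind of weighted scoring the classifier performs. It is illustrative only: the metric names mirror the `weights` object in `utils.js`, but the sample groups and their pre-normalized values are made up, and the actual scoring in `search.js` may differ in its details.

```
// Illustrative only: combine per-group metrics into a single significance score.
// Metric names mirror the weights object in utils.js; the group values below are
// made-up, pre-normalized numbers, not what search.js actually measures.
const weights = { context: 2, coverage: 1, area: 1, textLength: 1, numElements: 0 };

function scoreGroup(group) {
  // weighted sum over the known metrics
  return Object.keys(weights).reduce(
    (score, metric) => score + weights[metric] * (group[metric] || 0),
    0
  );
}

// hypothetical link groups found on a results page
const groups = [
  { name: 'sidebar links',  context: 0.1, coverage: 0.2, area: 0.1, textLength: 0.1, numElements: 0.9 },
  { name: 'search results', context: 0.8, coverage: 0.7, area: 0.6, textLength: 0.9, numElements: 0.4 },
];

// the highest-scoring group is the one that gets painted golden (main group)
const main = groups.slice().sort((a, b) => scoreGroup(b) - scoreGroup(a))[0];
console.log(main.name); // "search results"
```

Bumping `area` relative to `textLength`, for example, would favor image-heavy result groups like Amazon's.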
111 |
112 | FAQ
113 | ===
114 |
115 | #### Will this work with any search engine/website?
116 | Probably not, but it has worked with more than I expected, and will continue to improve.
117 |
118 | #### Can this profile a page that's not a search engine?
119 | Yes it can: it falls back to defaults, which usually work well but may epic-fail on some websites. You can pass an exact URL instead of a query;
120 | instead of an engine, use the `url` keyword. This works with websites like Slashdot, but with Reddit it fails to find the pager (which loads dynamically on scroll).
121 | If you have ideas for how to handle this case or other improvements, feel free to contribute.
122 |
123 | #### Will this work if I point it to a specific news story or blog entry via the `url` keyword?
124 | Not yet, but almost. This is not meant to be a complete replacement for your regular browser. It's designed to process aggregate-based webpages, extracting key
125 | information for each link. It can extract arbitrary text from a webpage and render it as markdown, but for now you need to get there from an aggregate website first.
126 |
127 | #### How does it work? How does it know which groups are significant and which is the main one?
128 | It uses heuristics similar to what a human would use when scanning a page. Groups that take up more visual space on the page are deemed more important.
129 | Groups whose elements don't change at all between searches are deemed unimportant; groups whose names don't change but whose URLs do are navigational (they apply
130 | to the search in some way but aren't part of the results).
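
Concretely, the calibration run caches the link groups returned for the engine's `badQuery` as a template, and each real search is compared against it: a link that shows up with the same name and URL as on the bad-query page is treated as cruft, while a link whose name matches but whose URL differs looks navigational. Below is a simplified sketch of that comparison; the sample data is made up, and the real logic (which also handles dictionary cruft, `javascript:` links and pager detection) lives in `removeCruftAndClassify` in `search.js`.

```
// Simplified sketch: classify one link against the group cached from the bad-query run.
function classifyElement(element, cachedGroup) {
  const cached = cachedGroup.elements.find(c => c.name === element.name);
  if (!cached) return 'result';                      // not present on the bad-query page
  if (cached.href === element.href) return 'cruft';  // name and URL both static between searches
  return 'navigational';                             // same label, different URL (e.g. "Next")
}

// made-up template cached from the bad-query run
const cachedGroup = {
  elements: [
    { name: 'About', href: '/about' },
    { name: 'Next', href: '/search?q=dlskjadbhlads&page=2' },
  ],
};

console.log(classifyElement({ name: 'About', href: '/about' }, cachedGroup));                         // cruft
console.log(classifyElement({ name: 'Next', href: '/search?q=regex&page=2' }, cachedGroup));          // navigational
console.log(classifyElement({ name: 'How do I match a word boundary?', href: '/q/123' }, cachedGroup)); // result
```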
131 |
132 | #### Can this be made faster/smarter by specifying the exact class/id of the results group?
133 | That's typically what scrapers do, and why they're easy to break with minor changes to the search engine. This aggregator uses more generic heuristics
134 | and is therefore harder to fool. For example, Google runs some sort of uglifier on their frontend. This uglifier mangles class/id names. These names then
135 | stay consistent between searches (giving you the illusion of your selector working), but change every time Google redeploys their frontend (which happens
136 | several times per week). This aggregator doesn't care about changes like that; it analyzes link significance on the page the same way a human would. Moreover,
137 | even if the engine decides to change the page in a significant way, the aggregator should be able to adapt to it after clearing your old cache.
138 |
139 | #### Does this comply with terms of use for the websites being aggregated?
140 | Most websites should be fine with it (especially since I'm not explicitly blocking ads; they'd just get classified into one of the less relevant categories).
141 | I'm also not monetizing their results in any way, which is typically what triggers them to go after you. Some websites do indeed have very draconian
142 | (and probably unenforceable) policies; the worst they'll do is block puppeteer from being able to crawl their website or temporarily ban the abusing IP.
143 |
--------------------------------------------------------------------------------
/utils.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const path = require("path");
3 |
4 | // color scheme
5 | const color = {
6 | reset: '\x1b[0m',
7 | bright: '\x1b[1m',
8 | italic: '\x1b[3m',
9 | underscore: '\x1b[4m',
10 | black: '\x1b[30m',
11 | red: '\x1b[31m',
12 | green: '\x1b[32m',
13 | yellow: '\x1b[33m',
14 | blue: '\x1b[34m',
15 | magenta: '\x1b[35m',
16 | cyan: '\x1b[36m',
17 | white: '\x1b[37m',
18 | };
19 |
20 | // reads previously written cache
21 | function readCache(engine, type) {
22 | let json;
23 | try {
24 | json = require('./.cache/' + engine + '-' + type);
25 | } catch (e) {
26 | json = {};
27 | }
28 |
29 | return json;
30 | };
31 |
32 | // writes page as a cache file to disk
33 | function writeCache(engine, type, json) {
34 | fs.writeFileSync(path.resolve(__dirname, './.cache/' + engine + '-' + type + '.json'), JSON.stringify(json));
35 | };
36 |
37 | // scans config for placeholders for user to fill in, and requests them from user
38 | let input;
39 | function initInput() {
40 | input = require('readline').createInterface({
41 | input: process.stdin,
42 | output: process.stdout
43 | });
44 | // overwrite to hide password
45 | input._writeToOutput = function _writeToOutput(stringToWrite) {
46 | if (input.stdoutMuted)
47 | //input.output.write("\x1B[2K\x1B[200D"+input.query+"["+((input.line.length%2==1)?"=-":"-=")+"]");
48 | input.output.write("*");
49 | else
50 | input.output.write(stringToWrite);
51 | };
52 | }
53 | // helper function to get user input and avoid pyramid of doom
54 | function ask(question, test, hide=false) {
55 | return new Promise((resolve) => {
56 | if (test()) {
57 | input.stdoutMuted = hide;
58 | input.question(`${question}: `, (answer) => {
59 | resolve(answer);
60 | });
61 | } else {
62 | resolve(undefined);
63 | }
64 | });
65 | }
66 | async function requestUserFields(engine, settings) {
67 | let userConfig = readCache(engine, 'auth');
68 |
69 | if (!Object.keys(userConfig).length) {
70 | initInput();
71 | // website
72 | userConfig.url = await ask('This engine needs a host website', () => /{{URL}}/.test(settings.query));
73 | // userConfig.url && console.log('Website:', userConfig.url);
74 |
75 | if (settings.authentication) {
76 | const auth = settings.authentication;
77 | // username
78 | userConfig.username = await ask('Enter username', () => auth.username && /{{USERNAME}}/.test(auth.username));
79 | // userConfig.username && console.log('User Name:', userConfig.username);
80 |
81 | // password
82 | userConfig.password = await ask('Enter password', () => {
83 | return auth.password && /{{PASSWORD}}/.test(auth.password)
84 | }, true);
85 | // userConfig.password && console.log('Password:', ''.padEnd(userConfig.password.length, '*'));
86 | }
87 |
88 | let save = (await ask('Save for future use? [yN]', () => Object.keys(userConfig).length)) === 'y';
89 | if (save) {
90 | writeCache(engine, 'auth', userConfig);
91 | }
92 | input.close();
93 | }
94 | // console.log(userConfig)
95 | return userConfig;
96 | }
97 |
98 | // writes entry to history file
99 | // types: S (search), U (url), N (navigational), R (result), X (external)
100 | function writeHistory(url, type, params, initial=false) {
101 | let time = Date.now();
102 | let context = '';
103 | if (params) {
104 | if (type === 'S') {
105 | context = `${params.engine}> "${params.query}" `;
106 | } else if (type === 'R') {
107 | context = `graphene> "${params.title}" `;
108 | } else if (type === 'X') {
109 | context = `${params.engine}>open> "${params.result}" `;
110 | } else if (type === 'N') {
111 | context = `${params.engine}>nav> "${params.title}" `;
112 | } else if (type === 'U') {
113 | context = `url> "${params.title}" `;
114 | }
115 | }
116 | fs.appendFile(path.resolve(__dirname, './.cache/history'), `${initial ? '' : ' '}${time} ${type} ${context}${url}\n`, (err) => {
117 | if (err) {
118 | throw err;
119 | }
120 | });
121 | }
122 |
123 | // split a long string into shorter chunks
124 | function stringToChunks(str, size) {
125 | // const numChunks = Math.ceil(str.length / size);
126 | const chunks = [];
127 |
128 | let index = 0;
129 | while (index < str.length) {
130 | let line = str.substr(index, size);
131 | index += size;
132 | if (index < str.length - 1 && str[index-1] !== ' ' && str[index] !== ' ') {
133 | // we're mid-word, back up to the previous space unless the word is longer than the chunk size
134 | let offset = 0
135 | while (index && str[index-1] !== ' ') {
136 | index--;
137 | offset++;
138 | }
139 | if (offset < size) { line = line.substr(0, line.length-offset); } else { index += size; } // guard against looping forever on long words (e.g. URLs)
140 | }
141 | chunks.push(line);
142 | }
143 |
144 | return chunks;
145 | }
146 |
147 |
148 | // dictionary of common element names
149 | let dictionary = {
150 | // groups that are typically not actionable from terminal
151 | cruft: [
152 | // account
153 | 'sign in',
154 | 'log in',
155 | 'login',
156 | 'sign up',
157 | 'join',
158 | 'register',
159 |
160 | // menus
161 | 'about',
162 | 'blog',
163 | 'contact us',
164 | 'cookie policy',
165 | 'feedback',
166 | 'help',
167 | 'home',
168 | 'jobs',
169 | 'legal',
170 | 'privacy',
171 | 'privacy policy',
172 | 'return policy',
173 | 'security',
174 | 'settings',
175 | 'terms',
176 | 'terms of service',
177 | 'terms of use',
178 |
179 | // categorization
180 | 'questions',
181 | 'tags',
182 | 'users',
183 | 'votes',
184 |
185 | // media sharing
186 | 'facebook',
187 | 'linkedin',
188 | 'reddit',
189 | 'twitch',
190 | 'twitter',
191 | 'youtube',
192 | ],
193 | // navigation elements/groups
194 | navigation: {
195 | name: [
196 | '^1$',
197 | '^2$',
198 | '^3$',
199 | '^4$',
200 | '^5$',
201 | '^next\\b',
202 | '^prev\\b',
203 | '^previous\\b',
204 | '^back\\b',
205 | '^newer\\b',
206 | '^older\\b',
207 | ],
208 | href: [
209 | '\\bstart=\\d+\\b',
210 | '\\bpage=\\d+\\b',
211 | '\\bp=\\d+\\b',
212 | '\\bpstart=\\d+\\b',
213 | ]
214 | }
215 | };
216 |
217 | // weights to apply when evaluating significance of each group
218 | const weights = {
219 | context: 2, // amount of context per element
220 | coverage: 1, // amount of space your elements seem to cover on screen (how spread out they are)
221 | area: 1, // area correlates with things like font size but may break if you stick a large image inside that's not a main group
222 | textLength: 1, // text length is the combined length of all text inside the given group of links
223 | numElements: 0, // number of elements in the group
224 | }
225 |
226 | // minimum thresholds each group has to meet to be considered significant
227 | const thresholds = {
228 | coverage: 30000,
229 | numElements: 5,
230 | };
231 |
232 |
233 |
234 | // taken from https://github.com/alanwsmith/markdown_table_formatter
235 | // Not the prettiest code, but it gets the job done
236 | function MarkdownTableFormatter() {
237 |
238 | // Setup instance variables.
239 | this.cells = new Array();
240 | this.column_widths = new Array();
241 | this.output_table = "";
242 |
243 | }
244 |
245 | MarkdownTableFormatter.prototype.add_missing_cell_columns = function() {
246 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
247 | for (var col_i = 0, col_l = this.column_widths.length; col_i < col_l; col_i = col_i + 1) {
248 | if (typeof this.cells[row_i][col_i] === 'undefined') {
249 | this.cells[row_i][col_i] = '';
250 | }
251 | }
252 | }
253 | }
254 |
255 | MarkdownTableFormatter.prototype.format_table = function(table) {
256 |
257 | this.import_table(table);
258 | this.get_column_widths();
259 | this.add_missing_cell_columns();
260 | this.pad_cells_for_output();
261 |
262 | // Header
263 | this.output_table = "| ";
264 | this.output_table += this.cells[0].join(" | ");
265 | this.output_table += " |\n";
266 |
267 | // Separator
268 | this.output_table += "|-";
269 | this.output_table += this.cells[1].join("-|-");
270 | this.output_table += "-|\n";
271 |
272 |
273 | for (var row_i = 2, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
274 | this.output_table += "| ";
275 | this.output_table += this.cells[row_i].join(" | ");
276 | this.output_table += " |\n";
277 | }
278 |
279 | }
280 |
281 | MarkdownTableFormatter.prototype.get_column_widths = function() {
282 |
283 | this.column_widths = new Array();
284 |
285 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
286 | for (var col_i = 0, col_l = this.cells[row_i].length; col_i < col_l; col_i = col_i + 1) {
287 | if (typeof this.column_widths[col_i] === 'undefined') {
288 | this.column_widths[col_i] = this.cells[row_i][col_i].length;
289 | }
290 | else if (this.column_widths[col_i] < this.cells[row_i][col_i].length) {
291 | this.column_widths[col_i] = this.cells[row_i][col_i].length;
292 | }
293 | }
294 | }
295 | }
296 |
297 | MarkdownTableFormatter.prototype.import_table = function(table) {
298 |
299 | var table_rows = table.split("\n");
300 |
301 | // Remove leading empty lines
302 | while (table_rows[0].indexOf('|') == -1) {
303 | table_rows.shift();
304 | }
305 |
306 | for (var row_i = 0, row_l = table_rows.length; row_i < row_l; row_i = row_i + 1) {
307 |
308 | // TODO: Set up the indexes so that empty lines at either the top or bottom will
309 | // be removed. Right now, this is only helpful for empty lines at the bottom.
310 | if(table_rows[row_i].indexOf('|') == -1) {
311 | continue;
312 | }
313 |
314 | this.cells[row_i] = new Array();
315 |
316 | var row_columns = table_rows[row_i].split("\|");
317 |
318 | for (var col_i = 0, col_l = row_columns.length; col_i < col_l; col_i = col_i + 1) {
319 | this.cells[row_i][col_i] = row_columns[col_i]
320 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/^\s+/g,"");
321 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/\s+$/g,"");
322 |
323 | // If it's the separator row, parse down the dashes
324 | // Only do this if it matches to avoid adding a
325 | // dash in an empty column and messing with the column widths.
326 | if (row_i == 1) {
327 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/-+/g,"-");
328 | }
329 | }
330 | }
331 |
332 |
333 | // Remove leading and trailing rows if they are empty.
334 | this.get_column_widths();
335 |
336 | if (this.column_widths[0] == 0) {
337 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
338 | this.cells[row_i].shift();
339 | }
340 | }
341 |
342 | this.get_column_widths();
343 |
344 | // check to see if the last item in column widths is empty
345 | if (this.column_widths[ (this.column_widths.length - 1) ] == 0) {
346 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
347 | // Only remove the row if it is in the proper last slot.
348 | if (this.cells[row_i].length == this.column_widths.length) {
349 | this.cells[row_i].pop();
350 | }
351 | }
352 | }
353 |
354 | this.get_column_widths();
355 |
356 | }
357 |
358 | MarkdownTableFormatter.prototype.pad_cells_for_output = function() {
359 |
360 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) {
361 | for (var col_i = 0, col_l = this.cells[row_i].length; col_i < col_l; col_i = col_i + 1) {
362 |
363 | // Handle anything that's not the separator row
364 | if (row_i != 1) {
365 | while(this.cells[row_i][col_i].length < this.column_widths[col_i]) {
366 | this.cells[row_i][col_i] += " ";
367 | }
368 | }
369 | // Handle the separator row.
370 | else {
371 | while(this.cells[row_i][col_i].length < this.column_widths[col_i]) {
372 | this.cells[row_i][col_i] += "-";
373 | }
374 | }
375 | }
376 | }
377 | }
378 |
379 | module.exports = {
380 | color,
381 | requestUserFields,
382 | readCache,
383 | writeCache,
384 | writeHistory,
385 | dictionary,
386 | stringToChunks,
387 | weights,
388 | thresholds,
389 | MarkdownTableFormatter
390 | };
391 |
--------------------------------------------------------------------------------
/search.js:
--------------------------------------------------------------------------------
1 | const puppeteer = require('puppeteer');
2 | const { color, dictionary, requestUserFields, readCache, writeCache, writeHistory, weights, thresholds } = require('./utils');
3 |
4 | const engine = process.argv[2];
5 | const query = process.argv[3];
6 |
7 |
8 | // populate banner
9 | function banner() {
10 | if (engine !== 'url' && settings.banner) {
11 | let banner = settings.banner;
12 | let matches = banner.match(/{(.*?)}/g);
13 | matches.forEach((match, i) => {
14 | let escCode = '';
15 | let fields = match.slice(1,-1).split(',');
16 | fields.forEach(field => {
17 | let props = field.split('=');
18 | let colorModifier = props[0].trim() === 'bg' ? 10 : 0;
19 | if (props[1].trim().startsWith('color')) {
20 | escCode += '\x1b[' + (38 + colorModifier) + ';5;' + props[1].trim().slice(5) + 'm';
21 | } else {
22 | escCode += color[props[1].trim()];
23 | if (colorModifier) {
24 | escCode = escCode.replace('[3', '[4');
25 | }
26 | }
27 | });
28 | if (!i) escCode += ' '; // pad the beginning
29 | banner = banner.replace(match, escCode);
30 | });
31 | return `${banner} ${color.reset} ${color.bright}${query}${color.reset}`;
32 | } else {
33 | return `${color.red}${engine} ${color.reset} ${color.bright}${query}${color.reset}`;
34 | }
35 | }
36 |
37 | // output data
38 | function outputToTerminal(format, groups) {
39 | // return if we're just caching
40 | if (!groups) return;
41 |
42 | if (format === "json") {
43 | console.log(JSON.stringify(groups));
44 | } else if (format === 'shell') {
45 | var curated = banner() + '\n';
46 |
47 | groups.forEach(function (group, index) {
48 | let groupColor;
49 | if (group.groupType === MAIN) {
50 | groupColor = color.yellow;
51 | } else if (group.groupType === PAGER) {
52 | groupColor = color.bright + color.cyan;
53 | } else if (group.groupType === GENERIC) {
54 | groupColor = color.black + color.bright;
55 | } else if (group.groupType === OTHER) {
56 | groupColor = color.black + color.bright;
57 | } else if (group.groupType === CATEGORY) {
58 | groupColor = color.green;
59 | } else {
60 | groupColor = color.white;
61 | }
62 |
63 | group.elements.forEach(function (element) {
64 | if (!process.env.ONLY_MAIN || group.groupType === MAIN || group.groupType === PAGER) {
65 | curated +=
66 | groupColor + element.name.replace(/\n/g, ', ').padEnd(parseInt(120 * 2 / 3)) + color.reset + '\t' +
67 | color.blue + color.underscore + element.href + color.reset + (
68 | group.groupType === PAGER || group.groupType === CATEGORY ? '\t\t(pager)' : ''
69 | ) + '\n';
70 | }
71 | });
72 | });
73 | console.log(curated);
74 | } else {
75 | console.log('No format specified');
76 | }
77 | }
78 |
79 | // helper function for determining if paths are the same
80 | function isSamePath(a, b) {
81 | return a.path.every((element, index) => element === b.path[index]);
82 | }
83 |
84 | // finds a group with the same style in current results
85 | // chances are groups will be in the same order, but there may be missing/new
86 | // groups depending on what the search engine inserts into the page (ads, previews, maps, cards)
87 | function findGroupByStyle(currentResults, style) {
88 | for (var index = 0; index < currentResults.groups.length; index++) {
89 | let group = currentResults.groups[index];
90 | if (
91 | group.style.fontSize === style.fontSize &&
92 | group.style.fontFamily === style.fontFamily &&
93 | group.style.fontWeight === style.fontWeight &&
94 | group.style.color === style.color &&
95 | group.style.border === style.border &&
96 | group.style.visible === style.visible &&
97 | isSamePath(group.style, style)
98 | ) {
99 | return index;
100 | }
101 | }
102 | return -1;
103 | }
104 |
105 |
106 | // returns domain name from passed URL
107 | function domain(url) {
108 | let hostname;
109 | if (url.indexOf("//") > -1) {
110 | hostname = url.split('/')[2];
111 | } else {
112 | hostname = url.split('/')[0];
113 | }
114 |
115 | // find & remove port number
116 | hostname = hostname.split(':')[0];
117 | // find & remove "?"
118 | hostname = hostname.split('?')[0];
119 |
120 | return hostname;
121 | }
122 |
123 | // helper functions used by classifier
124 | const mostly = (g, group) => g.length / group.elements.length > 0.6;
125 | function isNavigation(element) {
126 | // if (element.name.slice(0, 2) === 'Old')
127 | let names = dictionary.navigation.name;
128 | let links = dictionary.navigation.href;
129 | let elementName = element.name.toLowerCase();
130 | let elementHref = element.href.toLowerCase();
131 | for (var nameIndex = 0; nameIndex < names.length; nameIndex++) {
132 | if (new RegExp(names[nameIndex], 'u').test(elementName)) {
133 | // name passes navigation check
134 | for (var hrefIndex = 0; hrefIndex < links.length; hrefIndex++) {
135 | if (new RegExp(links[hrefIndex], 'u').test(elementHref)) {
136 | return true;
137 | }
138 | }
139 | }
140 | }
141 | return false;
142 | }
143 |
144 | // constants for group types
145 | const MAIN = 0;
146 | const PAGER = 1;
147 | const CATEGORY = 2;
148 | const CATEGORY2 = 3;
149 | const DEFAULT = 4;
150 | const GENERIC = 5;
151 | const OTHER = 6;
152 |
153 | // removes any groups/elements that are static between pages, pages are cached
154 | function removeCruftAndClassify(currentResults) {
155 | let urlMap = {};
156 | if (process.env.CACHING) {
157 | writeCache(engine, 'template', currentResults);
158 | return;
159 | } else if (engine === 'url') {
160 | currentResults.groups.slice(0).forEach(group => {
161 | group.groupType = DEFAULT;
162 |
163 | let cruft = [];
164 | let jsLink = [];
165 | let generic = [];
166 | group.elements.forEach(element => {
167 | if (dictionary.cruft.includes(element.name.toLowerCase())) {
168 | cruft.push(element);
169 | } else if (isNavigation(element)) {
170 | group.groupType = PAGER;
171 | } else if (element.href.slice(0, 11) === "javascript:") {
172 | jsLink.push(element);
173 | }
174 |
175 | group.elements.forEach(e => {
176 | urlMap[e.href] = e;
177 | })
178 | });
179 |
180 | let currentIndex = currentResults.groups.indexOf(group);
181 | if (mostly(cruft, group)) {
182 | // a lot of generic elements
183 | currentResults.groups.splice(currentIndex, 1);
184 | } else if (mostly(jsLink, group)) {
185 | // a lot of elements that only execute JS, we can't do anything with them yet
186 | currentResults.groups.splice(currentIndex, 1);
187 | } else if (group.coverage < thresholds.coverage || group.elements.length < thresholds.numElements) {
188 | // group is too small to seem significant
189 | group.groupType = OTHER;
190 | }
191 | });
192 | } else {
193 | let cache = readCache(engine, 'template');
194 | // filter out results based on cache
195 | currentResults.groups.slice(0).forEach(group => {
196 | group.groupType = DEFAULT;
197 |
198 | let index = findGroupByStyle(cache, group.style);
199 | let cruft = [];
200 | let jsLink = [];
201 | let generic = [];
202 | let currentIndex = currentResults.groups.indexOf(group);
203 | if (index !== -1) {
204 | let cachedGroup = cache.groups[index];
205 | group.elements.forEach(element => {
206 | let found = cachedGroup.elements.find(currentElement => {
207 | return currentElement.name === element.name;
208 | });
209 | if (found) {
210 | if (found.href === element.href || !found.name) {
211 | // 100% cruft (url and name match)
212 | cruft.push(found);
213 | } else if (dictionary.cruft.includes(element.name.toLowerCase())) {
214 | cruft.push(found);
215 | } else if (settings.pager) {
216 | // generic navigational component that may be related to current search
217 | // (name matches, url does not)
218 | generic.push(found);
219 | if (found.name === settings.pager.name &&
220 | found.href.includes(settings.pager.href) &&
221 | domain(element.href) === domain(settings.query)
222 | ) {
223 | // this is a pager group
224 | group.groupType = PAGER;
225 | }
226 | }
227 | } else if (element.href.slice(0, 11) === "javascript:") {
228 | jsLink.push(element);
229 | }
230 | if (isNavigation(element)) {
231 | group.groupType = PAGER;
232 | }
233 | group.elements.forEach(e => {
234 | urlMap[e.href] = e;
235 | })
236 | });
237 |
238 | if (mostly(cruft, group)) {
239 | // a lot of generic elements
240 | currentResults.groups.splice(currentIndex, 1);
241 | } else if (!group.pagers && group.elements.length < 2) {
242 | // only 1 element in group
243 | currentResults.groups.splice(currentIndex, 1);
244 | } else if (mostly(generic, group) && group.groupType !== PAGER) {
245 | // group of generically-named components
246 | group.groupType = GENERIC;
247 | }
248 | } else {
249 | let categoryElements = [];
250 | let spliceOffset = 0;
251 | group.elements.slice(0).forEach((e, i) => {
252 | urlMap[e.href] = e;
253 | if (isNavigation(e)) {
254 | // this is needed for now since we're going off of the bad query, which may not yield
255 | // other pages; as we improve the caching logic, we can probably remove this
256 | group.groupType = PAGER;
257 | } else if (settings.categories && !(group.groupType === PAGER)) {
258 | Object.keys(settings.categories).forEach(category => {
259 | settings.categories[category].forEach(rule => {
260 | if (rule.find) {
261 | // a rule that recategorizes existing results
262 | if (rule.find.href && new RegExp(rule.find.href, 'u').test(e.href)) {
263 | if (rule.find.name && !(new RegExp(rule.find.name, 'u').test(e.name))) {
264 | return;
265 | }
266 | e.name = category + ': ' + e.name;
267 | categoryElements.push(e);
268 | group.elements.splice(i - spliceOffset, 1);
269 | spliceOffset++;
270 | }
271 | }
272 | });
273 | });
274 | } else if (e.href.slice(0, 11) === "javascript:") {
275 | jsLink.push(e);
276 | }
277 | });
278 | if (categoryElements.length) {
279 | // some elements were categorized
280 | if (!group.elements.length) {
281 | // entire group got categorized
282 | group.elements = categoryElements;
283 | group.groupType = CATEGORY;
284 | } else {
285 | // part of the group got categorized
286 | // TODO: technically group areas need to be recomputed and the groups re-sorted
287 | let categoryGroup = { ...group, groupType: CATEGORY, elements: categoryElements };
288 | currentResults.groups.splice(currentIndex, 0, categoryGroup);
289 | }
290 | }
291 | }
292 |
293 | // further classify the group
294 | if (mostly(jsLink, group)) {
295 | // a lot of elements that only execute JS, we can't do anything with them yet
296 | currentResults.groups.splice(currentIndex, 1);
297 | } else if (
298 | group.groupType !== PAGER && (
299 | group.coverage < (settings.minGroupSize ? settings.minGroupSize : thresholds.coverage) ||
300 | group.elements.length < thresholds.numElements
301 | )
302 | ) {
303 | // group is too small to seem significant
304 | //group.groupType = OTHER;
305 | }
306 | })
307 | }
308 | writeCache(engine, 'current', urlMap);
309 |
310 | // find main group
311 | let groupIndex = 0;
312 | while (groupIndex < currentResults.groups.length) {
313 | if (currentResults.groups[groupIndex].groupType === DEFAULT) {
314 | currentResults.groups[groupIndex].groupType = MAIN;
315 | break;
316 | }
317 | groupIndex++;
318 | }
319 |
320 | return currentResults.groups.sort((a, b) => a.groupType - b.groupType);
321 | }
322 |
323 | // load engine-specific settings
324 | let settings = {};
325 | if (engine !== 'url') {
326 | try {
327 | settings = require('./engines/' + engine);
328 | } catch (e) {
329 | if (/Cannot find module/.test(e)) {
330 | console.log('No configuration exists for ' + engine);
331 | } else {
332 | console.log(engine + '.json: ' + e);
333 | }
334 | process.exit(1);
335 | }
336 | }
337 |
338 | const isValidUrl = (string) => {
339 | try {
340 | new URL(string);
341 | return true;
342 | } catch (_) {
343 | return false;
344 | }
345 | }
346 |
347 | (async () => {
348 | const browser = await puppeteer.launch({
349 | args: [
350 | '--no-sandbox',
351 | '--disable-setuid-sandbox',
352 | '--disable-infobars',
353 | '--window-position=0,0',
354 | '--ignore-certificate-errors',
355 | '--ignore-certificate-errors-spki-list',
356 | '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"',
357 | '--disk-cache-dir=/tmp',
358 | ],
359 | ignoreHTTPSErrors: true,
360 | });
361 | const page = await browser.newPage();
362 | await page.setRequestInterception(true);
363 |
364 | // skip downloading images
365 | page.on('request', request => {
366 | if (request.resourceType() === 'image') {
367 | request.abort();
368 | } else {
369 | request.continue();
370 | }
371 | });
372 |
373 | // login, if relevant info is available
374 | if (settings.authentication && !process.env.CACHING) {
375 | const auth = settings.authentication;
376 | let config = readCache(engine, 'auth');
377 | let cookieData = readCache(engine, 'cookies');
378 | if (config.url) {
379 | auth.loginPage = auth.loginPage.replace('{{URL}}', config.url);
380 | settings.query = settings.query.replace('{{URL}}', config.url);
381 | }
382 |
383 | if (cookieData.cookies) {
384 | // we already have cookies, set them and continue
385 | // TODO: we need to test for expired cookies
386 | for (let cookie of cookieData.cookies) {
387 | await page.setCookie(cookie);
388 | }
389 | } else {
390 | // no cookies, perform login
391 | if (config.username) {
392 | auth.username = auth.username.replace('{{USERNAME}}', config.username);
393 | }
394 | if (config.password) {
395 | auth.password = auth.password.replace('{{PASSWORD}}', config.password);
396 | }
397 |
398 | if (auth.submitUsernameSelector) {
399 |                 // two-page authentication flow (e.g. Gmail, where username and password are entered on separate pages)
400 | await page.goto(auth.loginPage);
401 | await page.type(auth.usernameSelector, auth.username);
402 | await Promise.all([
403 | page.click(auth.submitUsernameSelector),
404 | page.waitForNavigation({ waitUntil: 'networkidle0' }),
405 | ]);
406 | await page.type(auth.passwordSelector, auth.password);
407 | await Promise.all([
408 | page.click(auth.submitPasswordSelector),
409 | page.waitForNavigation({ waitUntil: 'networkidle0' }),
410 | ]);
411 | // await page.screenshot({path: 'postlogin.png'});
412 | } else {
413 | // regular 1-page authentication
414 | await page.goto(auth.loginPage);
415 | // await page.screenshot({path: 'login.png'});
416 | await page.type(auth.usernameSelector, auth.username);
417 | await page.type(auth.passwordSelector, auth.password);
418 | await Promise.all([
419 | page.click(auth.submitSelector),
420 | page.waitForNavigation({ waitUntil: 'networkidle0' }),
421 | ]);
422 | // await page.screenshot({path: 'postlogin.png'});
423 | }
424 |
425 | // get cookies for future use
426 |             // for now we'll just reauthenticate each time; in the future we should test
427 |             // the cookies first and have a way to tell if we're already logged in
428 | const cookies = await page.cookies();
429 | writeCache(engine, 'cookies', { cookies: cookies });
430 | }
431 | }
432 |
433 | // page.on('console', msg => console.log('page log: ' + msg.text()));
434 |
435 | if (process.env.CACHING) {
436 | let config = await requestUserFields(engine, settings);
437 | settings.query = settings.query.replace('{{URL}}', config.url);
438 | // caching page structure
439 | await page.goto(settings.query + encodeURIComponent(settings.badQuery));
440 | } else if (engine === "url") {
441 | // go directly to this page (direct)
442 | let url = query;
443 | if (!isValidUrl(url)) {
444 | url = 'http://' + url;
445 | }
446 | await page.goto(url);
447 | let title = await page.title();
448 | writeHistory(url, 'U', { title: title }, true);
449 | } else if (isValidUrl(query) && domain(query) === domain(settings.query)) {
450 | // go directly to this page (navigational)
451 | await page.goto(query);
452 | let title = await page.title();
453 | writeHistory(query, 'N', { engine: engine, title: title });
454 | } else {
455 | // start a new search with query
456 | let modifier = settings.resultModifier ? settings.resultModifier + (process.env.RESULTS || settings.resultsPerPage || 20) : '';
457 | let searchQuery = settings.query + encodeURIComponent(query) + modifier;
458 | await page.goto(searchQuery);
459 | writeHistory(searchQuery, 'S', { engine: engine, query: query }, true);
460 | }
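    // at this point the page has been loaded in one of four modes: structure caching (badQuery),
    // direct URL ('U' history entry), navigational link within the engine's own domain ('N'),
    // or a fresh search ('S'); the DOM analysis below is the same in every case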
461 | // await page.screenshot({path: 'example.png'});
462 | let results = await page.evaluate((columns, weights, settings) => {
463 |
464 | /** LIST OF LOGIC TO BE USED */
465 |
466 |
467 | // test if DOM element is visible to end user
468 | function isVisible(elem) {
469 | if (!(elem instanceof Element)) throw Error('DomUtil: elem is not an element.');
470 | var style = getComputedStyle(elem);
471 | var rect = elem.getBoundingClientRect();
472 | if (style.display === 'none') return false;
473 | if (style.visibility !== 'visible') return false;
474 | if (parseFloat(style.opacity) < 0.1) return false;
475 | if (elem.offsetWidth + elem.offsetHeight + rect.height + rect.width === 0) {
476 | return false;
477 | }
478 | return true;
479 | }
480 |
481 | // squishes node into its CSS selector
482 | function extractCssSelector(node) {
483 | return node.tagName +
484 | (node.id ? '#' + node.id : '') +
485 | (node.className ? '.' + Array.prototype.join.call(node.classList, '.') : '');
486 | }
487 |
488 | // find DOM element ancestors
489 | function listParents(node) {
490 |             var nodes = []
491 |             for (; node && node.tagName; node = node.parentNode) {  // element ancestors only; document has no tagName
492 | nodes.unshift(extractCssSelector(node))
493 | }
494 | return nodes
495 | }
496 |
497 | // get visual style for a single DOM element
498 | function getStyle(element) {
499 | var style = window.getComputedStyle(element);
500 | var dimensions = element.getBoundingClientRect();
501 | return {
502 | fontSize: style.fontSize,
503 | fontFamily: style.fontFamily,
504 | fontWeight: style.fontWeight,
505 | color: style.color,
506 | background: style.backgroundColor,
507 | border: style.border,
508 | visible: isVisible(element),
509 | display: style.display,
510 | loc: {
511 | x: dimensions.left,
512 | y: dimensions.top,
513 | h: dimensions.height,
514 | w: dimensions.width
515 | }
516 | };
517 | }
518 |
519 | // extract important DOM element properties into serializable JSON object
520 | function extract(element) {
521 | return {
522 | tag: element.tagName,
523 | css: getStyle(element),
524 | href: element.href,
525 | name: element.innerText ? element.innerText.trim() : '',
526 | classes: [...element.classList],
527 | path: listParents(element),
528 | id: element.id
529 | };
530 | }
531 |
532 | // compute encompassing region given 2 child regions
533 | function combineRegion(region1, region2) {
534 | var minX = Math.min(region1.x, region2.x);
535 | var minY = Math.min(region1.y, region2.y);
536 | var maxX = Math.max(region1.x + region1.w, region2.x + region2.w);
537 | var maxY = Math.max(region1.y + region1.h, region2.y + region2.h);
538 |
539 | return {
540 | x: minX,
541 | y: minY,
542 | w: maxX - minX,
543 | h: maxY - minY
544 | };
545 | }
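        // e.g. combining {x:0, y:0, w:10, h:10} with {x:5, y:5, w:10, h:10}
        // yields the bounding box {x:0, y:0, w:15, h:15}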
546 |
547 | // helper function for expandSelection
548 | function isSameStyle(a, b) {
549 | a = getStyle(a);
550 | b = getStyle(b);
551 |
552 | if (
553 | a.fontSize === b.fontSize &&
554 | a.fontFamily === b.fontFamily &&
555 | a.fontWeight === b.fontWeight &&
556 | a.color === b.color &&
557 | a.border === b.border &&
558 | a.visible === b.visible
559 | ) {
560 | return true;
561 | }
562 | return false;
563 | }
564 |
565 |         // expands the selection upward from each link element through same-styled ancestors,
566 |         // stopping just before any two elements would merge into a shared ancestor (a basis for better preview)
567 | function expandSelection(elements) {
568 | let parents = [...elements].map(e => {
569 | let node = e._node;
570 | delete e._node;
571 | return node;
572 | });
573 | if (parents.length === 1) {
574 | // there won't be other elements to compare the context to, assume no context
575 | return parents;
576 | }
577 | let grandParents;
578 | while (true) {
579 | grandParents = [];
580 | for (var i=0; i < parents.length; i++) {
581 | let parent = parents[i].parentNode;
582 |                     if (!parent || parent === document) { // parentNode is never window; stop at the top of the tree
583 | return parents;
584 | }
585 | if (grandParents.length) {
586 | let prev = grandParents[grandParents.length-1];
587 | if (prev === parent) {
588 | // at least two elements joined, stop analyzing
589 | return parents;
590 | } else if (!isSameStyle(prev, parent)) {
591 | // styles don't match
592 | return parents;
593 | }
594 | }
595 | grandParents.push(parent);
596 | }
597 | parents = grandParents;
598 | }
599 | return parents;
600 | }
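        // illustrative (hypothetical) case: if every result link sits in its own wrapper such as
        // <div class="result">, the walk above returns those wrappers, letting the preview show
        // the surrounding title/snippet text rather than just the bare link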
601 |
602 | // fetches details from current selection suitable for rendering later
603 | function getRenderDetail(node) {
604 | let detail = extract(node);
605 | if (detail.css.visible) {
606 | return {
607 | ...detail,
608 | children: Array.prototype.map.call(node.childNodes, (node) => {
609 | if (node.nodeType === Node.TEXT_NODE) {
610 | return node.textContent;
611 | } else if (node.nodeType === Node.ELEMENT_NODE) {
612 | return getRenderDetail(node);
613 | } else {
614 | return '';
615 | }
616 | })
617 | }
618 | } else {
619 | return '';
620 | }
621 | }
622 |
623 |         // recursively compares two nodes (and their children) by render detail
624 | function isSameRenderDetail(a, b) {
625 |
626 | // there may be undefined nodes
627 | if (a === undefined) {
628 | if (b === undefined) {
629 | return true;
630 | } else {
631 | return false;
632 | }
633 | } else if (b === undefined) {
634 | return false;
635 | }
636 |
637 | // there may be text nodes
638 | if (a.nodeType === Node.TEXT_NODE) {
639 | if (b.nodeType === Node.TEXT_NODE) {
640 | return true;
641 | } else {
642 | return false;
643 | }
644 | } else if (b.nodeType === Node.TEXT_NODE) {
645 | return false;
646 | }
647 |
648 | let aSummary = getRenderDetail(a);
649 | let bSummary = getRenderDetail(b);
650 |
651 | // there may be invisible nodes
652 | if (aSummary === '') {
653 | if (bSummary === '') {
654 | return true;
655 | } else {
656 | return false;
657 | }
658 | } else if (bSummary === '') {
659 | return false;
660 | }
661 |
662 | if (
663 | aSummary.css.fontSize === bSummary.css.fontSize &&
664 | aSummary.css.fontFamily === bSummary.css.fontFamily &&
665 | aSummary.css.fontWeight === bSummary.css.fontWeight &&
666 | aSummary.css.color === bSummary.css.color &&
667 | aSummary.css.border === bSummary.css.border &&
668 | [...a.childNodes].every((child, i) => isSameRenderDetail(child, b.childNodes[i]))
669 | ) {
670 | return true;
671 | }
672 | return false;
673 | }
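        // in short: two nodes "render the same" when their font, color and border match and their
        // child subtrees compare equal recursively; any two text nodes match, and two invisible
        // nodes are treated as equal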
674 |
675 | // these parameters are used for normalization later
676 | let metrics = {
677 | 'max-area': 0,
678 | 'max-coverage': 0,
679 | 'max-textLength': 0,
680 | 'max-context': 0
681 | };
682 | // gather metrics
683 | const updateMax = (group, type) => {
684 | metrics['max-' + type] = Math.max(metrics['max-' + type], group[type]);
685 | }
686 |
687 | // group a list of DOM elements by visual style
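        // grouping criteria: two links join the same group when they share font and color, line up
        // along at least one edge (same left, top, right or bottom coordinate), and their DOM
        // subtrees render the same; "area" sums the individual element boxes while "coverage" is
        // the area of the group's combined bounding box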
688 | function groupByStyle(elements) {
689 | var groups = [];
690 | elements.forEach(function (e) {
691 |
692 | // if group already exists, find it and append to it
693 | for (var i = 0; i < groups.length; i++) {
694 | var style = groups[i].style;
695 | if (
696 | // group should have same color/font
697 | e.css.color === style.color &&
698 | e.css.fontFamily === style.fontFamily &&
699 | e.css.fontSize === style.fontSize &&
700 | e.css.fontWeight === style.fontWeight && (
701 | // group should resemble some sort of list/tile layout
702 | e.css.loc.x === style.loc.x ||
703 | e.css.loc.y === style.loc.y ||
704 | e.css.loc.x + e.css.loc.w === style.loc.x + style.loc.w ||
705 | e.css.loc.y + e.css.loc.h === style.loc.y + style.loc.h
706 | ) && isSameRenderDetail(e._node, groups[i].elements[0]._node)
707 | ) {
708 | groups[i].elements.push(e);
709 | groups[i].style.loc = combineRegion(
710 | groups[i].style.loc,
711 | e.css.loc
712 | );
713 | groups[i].coverage = groups[i].style.loc.w * groups[i].style.loc.h;
714 | groups[i].area += e.css.loc.w * e.css.loc.h;
715 |
716 | return;
717 | }
718 | }
719 |
720 | // group doesn't exist, start a new group
721 | groups.push({
722 | // deep-copy the structure, since we will edit size
723 | style: { ...e.css, loc: { ...e.css.loc}, path: e.path },
724 | elements: [e],
725 | area: e.css.loc.w * e.css.loc.h,
726 | coverage: e.css.loc.w * e.css.loc.h
727 | });
728 | });
729 |
730 | groups.forEach(group => {
731 | group.textLength = group.elements.reduce((a, v) => { return a + v.name.length }, 0);
732 |
733 | updateMax(group, 'area');
734 | updateMax(group, 'coverage');
735 | updateMax(group, 'textLength');
736 | });
737 |
738 | return groups;
739 | }
740 |
741 | // helper logic for normalizing significance params and applying weights
742 | const weigh = (group, field) => {
743 | let weight = weights[field];
744 | if (settings && settings.weights && settings.weights[field]) {
745 | weight = settings.weights[field];
746 | }
747 | return weight * group[field] / metrics['max-' + field];
748 | }
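        // worked example (hypothetical numbers): with weights.coverage = 2 and a largest group
        // covering 50,000 px², a group covering 25,000 px² contributes 2 * 25000 / 50000 = 1.0
        // to its significance; engine configs may override individual weights via settings.weights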
749 |
750 | // returns relative significance of the group based on a number of heuristics
751 | function significance(group) {
752 | return weigh(group, 'coverage') + weigh(group, 'area') + weigh(group, 'textLength') + weigh(group, 'context');
753 | }
754 |
755 |
756 |
757 | /** END LIST, BEGIN PROGRAM **/
758 |
759 |
760 |
761 | let elements = document.querySelectorAll('a');
762 | let relevant = [];
763 | for (var i = 0; i < elements.length; i++) {
764 | var e = extract(elements[i]);
765 | e._node = elements[i];
766 | if (e.css.visible) {
767 | relevant.push(e);
768 | }
769 | }
770 |
771 | // fill in extra context for better preview later
772 | let groups = groupByStyle(relevant);
773 | groups.forEach(group => {
774 | let parents = expandSelection(group.elements);
775 | group.context = 0;
776 | group.elements.forEach((element, index) => {
777 | element.context = getRenderDetail(parents[index]);
778 | group.context += parents[index].innerText.length;
779 | });
780 | updateMax(group, 'context');
781 | });
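        // sort by descending significance so the most prominent block of links comes first
        // (and is the first candidate to be classified as MAIN back in removeCruftAndClassify)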
782 | groups = groups.sort((a, b) => significance(a) < significance(b) ? 1 : -1);
783 |
784 | return {
785 | groups: groups
786 | };
787 | }, process.stdout.columns, weights, settings);
788 |
789 | outputToTerminal('shell', removeCruftAndClassify(results));
790 |
791 | await browser.close();
792 | })();
793 |
--------------------------------------------------------------------------------