├── .gitignore ├── screenshots └── stackoverflow.png ├── engines ├── amazon.json ├── mdn.json ├── ddg.json ├── google.json ├── github.json ├── jira.json ├── gmail.json ├── stackoverflow.json ├── npm.json └── reddit.json ├── open.sh ├── open_in_graphene.sh ├── scan_page.js ├── preview_full.js ├── graphene ├── preview.js ├── README.md ├── utils.js └── search.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .cache/ 3 | package-lock.json 4 | -------------------------------------------------------------------------------- /screenshots/stackoverflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atsepkov/Graphene/HEAD/screenshots/stackoverflow.png -------------------------------------------------------------------------------- /engines/amazon.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=white,fg=yellow}Amazon", 3 | "query": "https://www.amazon.com/s?k=", 4 | "goodQuery": "desk", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "Next →", 8 | "href": "page=" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /engines/mdn.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=white,fg=blue}MDN web docs", 3 | "query": "https://developer.mozilla.org/en-US/search?q=", 4 | "goodQuery": "drag and drop", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "Next", 8 | "href": "page=", 9 | "id": "search-result-next" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /engines/ddg.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=color166,fg=bright}DuckDuckGo", 3 | "query": "https://www.duckduckgo.com/?q=", 4 | "goodQuery": "food", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "Next", 8 | "href": "&start=" 9 | }, 10 | "resultsPerPage": 20, 11 | "resultModifier": "&num=", 12 | "minGroupSize": 30000 13 | } 14 | -------------------------------------------------------------------------------- /engines/google.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=white,fg=blue}G{fg=red}o{fg=yellow}o{fg=blue}g{fg=green}l{fg=red}e", 3 | "query": "https://www.google.com/search?q=", 4 | "goodQuery": "food", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "Next", 8 | "href": "&start=" 9 | }, 10 | "resultsPerPage": 20, 11 | "resultModifier": "&num=", 12 | "minGroupSize": 30000 13 | } 14 | -------------------------------------------------------------------------------- /engines/github.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=white,fg=black}GitHub", 3 | "query": "https://github.com/search?q=", 4 | "goodQuery": "react", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "Next", 8 | "href": "p=" 9 | }, 10 | "categories": { 11 | "language": [ 12 | { "find": { "href": "l=" } } 13 | ], 14 | "tag": [ 15 | { "find": { "href": "/topics/" } } 16 | ] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /open.sh: -------------------------------------------------------------------------------- 1 | engine=$1 2 | line=$2 3 | 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && 
pwd )" 5 | 6 | url=$(echo $line | sed 's#.*\(https*://\)#\1#') 7 | if [[ "$line" =~ \(pager\)$ ]]; then 8 | bash $DIR/graphene $engine $url 9 | else 10 | OPENCMD=open 11 | [[ $(which xdg-open) ]] && OPENCMD=xdg-open 12 | 13 | $OPENCMD $url 14 | result=$(echo $line | sed 's#\(.*\)https*://#\1#') 15 | node -e "require('$DIR/utils').writeHistory('$url', 'X', { engine: '$engine', result: '$result' })" 16 | fi 17 | -------------------------------------------------------------------------------- /engines/jira.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=white,fg=red}Jira", 3 | "query": "https://{{URL}}/jira/secure/RapidBoard.jspa?view=planning.nodetail&quickFilter=71424&rapidView=", 4 | "goodQuery": "", 5 | "badQuery": "dlskjadbhlads", 6 | "authentication": { 7 | "loginPage": "https://{{URL}}/jira/login.jsp", 8 | "usernameSelector": "#login-form-username", 9 | "passwordSelector": "#login-form-password", 10 | "submitSelector": "#login-form-submit", 11 | "username": "{{USERNAME}}", 12 | "password": "{{PASSWORD}}" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /engines/gmail.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=red,fg=white}gmail", 3 | "query": "https://mail.google.com/mail/u/0/#search/", 4 | "goodQuery": "", 5 | "badQuery": "dlskjadbhlads", 6 | "authentication": { 7 | "loginPage": "https://accounts.google.com/signin/v2/identifier", 8 | "usernameSelector": "#identifierId", 9 | "submitUsernameSelector": "#identifierNext", 10 | "passwordSelector": "input[name='password']", 11 | "submitPasswordSelector": "#passwordNext", 12 | "username": "{{USERNAME}}", 13 | "password": "{{PASSWORD}}" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /engines/stackoverflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=color166,fg=white}StackOverflow", 3 | "query": "https://stackoverflow.com/search?q=", 4 | "goodQuery": "regex", 5 | "badQuery": "dlskjadbhlads", 6 | "pager": { 7 | "name": "next", 8 | "href": "page=" 9 | }, 10 | "minSize": 30000, 11 | "resultsPerPage": 20, 12 | "resultModifier": "&pagesize=", 13 | "categories": { 14 | "user": [ 15 | { "find": { "href": "/users/", "name": ".+" } } 16 | ], 17 | "tag": [ 18 | { "find": { "href": "/tagged/", "name": ".+" } } 19 | ] 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /engines/npm.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=red,fg=white}NPM", 3 | "query": "https://www.npmjs.com/search?q=", 4 | "goodQuery": "react", 5 | "badQuery": "dlskjadbhlads", 6 | "weights": { 7 | "context": 3, 8 | "coverage": 1, 9 | "area": 1, 10 | "textLength": 1, 11 | "numElements": 0 12 | }, 13 | "pager": { 14 | "name": "»", 15 | "href": "page=" 16 | }, 17 | "categories": { 18 | "user": [ 19 | { "find": { "href": "/~" } } 20 | ], 21 | "tag": [ 22 | { "find": { "href": "q=keywords:" } } 23 | ] 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /engines/reddit.json: -------------------------------------------------------------------------------- 1 | { 2 | "banner": "{bg=red,fg=white}reddit", 3 | "query": "https://www.reddit.com/search/?q=", 4 | "goodQuery": "dating", 5 | "badQuery": "dlskjadbhlads", 6 | "authentication": 
{ 7 | "loginPage": "https://www.reddit.com/login/", 8 | "usernameSelector": "#loginUsername", 9 | "passwordSelector": "#loginPassword", 10 | "submitSelector": "button.AnimatedForm__submitButton", 11 | "username": "{{USERNAME}}", 12 | "password": "{{PASSWORD}}" 13 | }, 14 | "categories": { 15 | "subreddit": [ 16 | { "find": { "href": "/r/[^/]+/$" } } 17 | ], 18 | "user": [ 19 | { "find": { "href": "/user/" } } 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /open_in_graphene.sh: -------------------------------------------------------------------------------- 1 | engine=$1 2 | line=$2 3 | 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | url=$(echo $line | sed 's#.*\(https*://\)#\1#') 6 | 7 | # if [[ "$line" =~ \(pager\)$ ]]; then 8 | # bash $DIR/graphene $engine $url 9 | # else 10 | # open $url 11 | # fi 12 | 13 | # --bind "f1:execute(LINES=$LINES node '$DIR/preview_full.js' {} | less -r < /dev/tty > /dev/tty 2>&1)" \ 14 | show_result() { 15 | local url 16 | url="$1" 17 | # TODO: fix, f1 is currently broken, the idea is to use it as copy/save mode 18 | node $DIR/scan_page.js $engine "$url" | 19 | fzf --reverse --ansi --tiebreak=begin,index \ 20 | --bind "f1:execute(LINES=$LINES node '$DIR/preview_full.js' {} | nvim < /dev/tty > /dev/tty 2>&1)" \ 21 | --preview-window=right:80% --preview="node '$DIR/preview_full.js' {}" 22 | } 23 | 24 | show_result "$url" 25 | -------------------------------------------------------------------------------- /scan_page.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const { color, readCache, writeCache, writeHistory } = require('./utils'); 3 | 4 | const engine = process.argv[2]; 5 | const url = process.argv[3]; 6 | 7 | (async () => { 8 | const browser = await puppeteer.launch(); 9 | const page = await browser.newPage(); 10 | 11 | // load cookies, if they exist 12 | const cookieData = readCache(engine, 'cookies'); 13 | if (cookieData.cookies) { 14 | for (let cookie of cookieData.cookies) { 15 | await page.setCookie(cookie); 16 | } 17 | } 18 | 19 | await page.goto(url); 20 | const pageTitle = await page.title(); 21 | writeHistory(url, 'R', { title: pageTitle }); 22 | await page.addScriptTag({url: 'https://unpkg.com/turndown/dist/turndown.js'}); 23 | await page.addScriptTag({url: 'https://unpkg.com/turndown-plugin-gfm/dist/turndown-plugin-gfm.js'}); 24 | 25 | const content = await page.evaluate(() => { 26 | // return document.getElementsByTagName('body')[0].innerText; 27 | 28 | let turndownService = new TurndownService({ 29 | headingStyle: 'atx', 30 | codeBlockStyle: 'fenced', 31 | bulletListMarker: '-', 32 | // linkStyle: 'referenced', 33 | // linkReferenceStyle: 'collapsed' 34 | }); 35 | let gfm = turndownPluginGfm.gfm; 36 | turndownService.use(gfm); 37 | turndownService.remove('script'); 38 | turndownService.remove('style'); 39 | /*turndownService.addRule('url', { // force url to be one-line to avoid breaking later regex 40 | filter: ['a'], 41 | replacement: function (content, node) { 42 | let href = node.getAttribute('href'); 43 | let title = node.title ? 
' "' + node.title.replace('\n', ' ') + '"' : ''; 44 | return '[' + content + '](' + href + ')'; 45 | } 46 | });*/ 47 | let markdown = turndownService.turndown(document.getElementsByTagName('body')[0].innerHTML); 48 | return markdown; 49 | }); 50 | 51 | writeCache('_result_', 'preview', { 52 | url: url, 53 | content: content.split('\n') 54 | }); 55 | content.split('\n').forEach((line, index) => { 56 | console.log(`${color.bright}\x1b[38;5;237m${('' + index).padEnd(3)}${color.reset}\x1b[38;5;244m${line}${color.reset}`); 57 | }) 58 | 59 | await browser.close(); 60 | })(); 61 | -------------------------------------------------------------------------------- /preview_full.js: -------------------------------------------------------------------------------- 1 | const { color, readCache, stringToChunks, MarkdownTableFormatter } = require('./utils'); 2 | const line = process.argv[2]; 3 | const lineNumber = parseInt(line.split(' ')[0]); 4 | 5 | const content = readCache('_result_', 'preview') 6 | 7 | // Computes offset based on line number 8 | function computeOffset(n) { 9 | //let height = process.stdout.rows; 10 | let height = process.env.LINES; 11 | let preferredCursorPos = 1/3; 12 | return Math.max(n - parseInt(height * preferredCursorPos), 0); 13 | } 14 | 15 | // Add color to the line depending on markdown element 16 | const NORMAL = 0; 17 | const CODE = 1; 18 | const TABLE = 2; 19 | let border = 4; // chars 20 | let maxLen = parseInt(process.env.COLUMNS * 0.8) - border; // 80% of the window 21 | let mode = NORMAL; 22 | let buffer = []; 23 | function pretty(line, next, highlight) { 24 | 25 | if (line.slice(0, 3) === '```') { 26 | // codeblock toggle 27 | mode = mode === NORMAL ? CODE : NORMAL; 28 | line = color.green + line + color.reset; 29 | console.log(line); 30 | return; 31 | } else if (line[0] === '|') { 32 | // enter table mode 33 | mode = TABLE; 34 | } else if (mode === TABLE) { 35 | // exit table mode 36 | let table = new MarkdownTableFormatter(); 37 | try { 38 | table.format_table(buffer.join('\n')); 39 | console.log(table.output_table); 40 | } catch (e) { 41 | console.log(color.red + "[ Couldn't format table ]" + color.reset); 42 | } 43 | buffer = []; 44 | mode = NORMAL; 45 | } 46 | 47 | if (mode === NORMAL) { 48 | // urls 49 | line = line.replace(/\[(.*?)\]\(.*?\)/g, color.blue + '$1' + color.reset); 50 | 51 | if (line[0] === '#') { 52 | // header 53 | line = color.yellow + color.bright + line + color.reset; 54 | } 55 | 56 | // bold, italic, underline, code 57 | line = line.replace(/(?:^|\W)\*\*([^*]+)\*\*\s/g, ' ' + color.bright + '$1' + color.reset + ' '); 58 | line = line.replace(/(?:^|\W)\*([^*]+)\*\s/g, ' ' + color.italic + '$1' + color.reset + ' '); 59 | line = line.replace(/(?:^|\W)_([^_]+)_\s/g, ' ' + color.underscore + '$1' + color.reset + ' '); 60 | line = line.replace(/(?:^|\W)`([^`]+)`\s/g, ' ' + color.green + '$1' + color.reset + ' '); 61 | } else if (mode === TABLE) { 62 | buffer.push(line); 63 | return; 64 | } else { 65 | line = color.green + line + color.reset; 66 | } 67 | 68 | if (highlight) { 69 | line = color.bright + line + color.reset; 70 | } 71 | 72 | /*let chunks = stringToChunks(line, maxLen); 73 | if (chunks.length) { 74 | chunks.forEach(c => { 75 | console.log(c); 76 | }); 77 | } else { 78 | console.log(line); 79 | }*/ 80 | console.log(line) 81 | } 82 | 83 | // Create illusion of scrolled text based on selected line number 84 | let offset = computeOffset(lineNumber); 85 | console.log(color.blue + content.url + color.reset); 86 | console.log(lineNumber, 
process.env.LINES, process.stdout.rows, offset) 87 | content.content.forEach((line, index) => { 88 | let next = content.content.length > index + 1 ? content.content[index + 1] : ''; 89 | if (index > offset) { 90 | pretty(line, next, index === lineNumber); 91 | } 92 | }); 93 | -------------------------------------------------------------------------------- /graphene: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | display_usage() { 6 | echo -e "\nUsage: $0 [options] [engine] [query]" 7 | echo -e " Search for [query] using specified search engine." 8 | echo -e "\n $0 [options] url [url]" 9 | echo -e " Parse a specific website/url." 10 | echo -e "Options:\n" 11 | echo -e "\t-h, --help\t\tPrint this help message and exit.\n" 12 | echo -e "\t-r [number]\t\tNumber of results you want the search to return (not all engines will respect this).\n" 13 | echo -e "\t-o\t\t\tShow only the main result group, and omit all other groups.\n" 14 | } 15 | 16 | show_search() { 17 | local engine query controls selection 18 | engine=$1 19 | query="${@:2}" 20 | 21 | controls="f1:execute(bash $DIR/open_in_graphene.sh $engine {}),f2:toggle-preview" 22 | selection="$(node "$DIR/search.js" $engine "$query" | 23 | fzf --reverse --ansi --tiebreak=begin,index --bind "$controls" \ 24 | --preview-window=right:40% --header-lines=1 --preview="node '$DIR/preview.js' $engine {}")" 25 | if [ ! -z "$selection" ]; then 26 | bash "$DIR/open.sh" $engine "$selection" 27 | fi 28 | } 29 | 30 | # populates caches for all engines this script is aware of 31 | initialize() { 32 | local engine 33 | engine=$1 34 | 35 | echo "Rebuilding cache for $engine..." 36 | mkdir -p .cache 37 | export CACHING=1 38 | node "$DIR/search.js" $engine 39 | unset CACHING 40 | } 41 | 42 | # open history item 43 | open_history() { 44 | local timestamp urltype url 45 | date=$1 46 | time=$2 47 | urltype=$3 48 | url=$4 49 | 50 | case $urltype in 51 | U) # direct url 52 | show_search url $url 53 | ;; 54 | S) # search query 55 | show_search url $url 56 | ;; 57 | N) # navigational link 58 | show_search url $url 59 | ;; 60 | R) # result opened in graphene 61 | bash "$DIR/open_in_graphene.sh" $url 62 | ;; 63 | X) # externally opened result 64 | open $url 65 | ;; 66 | esac 67 | } 68 | 69 | if [ "$#" -lt 2 ]; then 70 | display_usage 71 | exit 1 72 | fi 73 | 74 | while getopts ":hr:o" opt; do 75 | case $opt in 76 | h|help) # help 77 | display_usage 78 | exit 0 79 | ;; 80 | r|results) # number of results to prefer 81 | export RESULTS=$OPTARG; shift 82 | ;; 83 | o|only) # number of results to prefer 84 | export ONLY_MAIN=1; shift 85 | ;; 86 | \?) 87 | echo "Invalid option: -$OPTARG" >&2 88 | exit 1 89 | ;; 90 | esac 91 | done 92 | 93 | if [ "$1" == "history" ]; then 94 | # figure out if this has GNU date, or fallback to OSX date if not 95 | date="date -d @" 96 | date -d @1550000000 &>/dev/null 97 | if [ $? 
-eq 1 ]; then 98 | date="date -r " 99 | fi 100 | 101 | legend="\x1b[1m\x1b[30mS: Search page\tU: Direct URL\tN: Navigational page\tR: Graphene-opened search result\tX: Externally-opened search result\x1b[0m" 102 | esc=$(echo -e "\x1b") # sed seems to have trouble generating this sequence 103 | open_history $(cat "$DIR/.cache/history" | awk -v dt="$date" '{ 104 | cmd = dt substr( $1, 1, length($1) - 3 ) " +\"%Y-%m-%d %H:%M\"" 105 | if ( (cmd | getline dd) > 0 ) { 106 | $1 = dd 107 | } 108 | close(cmd) 109 | print 110 | }' | sed 's/\([0-9-]\{10\} [0-9:]\{5\}\) \([A-Z"]\)\( [a-z>]*>\)\{0,1\}\(.*\)\{0,1\}\( [^[:space:]]*\)$/'$esc'[1m'$esc'[30m\1'$esc'[0m '$esc'[31m\2'$esc'[36m\3'$esc'[0m\4'$esc'[0m '$esc'[34m\5'$esc'[0m/; s/"\(.*\)"/\1/' | 111 | fzf --reverse --ansi --header " Browsing History " --tiebreak=begin,index --tac --preview-window=bottom:1 --preview="echo -e '$legend'") 112 | exit 113 | elif [ "$1" != "url" ]; then 114 | if [ ! -d "$DIR/.cache" ] || [ ! -f "$DIR/.cache/$1-template.json" ]; then 115 | initialize $1 116 | fi 117 | fi 118 | show_search $1 "${@:2}" 119 | -------------------------------------------------------------------------------- /preview.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const engine = process.argv[2]; 3 | const entry = process.argv[3]; 4 | const url = entry.match(/\bhttps?:\/\/\S+/gi)[0]; 5 | const { color, readCache, stringToChunks } = require('./utils'); 6 | const preview_location = '/tmp/_web_preview.png'; 7 | const isPager = /\(pager\)$/.test(entry); 8 | 9 | function showShortcuts() { 10 | const k = (key, msg) => { console.log(color.black + color.bright + `\t${key}\t${msg}` + color.reset) }; 11 | 12 | console.log(''); 13 | k('F1', 'Open result in Graphene'); 14 | k('F2', 'Hide/show preview window'); 15 | k('Enter', isPager ? 'Fetch next page of results' : 'Open result in GUI browser'); 16 | console.log(''); 17 | } 18 | 19 | // convert HEX code to ANSI 20 | function colorToAnsi(rgb, type) { 21 | let code = type === 'fg' ? 
'38' : '48'; 22 | let colors = rgb.slice(4, -1).split(', ').map(n => parseInt(n)); 23 | // return `\x1b${code};2;${colors[0]};${colors[1]};${colors[2]}m`; 24 | 25 | let mod = ''; 26 | if (colors[0] > 130 || colors[1] > 130 || colors[2] > 130) { 27 | mod = color.bright; 28 | } 29 | 30 | // very crappy simulation of colors because FZF currently only does 8 colors 31 | if (colors[0] < 30 && colors[1] < 30 && colors[2] < 30) { 32 | // most websites are black text on white background, so ignore this setting for now 33 | // return mod + color.black; 34 | return mod + color.white; 35 | } else if (colors[0] > colors[1] * 2 && colors[0] > colors[2] * 2) { 36 | return mod + color.red; 37 | } else if (colors[1] > colors[0] * 2 && colors[1] > colors[2] * 2) { 38 | return mod + color.green; 39 | } else if (colors[2] > colors[1] * 2 && colors[2] > colors[0] * 2) { 40 | return mod + color.blue; 41 | } else if (colors[1] > colors[2] * 2) { 42 | return mod + color.yellow; 43 | } else if (colors[1] > colors[0] * 2) { 44 | return mod + color.cyan; 45 | } else if (colors[0] > colors[1] * 2) { 46 | return mod + color.magenta; 47 | } 48 | 49 | return mod + color.white; 50 | } 51 | 52 | // apply style to the element 53 | function style(element, style, tag) { 54 | let rendered = colorToAnsi(style.color, 'fg') + element + color.reset; 55 | // console.log(tag) 56 | if (/H\d/.test(tag)) { 57 | // heading 58 | return `\n${color.bright}${rendered}`; 59 | // } else if (!/^0px /.test(style.border)) { 60 | // // border 61 | // return `[${rendered}]`; 62 | // } else if (style.background !== 'rgba(0, 0, 0, 0)') { 63 | // // background 64 | // let bg = colorToAnsi(style.background, 'fg'); 65 | // return `${bg}[${color.reset}${rendered}${bg}]${color.reset}` 66 | } 67 | return rendered; 68 | } 69 | 70 | // recursive helper for render() 71 | function _render(context) { 72 | let text = ''; 73 | 74 | context.children.forEach(child => { 75 | let visible = true; 76 | if (typeof child === 'string') { 77 | if (child.trim()) { 78 | text += style(child, context.css, context.tag); 79 | } else { 80 | visible = false; 81 | } 82 | } else { 83 | text += _render(child); 84 | } 85 | if (visible && context.css.display !== 'inline' && text) { 86 | text += '\n'; 87 | } 88 | 89 | }) 90 | 91 | return text; 92 | } 93 | 94 | function render(data) { 95 | console.log(color.blue + color.underscore + url + color.reset); 96 | showShortcuts(); 97 | 98 | if (data.error) { 99 | console.log(color.red + data.error + color.reset); 100 | return; 101 | } 102 | 103 | let output = _render(data.context); 104 | 105 | let border = 4; 106 | let maxLen = parseInt(process.env.COLUMNS * 0.4) - border; // 40% of the window 107 | let lines = output.split('\n'); 108 | lines.forEach(l => { 109 | let chunks = stringToChunks(l.trim(), maxLen); 110 | chunks.forEach(c => { 111 | c && console.log(c); 112 | }); 113 | }); 114 | } 115 | 116 | let cache = readCache(engine, 'current'); 117 | if (Object.keys(cache).length) { 118 | if (cache[url]) { 119 | render(cache[url]); 120 | } else { 121 | render({ 122 | error: 'There was a problem fetching preview for this result.', 123 | }) 124 | } 125 | } else { 126 | render({ 127 | error: 'Preview file could not be loaded.', 128 | }) 129 | } 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Graphene Text Browser 2 | ======================== 3 | [![Graphene 
Demo](https://asciinema.org/a/249148.svg)](https://asciinema.org/a/249148) 4 | 5 | A text-based browser / search aggregator. It allows you to search the web through the terminal in a style that makes the most 6 | sense in the terminal. It does not try to emulate a GUI browser in the terminal or replace it, because the terminal was not meant for that, and that 7 | just creates a miserable experience for the user. Once you find the page of interest, you can preview it as markdown (`[F1]`) or open it 8 | in your browser of choice (`[Enter]`). 9 | 10 | The main use case is to minimize context switching by starting your search in the terminal (the same place you're editing your code), and only jumping to the 11 | browser once you've found the result you want. This also cuts down on the distractions you'll likely encounter in a regular browser, and formats every 12 | page in a consistent way. 13 | 14 | This is still a work in progress, but it works decently well. The original inspiration for this project was the Ranger File Manager, but after realizing that 15 | adapting it from local browsing to the web would require a significant rewrite, I put the project aside. Then, a couple of months later, I stumbled onto 16 | FZF, and figured I'd give this tool a try again using FZF instead of Ranger as a way to represent results. And this is the result. 17 | 18 | 19 | Usage 20 | ===== 21 | ``` 22 | graphene [engine] [query] 23 | ``` 24 | or 25 | ``` 26 | graphene url [url] 27 | ``` 28 | 29 | Result: an FZF list of links from the page, classified as follows: 30 | - golden/yellow: main group (probably the results you meant to search for) 31 | - cyan: navigational links that cause the query to be re-run instead of opening the page 32 | - green: categories defined for this engine; selecting these renders the category URL as a new set of search results 33 | - white: regular group; if the golden group is wrong, your results may be here (you can adjust weights to reclassify) 34 | - black/gray: most likely cruft (irrelevant/generic page links) 35 | 36 | Selecting a result will open it in your browser of choice, unless the result is a navigational (cyan) link, which re-triggers the search with a new 37 | offset. Pressing `F1` will instead load the result as a markdown version of the page in Graphene (for text-based pages this works well and can often 38 | avoid an unnecessary hop to the browser). 39 | 40 | While previewing the page via `F1`, you can navigate/search the page using search patterns or simple scrolling. For example, typing `#` will filter 41 | all page headings as search results, effectively creating a table of contents for the page. Similarly, typing `` ` `` will filter all code blocks 42 | (useful for navigating directly to an example on websites like MDN). 43 | 44 | Other options: 45 | ``` 46 | graphene history # browse search/view history 47 | ``` 48 | 49 | Installation 50 | ============ 51 | The project currently uses the following dependencies: 52 | 53 | - FZF 54 | - node.js 55 | - puppeteer 56 | 57 | To install on OSX/Linux: 58 | 59 | ``` 60 | brew/apt-get/dnf install fzf 61 | npm install puppeteer 62 | ``` 63 | 64 | Add the project directory to `$PATH`, e.g. by adding this to your `.bash_profile`: 65 | 66 | ``` 67 | export PATH="/path/to/graphene:$PATH" 68 | ``` 69 | 70 | Roadmap 71 | ======= 72 | I've built this mainly for myself, so the initial set of features is driven by my own use case and aesthetics. 
What I would like to add (when time allows): 73 | 74 | - Identification of categorizing components (tags (github, npm), search subtypes (github, google, amazon)). 75 | - Authentication encryption 76 | - Ability to trigger a category/subtype search (i.e. search the issue list of a specific github repo). 77 | - Use of the `goodQuery` setting to improve initial calibration. 78 | - `graphene-dsl`: a simplified scripting language for customizing portions of the webpage-loading logic or performing actions such as complex authentication 79 | 80 | Configuration 81 | ============= 82 | If you want to add a new engine that I haven't included, look at an example of an existing engine in the `engines` directory and customize it accordingly. 83 | The only required field is `query` (the URL used to formulate a search query). For best results, you should fill in as many parameters for the engine as possible. 84 | If your engine works well, feel free to contribute it back to this repository. Here is an explanation of each field: 85 | 86 | ``` 87 | { 88 | "banner": "Banner you want displayed to the user performing the search", 89 | "query": "URL used by the engine as point of entry", 90 | "goodQuery": "Example of a good query that yields a lot of results (not yet used for calibration)", 91 | "badQuery": "Example of a bad query that yields few or no results", 92 | "pager": { 93 | "name": "Name to search for to identify navigational component (i.e. next/prev page of results)", 94 | "href": "Unique field in URL to search for that correlates to navigational offset (i.e. page=, start=, etc.)" 95 | }, 96 | "weights": { 97 | "context": 2, // Amount of context per element. 98 | "coverage": 1, // Amount of space your elements seem to cover on screen (how spread out they are). 99 | "area": 1, // Area correlates with things like font size but may break if you stick a large image inside that's not a main group. 100 | "textLength": 1, // Text length is the combined length of all text inside the given group of links. 101 | "numElements": 0, // Number of elements in the group, higher weight means groups with more elements will be preferred. 102 | } 103 | } 104 | ``` 105 | 106 | Queries are used to calibrate the caching mechanism. Pager info is optional (there is a well-functioning set of defaults) and is meant for websites where the 107 | defaults don't work. Weights are numeric values (integer or floating point) used to calibrate the browser's determination of search result significance 108 | for a specific website. For a regular search engine like Google, results tend to have longer text and more contextual text summarizing each result. 109 | For an image-based search engine like Amazon, the area taken up by results may be more significant. If your engine is misclassifying the main group, play with 110 | the weights to adjust it. 111 | 112 | FAQ 113 | === 114 | 115 | #### Will this work with any search engine/website? 116 | Probably not, but it has worked with more than I expected, and it will continue to improve. 117 | 118 | #### Can this profile a page that's not a search engine? 119 | Yes it can, and it falls back to defaults, which usually work well but may epic-fail on some websites. You can pass an exact URL to open instead of a query. 120 | Instead of an engine, use the `url` keyword. This seems to work with websites like Slashdot; with Reddit it fails to find the pager (which loads dynamically on scroll). 121 | If you have ideas for how to handle this case or other improvements, feel free to contribute. 
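For example, to point Graphene at an aggregate page directly (Slashdot is one of the sites mentioned above; the exact URL is only illustrative, and the scheme is added automatically if you omit it):

```
graphene url slashdot.org
```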
122 | 123 | #### Will this work if I point it to a specific news story or blog entry via the `url` keyword? 124 | Not yet, but almost. This is not meant to be a complete replacement for your regular browser. It's designed to process aggregate-based webpages and extract key 125 | information for each link. It can extract arbitrary text from a webpage and render it as markdown, but for now you need to get there from an aggregate website first. 126 | 127 | #### How does it work? How does it know which groups are significant and which is the main one? 128 | It uses heuristics similar to what a human would use when navigating to a page. Groups that take up more visual space on the page are deemed more important. 129 | Groups whose elements don't change at all between searches are deemed unimportant; groups whose names don't change but URLs do are navigational (they apply 130 | to the search in some way but aren't part of the results). A rough sketch of how these signals can be combined is shown at the end of this README. 131 | 132 | #### Can this be made faster/smarter by specifying the exact class/id of the results group? 133 | That's typically what scrapers do, and why they're easy to break with minor changes to the search engine. This aggregator uses more generic heuristics 134 | and is therefore harder to fool. For example, Google runs some sort of uglifier on their frontend. This uglifier mangles class/id names. These names then 135 | stay consistent between searches (giving you the illusion of your selector working), but change every time Google redeploys their frontend (which happens 136 | several times per week). This aggregator doesn't care about changes like that; it analyzes link significance on the page the same way a human would. Moreover, 137 | even if the engine decides to change the page in a significant way, the aggregator should be able to adapt to it after clearing your old cache. 138 | 139 | #### Does this comply with the terms of use of the websites being aggregated? 140 | Most websites should be fine with it (especially since I'm not explicitly blocking ads - they'd just get classified into one of the less relevant categories). 141 | I'm also not monetizing their results in any way, which is typically what triggers them to go after you. Some websites do indeed have very draconian 142 | (and probably unenforceable) policies; the worst thing they'll do is block puppeteer from being able to crawl their website or temporarily ban the offending IP. 
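To make the weighting heuristic described above a bit more concrete, here is a minimal, hypothetical JavaScript sketch of how per-group metrics could be combined into a significance score. The weight and threshold values mirror the defaults in `utils.js`, but the actual group classification in `search.js` is more involved, so treat this purely as an illustration:

```
// Hypothetical illustration only -- not the exact logic used by search.js.
// Each group of links gets a score that is a weighted sum of its metrics;
// groups that fail the minimum thresholds are dropped before picking the winner.
const weights = { context: 2, coverage: 1, area: 1, textLength: 1, numElements: 0 };
const thresholds = { coverage: 30000, numElements: 5 };

// A group is assumed to look like { context, coverage, area, textLength, numElements }.
function scoreGroup(group) {
  return Object.keys(weights).reduce(
    (sum, metric) => sum + weights[metric] * (group[metric] || 0),
    0
  );
}

function pickMainGroup(groups) {
  return groups
    .filter(g => g.coverage >= thresholds.coverage && g.numElements >= thresholds.numElements)
    .sort((a, b) => scoreGroup(b) - scoreGroup(a))[0];
}

// Usage with made-up numbers: the first group wins because it covers far more of the page.
console.log(pickMainGroup([
  { context: 120, coverage: 90000, area: 250000, textLength: 4000, numElements: 10 },
  { context: 10, coverage: 35000, area: 40000, textLength: 300, numElements: 8 },
]));
```

Raising a weight (say, `numElements`) shifts which group wins, which is exactly the knob the `weights` section of an engine config exposes.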
143 | -------------------------------------------------------------------------------- /utils.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require("path"); 3 | 4 | // color scheme 5 | const color = { 6 | reset: '\x1b[0m', 7 | bright: '\x1b[1m', 8 | italic: '\x1b[3m', 9 | underscore: '\x1b[4m', 10 | black: '\x1b[30m', 11 | red: '\x1b[31m', 12 | green: '\x1b[32m', 13 | yellow: '\x1b[33m', 14 | blue: '\x1b[34m', 15 | magenta: '\x1b[35m', 16 | cyan: '\x1b[36m', 17 | white: '\x1b[37m', 18 | }; 19 | 20 | // reads previously written cache 21 | function readCache(engine, type) { 22 | let json; 23 | try { 24 | json = require('./.cache/' + engine + '-' + type); 25 | } catch (e) { 26 | json = {}; 27 | } 28 | 29 | return json; 30 | }; 31 | 32 | // writes page as a cache file to disk 33 | function writeCache(engine, type, json) { 34 | fs.writeFileSync(path.resolve(__dirname, './.cache/' + engine + '-' + type + '.json'), JSON.stringify(json)); 35 | }; 36 | 37 | // scans config for placeholders for user to fill in, and requests them from user 38 | let input; 39 | function initInput() { 40 | input = require('readline').createInterface({ 41 | input: process.stdin, 42 | output: process.stdout 43 | }); 44 | // overwrite to hide password 45 | input._writeToOutput = function _writeToOutput(stringToWrite) { 46 | if (input.stdoutMuted) 47 | //input.output.write("\x1B[2K\x1B[200D"+input.query+"["+((input.line.length%2==1)?"=-":"-=")+"]"); 48 | input.output.write("*"); 49 | else 50 | input.output.write(stringToWrite); 51 | }; 52 | } 53 | // helper function to get user input and avoid pyramid of doom 54 | function ask(question, test, hide=false) { 55 | return new Promise((resolve) => { 56 | if (test()) { 57 | input.stdoutMuted = hide; 58 | input.question(`${question}: `, (answer) => { 59 | resolve(answer); 60 | }); 61 | } else { 62 | resolve(undefined); 63 | } 64 | }); 65 | } 66 | async function requestUserFields(engine, settings) { 67 | let userConfig = readCache(engine, 'auth'); 68 | 69 | if (!Object.keys(userConfig).length) { 70 | initInput(); 71 | // website 72 | userConfig.url = await ask('This engine needs a host website', () => /{{URL}}/.test(settings.query)); 73 | // userConfig.url && console.log('Website:', userConfig.url); 74 | 75 | if (settings.authentication) { 76 | const auth = settings.authentication; 77 | // username 78 | userConfig.username = await ask('Enter username', () => auth.username && /{{USERNAME}}/.test(auth.username)); 79 | // userConfig.username && console.log('User Name:', userConfig.username); 80 | 81 | // password 82 | userConfig.password = await ask('Enter password', () => { 83 | return auth.password && /{{PASSWORD}}/.test(auth.password) 84 | }, true); 85 | // userConfig.password && console.log('Password:', ''.padEnd(userConfig.password.length, '*')); 86 | } 87 | 88 | let save = (await ask('Save for future use? 
[yN]', () => Object.keys(userConfig).length)) === 'y'; 89 | if (save) { 90 | writeCache(engine, 'auth', userConfig); 91 | } 92 | input.close(); 93 | } 94 | // console.log(userConfig) 95 | return userConfig; 96 | } 97 | 98 | // writes entry to history file 99 | // types: S (search), U (url), N (navigational), R (result), X (external) 100 | function writeHistory(url, type, params, initial=false) { 101 | let time = Date.now(); 102 | let context = ''; 103 | if (params) { 104 | if (type === 'S') { 105 | context = `${params.engine}> "${params.query}" `; 106 | } else if (type === 'R') { 107 | context = `graphene> "${params.title}" `; 108 | } else if (type === 'X') { 109 | context = `${params.engine}>open> "${params.result}" `; 110 | } else if (type === 'N') { 111 | context = `${params.engine}>nav> "${params.title}" `; 112 | } else if (type === 'U') { 113 | context = `url> "${params.title}" `; 114 | } 115 | } 116 | fs.appendFile(path.resolve(__dirname, './.cache/history'), `${initial ? '' : ' '}${time} ${type} ${context}${url}\n`, (err) => { 117 | if (err) { 118 | throw err; 119 | } 120 | }); 121 | } 122 | 123 | // split a long string into shorter chunks 124 | function stringToChunks(str, size) { 125 | // const numChunks = Math.ceil(str.length / size); 126 | const chunks = []; 127 | 128 | let index = 0; 129 | while (str.length && index < str.length - 1) { 130 | let line = str.substr(index, size); 131 | index += size; 132 | if (index < str.length - 1 && str[index-1] !== ' ' && str[index] !== ' ') { 133 | // we're mid-word 134 | let offset = 0 135 | while (index && str[index-1] !== ' ') { 136 | index--; 137 | offset++; 138 | } 139 | line = line.substr(0, line.length-offset); 140 | } 141 | chunks.push(line); 142 | } 143 | 144 | return chunks; 145 | } 146 | 147 | 148 | // dictionary of common element names 149 | let dictionary = { 150 | // groups that are typically not actionable from terminal 151 | cruft: [ 152 | // account 153 | 'sign in', 154 | 'log in', 155 | 'login', 156 | 'sign up', 157 | 'join', 158 | 'register', 159 | 160 | // menus 161 | 'about', 162 | 'blog', 163 | 'contact us', 164 | 'cookie policy', 165 | 'feedback', 166 | 'help', 167 | 'home', 168 | 'jobs', 169 | 'legal', 170 | 'privacy', 171 | 'privacy policy', 172 | 'return policy', 173 | 'security', 174 | 'settings', 175 | 'terms', 176 | 'terms of service', 177 | 'terms of use', 178 | 179 | // categorization 180 | 'questions', 181 | 'tags', 182 | 'users', 183 | 'votes', 184 | 185 | // media sharing 186 | 'facebook', 187 | 'linkedin', 188 | 'reddit', 189 | 'twitch', 190 | 'twitter', 191 | 'youtube', 192 | ], 193 | // navigation elements/groups 194 | navigation: { 195 | name: [ 196 | '^1$', 197 | '^2$', 198 | '^3$', 199 | '^4$', 200 | '^5$', 201 | '^next\\b', 202 | '^prev\\b', 203 | '^previous\\b', 204 | '^back\\b', 205 | '^newer\\b', 206 | '^older\\b', 207 | ], 208 | href: [ 209 | '\\bstart=\\d+\\b', 210 | '\\bpage=\\d+\\b', 211 | '\\bp=\\d+\\b', 212 | '\\bpstart=\\d+\\b', 213 | ] 214 | } 215 | }; 216 | 217 | // weights to apply when evaluating significance of each group 218 | const weights = { 219 | context: 2, // amount of context per element 220 | coverage: 1, // amount of space your elements seem to cover on screen (how spread out they are) 221 | area: 1, // area correlates with things like font size but may break if you stick a large image inside that's not a main group 222 | textLength: 1, // text length is the combined length of all text inside the given group of links 223 | numElements: 0, // number of elements in the group 224 
| } 225 | 226 | // minimum thresholds each group has to meet to be considered significant 227 | const thresholds = { 228 | coverage: 30000, 229 | numElements: 5, 230 | }; 231 | 232 | 233 | 234 | // taken from https://github.com/alanwsmith/markdown_table_formatter 235 | // Not the prettiest code, but it gets the job done 236 | function MarkdownTableFormatter() { 237 | 238 | // Setup instance variables. 239 | this.cells = new Array(); 240 | this.column_widths = new Array(); 241 | this.output_table = ""; 242 | 243 | } 244 | 245 | MarkdownTableFormatter.prototype.add_missing_cell_columns = function() { 246 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 247 | for (var col_i = 0, col_l = this.column_widths.length; col_i < col_l; col_i = col_i + 1) { 248 | if (typeof this.cells[row_i][col_i] === 'undefined') { 249 | this.cells[row_i][col_i] = ''; 250 | } 251 | } 252 | } 253 | } 254 | 255 | MarkdownTableFormatter.prototype.format_table = function(table) { 256 | 257 | this.import_table(table); 258 | this.get_column_widths(); 259 | this.add_missing_cell_columns(); 260 | this.pad_cells_for_output(); 261 | 262 | // Header 263 | this.output_table = "| "; 264 | this.output_table += this.cells[0].join(" | "); 265 | this.output_table += " |\n"; 266 | 267 | // Separator 268 | this.output_table += "|-"; 269 | this.output_table += this.cells[1].join("-|-"); 270 | this.output_table += "-|\n"; 271 | 272 | 273 | for (var row_i = 2, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 274 | this.output_table += "| "; 275 | this.output_table += this.cells[row_i].join(" | "); 276 | this.output_table += " |\n"; 277 | } 278 | 279 | } 280 | 281 | MarkdownTableFormatter.prototype.get_column_widths = function() { 282 | 283 | this.column_widths = new Array(); 284 | 285 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 286 | for (var col_i = 0, col_l = this.cells[row_i].length; col_i < col_l; col_i = col_i + 1) { 287 | if (typeof this.column_widths[col_i] === 'undefined') { 288 | this.column_widths[col_i] = this.cells[row_i][col_i].length; 289 | } 290 | else if (this.column_widths[col_i] < this.cells[row_i][col_i].length) { 291 | this.column_widths[col_i] = this.cells[row_i][col_i].length; 292 | } 293 | } 294 | } 295 | } 296 | 297 | MarkdownTableFormatter.prototype.import_table = function(table) { 298 | 299 | var table_rows = table.split("\n"); 300 | 301 | // Remove leading empty lines 302 | while (table_rows[0].indexOf('|') == -1) { 303 | table_rows.shift(); 304 | } 305 | 306 | for (var row_i = 0, row_l = table_rows.length; row_i < row_l; row_i = row_i + 1) { 307 | 308 | // TODO: Set up the indexes so that empty lines at either the top or bottom will 309 | // be removed. Right now, this is only helpful for empty lines at the bottom. 310 | if(table_rows[row_i].indexOf('|') == -1) { 311 | continue; 312 | } 313 | 314 | this.cells[row_i] = new Array(); 315 | 316 | var row_columns = table_rows[row_i].split("\|"); 317 | 318 | for (var col_i = 0, col_l = row_columns.length; col_i < col_l; col_i = col_i + 1) { 319 | this.cells[row_i][col_i] = row_columns[col_i] 320 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/^\s+/g,""); 321 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/\s+$/g,""); 322 | 323 | // If it's the separator row, parse down the dashes 324 | // Only do this if it matches to avoid adding a 325 | // dash in an empty column and messing with the column widths. 
326 | if (row_i == 1) { 327 | this.cells[row_i][col_i] = this.cells[row_i][col_i].replace(/-+/g,"-"); 328 | } 329 | } 330 | } 331 | 332 | 333 | // Remove leading and trailing rows if they are empty. 334 | this.get_column_widths(); 335 | 336 | if (this.column_widths[0] == 0) { 337 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 338 | this.cells[row_i].shift(); 339 | } 340 | } 341 | 342 | this.get_column_widths(); 343 | 344 | // check to see if the last item in column widths is empty 345 | if (this.column_widths[ (this.column_widths.length - 1) ] == 0) { 346 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 347 | // Only remove the row if it is in the proper last slot. 348 | if (this.cells[row_i].length == this.column_widths.length) { 349 | this.cells[row_i].pop(); 350 | } 351 | } 352 | } 353 | 354 | this.get_column_widths(); 355 | 356 | } 357 | 358 | MarkdownTableFormatter.prototype.pad_cells_for_output = function() { 359 | 360 | for (var row_i = 0, row_l = this.cells.length; row_i < row_l; row_i = row_i + 1) { 361 | for (var col_i = 0, col_l = this.cells[row_i].length; col_i < col_l; col_i = col_i + 1) { 362 | 363 | // Handle anything that's not the separator row 364 | if (row_i != 1) { 365 | while(this.cells[row_i][col_i].length < this.column_widths[col_i]) { 366 | this.cells[row_i][col_i] += " "; 367 | } 368 | } 369 | // Handle the separator row. 370 | else { 371 | while(this.cells[row_i][col_i].length < this.column_widths[col_i]) { 372 | this.cells[row_i][col_i] += "-"; 373 | } 374 | } 375 | } 376 | } 377 | } 378 | 379 | module.exports = { 380 | color, 381 | requestUserFields, 382 | readCache, 383 | writeCache, 384 | writeHistory, 385 | dictionary, 386 | stringToChunks, 387 | weights, 388 | thresholds, 389 | MarkdownTableFormatter 390 | }; 391 | -------------------------------------------------------------------------------- /search.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const { color, dictionary, requestUserFields, readCache, writeCache, writeHistory, weights, thresholds } = require('./utils'); 3 | 4 | const engine = process.argv[2]; 5 | const query = process.argv[3]; 6 | 7 | 8 | // populate banner 9 | function banner() { 10 | if (engine !== 'url' && settings.banner) { 11 | let banner = settings.banner; 12 | let matches = banner.match(/{(.*?)}/g); 13 | matches.forEach((match, i) => { 14 | let escCode = ''; 15 | let fields = match.slice(1,-1).split(','); 16 | fields.forEach(field => { 17 | props = field.split('='); 18 | let colorModifier = props[0].trim() === 'bg' ? 
10 : 0; 19 | if (props[1].trim().startsWith('color')) { 20 | escCode += '\x1b[' + (38 + colorModifier) + ';5;' + props[1].trim().slice(5) + 'm'; 21 | } else { 22 | escCode += color[props[1].trim()]; 23 | if (colorModifier) { 24 | escCode = escCode.replace('[3', '[4'); 25 | } 26 | } 27 | }); 28 | if (!i) escCode += ' '; // pad the beginning 29 | banner = banner.replace(match, escCode); 30 | }); 31 | return `${banner} ${color.reset} ${color.bright}${query}${color.reset}`; 32 | } else { 33 | return `${color.red}${engine} ${color.reset} ${color.bright}${query}${color.reset}`; 34 | } 35 | } 36 | 37 | // output data 38 | function outputToTerminal(format, groups) { 39 | // return if we're just caching 40 | if (!groups) return; 41 | 42 | if (format === "json") { 43 | console.log(JSON.stringify(groups)); 44 | } else if (format === 'shell') { 45 | var curated = banner() + '\n'; 46 | 47 | groups.forEach(function (group, index) { 48 | let groupColor; 49 | if (group.groupType === MAIN) { 50 | groupColor = color.yellow; 51 | } else if (group.groupType === PAGER) { 52 | groupColor = color.bright + color.cyan; 53 | } else if (group.groupType === GENERIC) { 54 | groupColor = color.black + color.bright; 55 | } else if (group.groupType === OTHER) { 56 | groupColor = color.black + color.bright; 57 | } else if (group.groupType === CATEGORY) { 58 | groupColor = color.green; 59 | } else { 60 | groupColor = color.white; 61 | } 62 | 63 | group.elements.forEach(function (element) { 64 | if (!process.env.ONLY_MAIN || group.groupType === MAIN || group.groupType === PAGER) { 65 | curated += 66 | groupColor + element.name.replace(/\n/g, ', ').padEnd(parseInt(120 * 2 / 3)) + color.reset + '\t' + 67 | color.blue + color.underscore + element.href + color.reset + ( 68 | group.groupType === PAGER || group.groupType === CATEGORY ? '\t\t(pager)' : '' 69 | ) + '\n'; 70 | } 71 | }); 72 | }); 73 | console.log(curated); 74 | } else { 75 | console.log('No format specified'); 76 | } 77 | } 78 | 79 | // helper function for determining if paths are the same 80 | function isSamePath(a, b) { 81 | return a.path.every((element, index) => element === b.path[index]); 82 | } 83 | 84 | // finds a group with the same style in current results 85 | // chances are groups will be in the same order, but there may be missing/new 86 | // groups depending on what the search engine inserts into the page (ads, previews, maps, cards) 87 | function findGroupByStyle(currentResults, style) { 88 | for (var index = 0; index < currentResults.groups.length; index++) { 89 | let group = currentResults.groups[index]; 90 | if ( 91 | group.style.fontSize === style.fontSize && 92 | group.style.fontFamily === style.fontFamily && 93 | group.style.fontWeight === style.fontWeight && 94 | group.style.color === style.color && 95 | group.style.border === style.border && 96 | group.style.visible === style.visible && 97 | isSamePath(group.style, style) 98 | ) { 99 | return index; 100 | } 101 | } 102 | return -1; 103 | } 104 | 105 | 106 | // returns domain name from passed URL 107 | function domain(url) { 108 | let hostname; 109 | if (url.indexOf("//") > -1) { 110 | hostname = url.split('/')[2]; 111 | } else { 112 | hostname = url.split('/')[0]; 113 | } 114 | 115 | // find & remove port number 116 | hostname = hostname.split(':')[0]; 117 | // find & remove "?" 
118 | hostname = hostname.split('?')[0]; 119 | 120 | return hostname; 121 | } 122 | 123 | // helper functions used by classifier 124 | const mostly = (g, group) => g.length / group.elements.length > 0.6; 125 | function isNavigation(element) { 126 | // if (element.name.slice(0, 2) === 'Old') 127 | let names = dictionary.navigation.name; 128 | let links = dictionary.navigation.href; 129 | let elementName = element.name.toLowerCase(); 130 | let elementHref = element.href.toLowerCase(); 131 | for (var nameIndex = 0; nameIndex < names.length; nameIndex++) { 132 | if (new RegExp(names[nameIndex], 'u').test(elementName)) { 133 | // name passes navigation check 134 | for (var hrefIndex = 0; hrefIndex < links.length; hrefIndex++) { 135 | if (new RegExp(links[hrefIndex], 'u').test(elementHref)) { 136 | return true; 137 | } 138 | } 139 | } 140 | } 141 | return false; 142 | } 143 | 144 | // constants for group types 145 | const MAIN = 0; 146 | const PAGER = 1; 147 | const CATEGORY =2; 148 | const CATEGORY2=3; 149 | const DEFAULT = 4; 150 | const GENERIC = 5; 151 | const OTHER = 6; 152 | 153 | // removes any groups/elements that are static between pages, pages are cached 154 | function removeCruftAndClassify(currentResults) { 155 | let urlMap = {}; 156 | if (process.env.CACHING) { 157 | writeCache(engine, 'template', currentResults); 158 | return; 159 | } else if (engine === 'url') { 160 | currentResults.groups.slice(0).forEach(group => { 161 | group.groupType = DEFAULT; 162 | 163 | let cruft = []; 164 | let jsLink = []; 165 | let generic = []; 166 | group.elements.forEach(element => { 167 | if (dictionary.cruft.includes(element.name.toLowerCase())) { 168 | cruft.push(element); 169 | } else if (isNavigation(element)) { 170 | group.groupType = PAGER; 171 | } else if (element.href.slice(0, 11) === "javascript:") { 172 | jsLink.push(element); 173 | } 174 | 175 | group.elements.forEach(e => { 176 | urlMap[e.href] = e; 177 | }) 178 | }); 179 | 180 | let currentIndex = currentResults.groups.indexOf(group); 181 | if (mostly(cruft, group)) { 182 | // a lot of generic elements 183 | currentResults.groups.splice(currentIndex, 1); 184 | } else if (mostly(jsLink, group)) { 185 | // a lot of elements that only execute JS, we can't do anything with them yet 186 | currentResults.groups.splice(currentIndex, 1); 187 | } else if (group.coverage < thresholds.coverage || group.elements.length < thresholds.numElements) { 188 | // group is too small to seem significant 189 | group.groupType = OTHER; 190 | } 191 | }); 192 | } else { 193 | let cache = readCache(engine, 'template'); 194 | // filter out results based on cache 195 | currentResults.groups.slice(0).forEach(group => { 196 | group.groupType = DEFAULT; 197 | 198 | let index = findGroupByStyle(cache, group.style); 199 | let cruft = []; 200 | let jsLink = []; 201 | let generic = []; 202 | let currentIndex = currentResults.groups.indexOf(group); 203 | if (index !== -1) { 204 | let cachedGroup = cache.groups[index]; 205 | group.elements.forEach(element => { 206 | let found = cachedGroup.elements.find(currentElement => { 207 | return currentElement.name === element.name; 208 | }); 209 | if (found) { 210 | if (found.href === element.href || !found.name) { 211 | // 100% cruft (url and name match) 212 | cruft.push(found); 213 | } else if (dictionary.cruft.includes(element.name.toLowerCase())) { 214 | cruft.push(found); 215 | } else if (settings.pager) { 216 | // generic navigational component that may be related to current search 217 | // (name matches, url does not) 218 | 
generic.push(found); 219 | if (found.name === settings.pager.name && 220 | found.href.includes(settings.pager.href) && 221 | domain(element.href) === domain(settings.query) 222 | ) { 223 | // this is a pager group 224 | group.groupType = PAGER; 225 | } 226 | } 227 | } else if (element.href.slice(0, 11) === "javascript:") { 228 | jsLink.push(element); 229 | } 230 | if (isNavigation(element)) { 231 | group.groupType = PAGER; 232 | } 233 | group.elements.forEach(e => { 234 | urlMap[e.href] = e; 235 | }) 236 | }); 237 | 238 | if (mostly(cruft, group)) { 239 | // a lot of generic elements 240 | currentResults.groups.splice(currentIndex, 1); 241 | } else if (!group.pagers && group.elements.length < 2) { 242 | // only 1 element in group 243 | currentResults.groups.splice(currentIndex, 1); 244 | } else if (mostly(generic, group) && group.groupType !== PAGER) { 245 | // group of generically-named components 246 | group.groupType = GENERIC; 247 | } 248 | } else { 249 | let categoryElements = []; 250 | let spliceOffset = 0; 251 | group.elements.slice(0).forEach((e, i) => { 252 | urlMap[e.href] = e; 253 | if (isNavigation(e)) { 254 | // this is needed for now since we're going off of bad query, since the query may not yield 255 | // other pages, as we improve caching logic, we can probbaly remove this 256 | group.groupType = PAGER; 257 | } else if (settings.categories && !(group.groupType === PAGER)) { 258 | Object.keys(settings.categories).forEach(category => { 259 | settings.categories[category].forEach(rule => { 260 | if (rule.find) { 261 | // a rule that recategorizes existing results 262 | if (rule.find.href && new RegExp(rule.find.href, 'u').test(e.href)) { 263 | if (rule.find.name && !(new RegExp(rule.find.name, 'u').test(e.name))) { 264 | return; 265 | } 266 | e.name = category + ': ' + e.name; 267 | categoryElements.push(e); 268 | group.elements.splice(i - spliceOffset, 1); 269 | spliceOffset++; 270 | } 271 | } 272 | }); 273 | }); 274 | } else if (e.href.slice(0, 11) === "javascript:") { 275 | jsLink.push(e); 276 | } 277 | }); 278 | if (categoryElements.length) { 279 | // some elements were categorized 280 | if (!group.elements.length) { 281 | // entire group got categorized 282 | group.elements = categoryElements; 283 | group.groupType = CATEGORY; 284 | } else { 285 | // part of the group got categorized 286 | // TODO; technically group areas need to be recomputed and they need to be resorted 287 | let categoryGroup = { ...group, groupType: CATEGORY, elements: categoryElements }; 288 | currentResults.groups.splice(currentIndex, 0, categoryGroup); 289 | } 290 | } 291 | } 292 | 293 | // further classify the group 294 | if (mostly(jsLink, group)) { 295 | // a lot of elements that only execute JS, we can't do anything with them yet 296 | currentResults.groups.splice(currentIndex, 1); 297 | } else if ( 298 | group.groupType !== PAGER && ( 299 | group.coverage < (settings.minGroupSize ? 
settings.minGroupSize : thresholds.coverage) || 300 | group.elements.length < thresholds.numElements 301 | ) 302 | ) { 303 | // group is too small to seem significant 304 | //group.groupType = OTHER; 305 | } 306 | }) 307 | } 308 | writeCache(engine, 'current', urlMap); 309 | 310 | // find main group 311 | let groupIndex = 0; 312 | while (groupIndex < currentResults.groups.length) { 313 | if (currentResults.groups[groupIndex].groupType === DEFAULT) { 314 | currentResults.groups[groupIndex].groupType = MAIN; 315 | break; 316 | } 317 | groupIndex++; 318 | } 319 | 320 | return currentResults.groups.sort((a, b) => a.groupType - b.groupType); 321 | } 322 | 323 | // load engine-specific settings 324 | let settings = {}; 325 | if (engine !== 'url') { 326 | try { 327 | settings = require('./engines/' + engine); 328 | } catch (e) { 329 | if (/Cannot find module/.test(e)) { 330 | console.log('No configuration exists for ' + engine); 331 | } else { 332 | console.log(engine + '.json: ' + e); 333 | } 334 | process.exit(1); 335 | } 336 | } 337 | 338 | const isValidUrl = (string) => { 339 | try { 340 | new URL(string); 341 | return true; 342 | } catch (_) { 343 | return false; 344 | } 345 | } 346 | 347 | (async () => { 348 | const browser = await puppeteer.launch({ 349 | args: [ 350 | '--no-sandbox', 351 | '--disable-setuid-sandbox', 352 | '--disable-infobars', 353 | '--window-position=0,0', 354 | '--ignore-certifcate-errors', 355 | '--ignore-certifcate-errors-spki-list', 356 | '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"', 357 | '--disk-cache-dir=/tmp', 358 | ], 359 | ignoreHTTPSErrors: true, 360 | }); 361 | const page = await browser.newPage(); 362 | await page.setRequestInterception(true); 363 | 364 | // skip downloading images 365 | page.on('request', request => { 366 | if (request.resourceType() === 'image') { 367 | request.abort(); 368 | } else { 369 | request.continue(); 370 | } 371 | }); 372 | 373 | // login, if relevant info is available 374 | if (settings.authentication && !process.env.CACHING) { 375 | const auth = settings.authentication; 376 | let config = readCache(engine, 'auth'); 377 | let cookieData = readCache(engine, 'cookies'); 378 | if (config.url) { 379 | auth.loginPage = auth.loginPage.replace('{{URL}}', config.url); 380 | settings.query = settings.query.replace('{{URL}}', config.url); 381 | } 382 | 383 | if (cookieData.cookies) { 384 | // we already have cookies, set them and continue 385 | // TODO: we need to test for expired cookies 386 | for (let cookie of cookieData.cookies) { 387 | await page.setCookie(cookie); 388 | } 389 | } else { 390 | // no cookies, perform login 391 | if (config.username) { 392 | auth.username = auth.username.replace('{{USERNAME}}', config.username); 393 | } 394 | if (config.password) { 395 | auth.password = auth.password.replace('{{PASSWORD}}', config.password); 396 | } 397 | 398 | if (auth.submitUsernameSelector) { 399 | // 2-page authentication system (i.e. 
gmail) 400 | await page.goto(auth.loginPage); 401 | await page.type(auth.usernameSelector, auth.username); 402 | await Promise.all([ 403 | page.click(auth.submitUsernameSelector), 404 | page.waitForNavigation({ waitUntil: 'networkidle0' }), 405 | ]); 406 | await page.type(auth.passwordSelector, auth.password); 407 | await Promise.all([ 408 | page.click(auth.submitPasswordSelector), 409 | page.waitForNavigation({ waitUntil: 'networkidle0' }), 410 | ]); 411 | // await page.screenshot({path: 'postlogin.png'}); 412 | } else { 413 | // regular 1-page authentication 414 | await page.goto(auth.loginPage); 415 | // await page.screenshot({path: 'login.png'}); 416 | await page.type(auth.usernameSelector, auth.username); 417 | await page.type(auth.passwordSelector, auth.password); 418 | await Promise.all([ 419 | page.click(auth.submitSelector), 420 | page.waitForNavigation({ waitUntil: 'networkidle0' }), 421 | ]); 422 | // await page.screenshot({path: 'postlogin.png'}); 423 | } 424 | 425 | // get cookies for future use 426 | // for now we'll just reauthenticate each time; in the future we should test 427 | // cookies first, and have a way to check whether we're already logged in 428 | const cookies = await page.cookies(); 429 | writeCache(engine, 'cookies', { cookies: cookies }); 430 | } 431 | } 432 | 433 | // page.on('console', msg => console.log('page log: ' + msg.text())); 434 | 435 | if (process.env.CACHING) { 436 | let config = await requestUserFields(engine, settings); 437 | settings.query = settings.query.replace('{{URL}}', config.url); 438 | // caching page structure 439 | await page.goto(settings.query + encodeURIComponent(settings.badQuery)); 440 | } else if (engine === "url") { 441 | // go directly to this page (direct) 442 | let url = query; 443 | if (!isValidUrl(url)) { 444 | url = 'http://' + url; 445 | } 446 | await page.goto(url); 447 | let title = await page.title(); 448 | writeHistory(url, 'U', { title: title }, true); 449 | } else if (isValidUrl(query) && domain(query) === domain(settings.query)) { 450 | // go directly to this page (navigational) 451 | await page.goto(query); 452 | let title = await page.title(); 453 | writeHistory(query, 'N', { engine: engine, title: title }); 454 | } else { 455 | // start a new search with query 456 | let modifier = settings.resultModifier ? settings.resultModifier + (process.env.RESULTS || settings.resultsPerPage || 20) : ''; 457 | let searchQuery = settings.query + encodeURIComponent(query) + modifier; 458 | await page.goto(searchQuery); 459 | writeHistory(searchQuery, 'S', { engine: engine, query: query }, true); 460 | } 461 | // await page.screenshot({path: 'example.png'}); 462 | let results = await page.evaluate((columns, weights, settings) => { 463 | 464 | /** LIST OF LOGIC TO BE USED */ 465 | 466 | 467 | // test if DOM element is visible to end user 468 | function isVisible(elem) { 469 | if (!(elem instanceof Element)) throw Error('DomUtil: elem is not an element.'); 470 | var style = getComputedStyle(elem); 471 | var rect = elem.getBoundingClientRect(); 472 | if (style.display === 'none') return false; 473 | if (style.visibility !== 'visible') return false; 474 | if (parseFloat(style.opacity) < 0.1) return false; 475 | if (elem.offsetWidth + elem.offsetHeight + rect.height + rect.width === 0) { 476 | return false; 477 | } 478 | return true; 479 | } 480 | 481 | // squishes node into its CSS selector 482 | function extractCssSelector(node) { 483 | return node.tagName + 484 | (node.id ? '#' + node.id : '') + 485 | (node.className ? '.'
+ Array.prototype.join.call(node.classList, '.') : ''); 486 | } 487 | 488 | // find DOM element ancestors 489 | function listParents(node) { 490 | var nodes = [extractCssSelector(node)] 491 | for (; node; node = node.parentNode) { 492 | nodes.unshift(extractCssSelector(node)) 493 | } 494 | return nodes 495 | } 496 | 497 | // get visual style for a single DOM element 498 | function getStyle(element) { 499 | var style = window.getComputedStyle(element); 500 | var dimensions = element.getBoundingClientRect(); 501 | return { 502 | fontSize: style.fontSize, 503 | fontFamily: style.fontFamily, 504 | fontWeight: style.fontWeight, 505 | color: style.color, 506 | background: style.backgroundColor, 507 | border: style.border, 508 | visible: isVisible(element), 509 | display: style.display, 510 | loc: { 511 | x: dimensions.left, 512 | y: dimensions.top, 513 | h: dimensions.height, 514 | w: dimensions.width 515 | } 516 | }; 517 | } 518 | 519 | // extract important DOM element properties into serializable JSON object 520 | function extract(element) { 521 | return { 522 | tag: element.tagName, 523 | css: getStyle(element), 524 | href: element.href, 525 | name: element.innerText ? element.innerText.trim() : '', 526 | classes: [...element.classList], 527 | path: listParents(element), 528 | id: element.id 529 | }; 530 | } 531 | 532 | // compute encompassing region given 2 child regions 533 | function combineRegion(region1, region2) { 534 | var minX = Math.min(region1.x, region2.x); 535 | var minY = Math.min(region1.y, region2.y); 536 | var maxX = Math.max(region1.x + region1.w, region2.x + region2.w); 537 | var maxY = Math.max(region1.y + region1.h, region2.y + region2.h); 538 | 539 | return { 540 | x: minX, 541 | y: minY, 542 | w: maxX - minX, 543 | h: maxY - minY 544 | }; 545 | } 546 | 547 | // helper function for expandSelection 548 | function isSameStyle(a, b) { 549 | a = getStyle(a); 550 | b = getStyle(b); 551 | 552 | if ( 553 | a.fontSize === b.fontSize && 554 | a.fontFamily === b.fontFamily && 555 | a.fontWeight === b.fontWeight && 556 | a.color === b.color && 557 | a.border === b.border && 558 | a.visible === b.visible 559 | ) { 560 | return true; 561 | } 562 | return false; 563 | } 564 | 565 | // expands selection to elements encompassing the link elements until largest common 566 | // ancestor is found for all elements in the group (a basis for better preview) 567 | function expandSelection(elements) { 568 | let parents = [...elements].map(e => { 569 | let node = e._node; 570 | delete e._node; 571 | return node; 572 | }); 573 | if (parents.length === 1) { 574 | // there won't be other elements to compare the context to, assume no context 575 | return parents; 576 | } 577 | let grandParents; 578 | while (true) { 579 | grandParents = []; 580 | for (var i=0; i < parents.length; i++) { 581 | let parent = parents[i].parentNode; 582 | if (parent === window) { 583 | return parents; 584 | } 585 | if (grandParents.length) { 586 | let prev = grandParents[grandParents.length-1]; 587 | if (prev === parent) { 588 | // at least two elements joined, stop analyzing 589 | return parents; 590 | } else if (!isSameStyle(prev, parent)) { 591 | // styles don't match 592 | return parents; 593 | } 594 | } 595 | grandParents.push(parent); 596 | } 597 | parents = grandParents; 598 | } 599 | return parents; 600 | } 601 | 602 | // fetches details from current selection suitable for rendering later 603 | function getRenderDetail(node) { 604 | let detail = extract(node); 605 | if (detail.css.visible) { 606 | return { 607 | 
...detail, 608 | children: Array.prototype.map.call(node.childNodes, (node) => { 609 | if (node.nodeType === Node.TEXT_NODE) { 610 | return node.textContent; 611 | } else if (node.nodeType === Node.ELEMENT_NODE) { 612 | return getRenderDetail(node); 613 | } else { 614 | return ''; 615 | } 616 | }) 617 | } 618 | } else { 619 | return ''; 620 | } 621 | } 622 | 623 | // compares children of each node 624 | function isSameRenderDetail(a, b) { 625 | 626 | // there may be undefined nodes 627 | if (a === undefined) { 628 | if (b === undefined) { 629 | return true; 630 | } else { 631 | return false; 632 | } 633 | } else if (b === undefined) { 634 | return false; 635 | } 636 | 637 | // there may be text nodes 638 | if (a.nodeType === Node.TEXT_NODE) { 639 | if (b.nodeType === Node.TEXT_NODE) { 640 | return true; 641 | } else { 642 | return false; 643 | } 644 | } else if (b.nodeType === Node.TEXT_NODE) { 645 | return false; 646 | } 647 | 648 | let aSummary = getRenderDetail(a); 649 | let bSummary = getRenderDetail(b); 650 | 651 | // there may be invisible nodes 652 | if (aSummary === '') { 653 | if (bSummary === '') { 654 | return true; 655 | } else { 656 | return false; 657 | } 658 | } else if (bSummary === '') { 659 | return false; 660 | } 661 | 662 | if ( 663 | aSummary.css.fontSize === bSummary.css.fontSize && 664 | aSummary.css.fontFamily === bSummary.css.fontFamily && 665 | aSummary.css.fontWeight === bSummary.css.fontWeight && 666 | aSummary.css.color === bSummary.css.color && 667 | aSummary.css.border === bSummary.css.border && 668 | [...a.childNodes].every((child, i) => isSameRenderDetail(child, b.childNodes[i])) 669 | ) { 670 | return true; 671 | } 672 | return false; 673 | } 674 | 675 | // these parameters are used for normalization later 676 | let metrics = { 677 | 'max-area': 0, 678 | 'max-coverage': 0, 679 | 'max-textLength': 0, 680 | 'max-context': 0 681 | }; 682 | // gather metrics 683 | const updateMax = (group, type) => { 684 | metrics['max-' + type] = Math.max(metrics['max-' + type], group[type]); 685 | } 686 | 687 | // group a list of DOM elements by visual style 688 | function groupByStyle(elements) { 689 | var groups = []; 690 | elements.forEach(function (e) { 691 | 692 | // if group already exists, find it and append to it 693 | for (var i = 0; i < groups.length; i++) { 694 | var style = groups[i].style; 695 | if ( 696 | // group should have same color/font 697 | e.css.color === style.color && 698 | e.css.fontFamily === style.fontFamily && 699 | e.css.fontSize === style.fontSize && 700 | e.css.fontWeight === style.fontWeight && ( 701 | // group should resemble some sort of list/tile layout 702 | e.css.loc.x === style.loc.x || 703 | e.css.loc.y === style.loc.y || 704 | e.css.loc.x + e.css.loc.w === style.loc.x + style.loc.w || 705 | e.css.loc.y + e.css.loc.h === style.loc.y + style.loc.h 706 | ) && isSameRenderDetail(e._node, groups[i].elements[0]._node) 707 | ) { 708 | groups[i].elements.push(e); 709 | groups[i].style.loc = combineRegion( 710 | groups[i].style.loc, 711 | e.css.loc 712 | ); 713 | groups[i].coverage = groups[i].style.loc.w * groups[i].style.loc.h; 714 | groups[i].area += e.css.loc.w * e.css.loc.h; 715 | 716 | return; 717 | } 718 | } 719 | 720 | // group doesn't exist, start a new group 721 | groups.push({ 722 | // deep-copy the structure, since we will edit size 723 | style: { ...e.css, loc: { ...e.css.loc}, path: e.path }, 724 | elements: [e], 725 | area: e.css.loc.w * e.css.loc.h, 726 | coverage: e.css.loc.w * e.css.loc.h 727 | }); 728 | }); 729 | 730 | 
groups.forEach(group => { 731 | group.textLength = group.elements.reduce((a, v) => { return a + v.name.length }, 0); 732 | 733 | updateMax(group, 'area'); 734 | updateMax(group, 'coverage'); 735 | updateMax(group, 'textLength'); 736 | }); 737 | 738 | return groups; 739 | } 740 | 741 | // helper logic for normalizing significance params and applying weights 742 | const weigh = (group, field) => { 743 | let weight = weights[field]; 744 | if (settings && settings.weights && settings.weights[field]) { 745 | weight = settings.weights[field]; 746 | } 747 | return weight * group[field] / metrics['max-' + field]; 748 | } 749 | 750 | // returns relative significance of the group based on a number of heuristics 751 | function significance(group) { 752 | return weigh(group, 'coverage') + weigh(group, 'area') + weigh(group, 'textLength') + weigh(group, 'context'); 753 | } 754 | 755 | 756 | 757 | /** END LIST, BEGIN PROGRAM **/ 758 | 759 | 760 | 761 | let elements = document.querySelectorAll('a'); 762 | let relevant = []; 763 | for (var i = 0; i < elements.length; i++) { 764 | var e = extract(elements[i]); 765 | e._node = elements[i]; 766 | if (e.css.visible) { 767 | relevant.push(e); 768 | } 769 | } 770 | 771 | // fill in extra context for better preview later 772 | let groups = groupByStyle(relevant); 773 | groups.forEach(group => { 774 | let parents = expandSelection(group.elements); 775 | group.context = 0; 776 | group.elements.forEach((element, index) => { 777 | element.context = getRenderDetail(parents[index]); 778 | group.context += parents[index].innerText.length; 779 | }); 780 | updateMax(group, 'context'); 781 | }); 782 | groups = groups.sort((a, b) => significance(a) < significance(b) ? 1 : -1); 783 | 784 | return { 785 | groups: groups 786 | }; 787 | }, process.stdout.columns, weights, settings); 788 | 789 | outputToTerminal('shell', removeCruftAndClassify(results)); 790 | 791 | await browser.close(); 792 | })(); 793 | --------------------------------------------------------------------------------
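Editor's note: the login branch in search.js above (settings.authentication) expects an optional "authentication" block in an engine's JSON config, with the {{URL}}, {{USERNAME}}, and {{PASSWORD}} tokens filled in at runtime from the values returned by readCache(engine, 'auth'). None of the engine files shown in this listing include such a block, so the sketch below only illustrates the field names that code path reads; the file name, selectors, and URLs are invented placeholders, not part of the repository. It is written as a JS object rather than raw JSON so it can carry comments.

// hypothetical shape of an authenticated engine config (e.g. engines/example.json); all values are placeholders
const exampleEngine = {
  query: "{{URL}}/search?q=",              // {{URL}} is substituted from the cached auth config
  badQuery: "zzzznoresults",                // placeholder; used only when caching page structure
  authentication: {
    loginPage: "{{URL}}/login",             // placeholder login URL
    usernameSelector: "#username",          // placeholder CSS selectors
    passwordSelector: "#password",
    submitSelector: "button[type=submit]",
    username: "{{USERNAME}}",               // replaced with the cached username
    password: "{{PASSWORD}}"                // replaced with the cached password
  }
};

For a two-step login page (the auth.submitUsernameSelector branch, e.g. gmail), submitUsernameSelector and submitPasswordSelector would be supplied in place of submitSelector.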