├── .editorconfig ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .nvmrc ├── .prettierrc ├── LICENSE ├── README.md ├── __tests__ └── app.test.js ├── app ├── app.js ├── server.js └── validator.js ├── ecosystem.config.js ├── package-lock.json ├── package.json └── script └── bootstrap.sh /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf 8 | insert_final_newline = true 9 | charset = utf-8 10 | 11 | [*.{js}] 12 | indent_style = space 13 | indent_size = 4 14 | 15 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | strategy: 11 | matrix: 12 | node-version: [18.x] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Use Node.js ${{ matrix.node-version }} 17 | uses: actions/setup-node@v3 18 | with: 19 | node-version: ${{ matrix.node-version }} 20 | cache: 'npm' 21 | - run: npm install 22 | - run: npm run build --if-present 23 | - run: npm test 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | users 2 | node_modules 3 | log 4 | package-lock.checksum 5 | vendor -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 11.6.0 -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "printWidth": 120, 4 | "trailingComma": "none", 5 | 
"tabWidth": 4 6 | } 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Feedbin, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Extract 2 | ======= 3 | 4 | Extract just the content from a web page. 5 | 6 | Extract is a wrapper to turn the [Mercury Parser](https://github.com/postlight/parser) into a web service. 7 | 8 | Why? 9 | ---- 10 | 11 | Mercury already offers an [API component](https://github.com/postlight/parser-api), meant to be deployed to AWS Lambda. There are a few reasons why this exists as an alternative. 12 | 13 | 1. Deploy elsewhere. 
Extract is a vanilla Node.js app that is meant to run in a VM and has no platform-specific dependencies. 14 | 15 | 2. Built-in authorization system. 16 | 17 | 3. Performance. In my experience, running it on a VM has been faster than the lambda version. 18 | 19 | Here's a graph where you can see a decrease in average response time around the `17. Feb` mark. This is when Feedbin switched from the lambda hosted version to extract running on a VPS. 20 | 21 | ![Response Time](https://user-images.githubusercontent.com/133809/53254496-54e85b00-3678-11e9-949a-f61824a4ac96.png) 22 | 23 | Installation 24 | ------------ 25 | 26 | 1. Install [Node.js](https://nodejs.org/en/) and [npm](https://www.npmjs.com/). 27 | 28 | 2. Clone extract 29 | 30 | ```bash 31 | git clone https://github.com/feedbin/extract.git 32 | ``` 33 | 34 | 3. Install the dependencies. 35 | 36 | ```bash 37 | cd extract 38 | npm install 39 | ``` 40 | 41 | 4. Run the server 42 | 43 | ```bash 44 | node app/server.js 45 | ``` 46 | 47 | Alternatively, extract includes an `ecosystem.config.js` to use with [pm2](https://github.com/Unitech/pm2). You could use this in production. 48 | 49 | ```bash 50 | npm install --global pm2 51 | pm2 start ecosystem.config.js 52 | ``` 53 | 54 | Usage 55 | ----- 56 | 57 | Extract has a simple, file-based system for creating users and secret keys. This allows users to be added/removed while the system is running. In the `./users` directory, the filename is the username and the file's contents are the secret key. To make a new user, run the following: 58 | 59 | ``` 60 | cd extract 61 | mkdir users 62 | 63 | # use your own secret key and username 64 | echo "SECRET_KEY" > users/USERNAME 65 | ``` 66 | 67 | Once a username and secret key have been created, you can make a request. 
68 | 69 | An example request looks like: 70 | 71 | ``` 72 | http://localhost:3000/parser/:username/:signature?base64_url=:base64_url 73 | ``` 74 | 75 | The parts that you need are: 76 | 77 | - `username` your username 78 | - `signature` the hexadecimal HMAC-SHA1 signature of the URL you want to parse 79 | - `base64_url` base64 encoded version of the URL you want to parse 80 | 81 | The URL is base64-encoded to avoid any issues in the way different systems encode URLs. It must use the [RFC 4648](https://tools.ietf.org/html/rfc4648#section-5) url-safe variant with no newlines. 82 | 83 | If your platform does not offer a URL safe base64 option, you can replicate it. First create the base64 encoded string. Then replace the following characters: 84 | 85 | - `+` => `-` 86 | - `/` => `_` 87 | - `\n` => `""` 88 | 89 | Here's a sample implementation in ruby. You can use this as a reference for matching your implementation. 90 | 91 | ```ruby 92 | require "uri" 93 | require "openssl" 94 | require "base64" 95 | 96 | username = "username" 97 | secret = "secret" 98 | host = "localhost" 99 | port = 3000 100 | url = "https://feedbin.com/blog/2018/09/11/private-by-default/" 101 | 102 | digest = OpenSSL::Digest.new("sha1") 103 | signature = OpenSSL::HMAC.hexdigest(digest, secret, url) 104 | 105 | base64_url = Base64.urlsafe_encode64(url).gsub("\n", "") 106 | 107 | URI::HTTPS.build({ 108 | host: host, 109 | port: port, 110 | path: "/parser/#{username}/#{signature}", 111 | query: "base64_url=#{base64_url}" 112 | }).to_s 113 | ``` 114 | 115 | The above example would produce: 116 | 117 | ``` 118 | https://localhost:3000/parser/username/e4696f8630bb68c21d77a9629ce8d063d8e5f81c?base64_url=aHR0cHM6Ly9mZWVkYmluLmNvbS9ibG9nLzIwMTgvMDkvMTEvcHJpdmF0ZS1ieS1kZWZhdWx0Lw== 119 | ``` 120 | 121 | With the output: 122 | 123 | ```json 124 | { 125 | "title": "Private by Default", 126 | "author": null, 127 | "date_published": "2018-09-11T00:00:00.000Z", 128 | "dek": null, 129 | "lead_image_url": 
"https://assets.feedbin.com/assets-site/blog/2018-09-11/embed-3f43088538ae5ed7e585c00013adc13a915fd35de31990b3081a085b963ed7dd.png", 130 | "content": "
content
", 131 | "next_page_url": null, 132 | "url": "https://feedbin.com/blog/2018/09/11/private-by-default/", 133 | "domain": "feedbin.com", 134 | "excerpt": "September 11, 2018 by Ben Ubois I want Feedbin to be the opposite of Big Social. I think people should have the right not to be tracked on the Internet and Feedbin can help facilitate that. Since…", 135 | "word_count": 787, 136 | "direction": "ltr", 137 | "total_pages": 1, 138 | "rendered_pages": 1 139 | } 140 | ``` 141 | -------------------------------------------------------------------------------- /__tests__/app.test.js: -------------------------------------------------------------------------------- 1 | const request = require("supertest") 2 | const app = require("../app/app") 3 | const mkdirp = require('mkdirp') 4 | const fs = require('fs') 5 | const path = require("path") 6 | const hmac = require("crypto-js/hmac-sha1") 7 | 8 | describe("Test the health check", () => { 9 | test("It should respond to GET", (done) => { 10 | request(app).get("/health_check").expect(200).end(done) 11 | }) 12 | }) 13 | 14 | describe("Test the parser", () => { 15 | 16 | const user = "test" 17 | const password = "test" 18 | const url = "http://example.com" 19 | const signature = hmac(url, password).toString() 20 | const base64_url = new Buffer.from(url).toString("base64") 21 | const file = path.normalize(path.join(__dirname, "..", "users")) 22 | const writeFile = async (dir, path, content) => { 23 | await mkdirp(dir) 24 | fs.writeFileSync(path, content) 25 | } 26 | writeFile(file, path.join(file, user), password) 27 | 28 | test("It should respond to GET", (done) => { 29 | request(app).get(`/parser/${user}/${signature}?base64_url=${base64_url}`).expect(200).end(done) 30 | }) 31 | 32 | test("It should fail with invalid user", (done) => { 33 | request(app).get(`/parser/invalid_user/${signature}?base64_url=${base64_url}`).expect(400, { 34 | error: true, 35 | messages: "User does not exist: invalid_user." 
36 | }, done) 37 | }) 38 | 39 | test("It should fail with invalid signature", (done) => { 40 | request(app).get(`/parser/${user}/invalid_signature?base64_url=${base64_url}`).expect(400, { 41 | error: true, 42 | messages: "Invalid signature." 43 | }, done) 44 | }) 45 | 46 | test("It should fail with missing params", (done) => { 47 | request(app).get(`/parser/${user}/${signature}`).expect(400, { 48 | error: true, 49 | messages: "Invalid request. Missing base64_url parameter." 50 | }, done) 51 | }) 52 | 53 | }) -------------------------------------------------------------------------------- /app/app.js: -------------------------------------------------------------------------------- 1 | const express = require("express") 2 | const app = express() 3 | const parser = require("@postlight/parser") 4 | const validator = require("./validator") 5 | 6 | function decodeURL(encodedURL) { 7 | return Buffer.from(encodedURL, "base64").toString("utf-8") 8 | } 9 | 10 | function getParams(request) { 11 | const user = request.params.user 12 | const signature = request.params.signature 13 | const base64url = request.query.base64_url.replace(/ /g, "+") 14 | const url = decodeURL(base64url) 15 | return { user, signature, url } 16 | } 17 | 18 | function log(request, extra) { 19 | let output = `[${request.ip}] - ${request.method} ${request.url}` 20 | if (extra) { 21 | output = `${output}: ${extra}` 22 | } 23 | console.log(output) 24 | } 25 | 26 | function errorHandler(request, response, next, error, message) { 27 | log(request, message) 28 | response.status(400).json({ 29 | error: true, 30 | messages: message 31 | }) 32 | next(error) 33 | } 34 | 35 | app.get("/health_check", (request, response) => { 36 | log(request) 37 | response.send("200 OK") 38 | }) 39 | 40 | app.get("/parser/:user/:signature", async (request, response, next) => { 41 | try { 42 | let { user, signature, url } = getParams(request) 43 | 44 | try { 45 | const auth = new validator(user, url, signature) 46 | await 
auth.validate() 47 | } catch (error) { 48 | errorHandler(request, response, next, error, error.message) 49 | return 50 | } 51 | 52 | try { 53 | let result = await parser.parse(url) 54 | const code = "error" in result ? 400 : 200 55 | log(request) 56 | response.status(code).send(result) 57 | } catch (error) { 58 | errorHandler(request, response, next, error, "Cannot extract this URL.") 59 | return 60 | } 61 | } catch (error) { 62 | errorHandler(request, response, next, error, "Invalid request. Missing base64_url parameter.") 63 | return 64 | } 65 | }) 66 | 67 | module.exports = app 68 | -------------------------------------------------------------------------------- /app/server.js: -------------------------------------------------------------------------------- 1 | const app = require("./app") 2 | const serverPort = process.env.PORT || 3000 3 | const server = app.listen(serverPort, () => { 4 | console.log(`Extract started on port ${serverPort}`) 5 | }) 6 | 7 | process.on("SIGINT", () => { 8 | if (process.env.NODE_ENV === "production") { 9 | server.close(function (error) { 10 | console.error("SIGINT received, shutting down") 11 | if (error) { 12 | console.error(err) 13 | process.exit(1) 14 | } 15 | }) 16 | } else { 17 | process.exit(0) 18 | } 19 | }) 20 | -------------------------------------------------------------------------------- /app/validator.js: -------------------------------------------------------------------------------- 1 | const hmac = require("crypto-js/hmac-sha1") 2 | const fs = require("fs") 3 | const path = require("path") 4 | 5 | class Validator { 6 | constructor(user, data, signature) { 7 | this.user = user 8 | this.data = data 9 | this.signature = signature 10 | } 11 | 12 | async validate() { 13 | let key 14 | try { 15 | key = await this.key() 16 | } catch (e) { 17 | throw Error(`User does not exist: ${this.user}.`) 18 | } 19 | 20 | if (this.calculateSignature(key) !== this.signature) { 21 | throw Error(`Invalid signature.`) 22 | } 23 | } 24 | 25 
| async key() { 26 | return new Promise((resolve, reject) => { 27 | const filepath = path.normalize(path.join(__dirname, "..", "users", this.user)) 28 | fs.readFile(filepath, {encoding: "utf-8"}, (error, data) => { 29 | if (error) { 30 | reject(error) 31 | } else { 32 | resolve(data.trim()) 33 | } 34 | }) 35 | }) 36 | } 37 | 38 | calculateSignature(key) { 39 | return hmac(this.data, key).toString() 40 | } 41 | } 42 | 43 | module.exports = Validator -------------------------------------------------------------------------------- /ecosystem.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | apps : [{ 3 | name: "extract", 4 | script: process.env.PWD + "/app/server.js", 5 | instances: "max", 6 | cwd: process.env.PWD, 7 | env: { 8 | NODE_ENV: "development", 9 | }, 10 | env_production: { 11 | NODE_ENV: "production", 12 | } 13 | }] 14 | } 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "yarn jest" 8 | }, 9 | "keywords": [], 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "@postlight/parser": "^2.2.2", 14 | "crypto-js": "^4.1.1", 15 | "express": "^4.18.1", 16 | "moment-timezone": "0.5.37" 17 | }, 18 | "devDependencies": { 19 | "jest": "^29.1.1", 20 | "superagent": "^4.1.0", 21 | "supertest": "^4.0.2", 22 | "mkdirp": "1.0.4" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /script/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | sha=$(shasum package-lock.json) 6 | sha_file="deploy/package-lock.checksum" 7 | 8 | run () { 9 | echo "Installing dependencies..." 
10 | 11 | # Running twice fixes missing 12 | npm install && npm install 13 | 14 | echo "${sha}" > "${sha_file}" 15 | } 16 | 17 | run 18 | # if test -f "${sha_file}"; then 19 | # old_sha=$(cat $sha_file) 20 | # if [[ "${old_sha}" != "${sha}" ]]; then 21 | # run 22 | # fi 23 | # else 24 | # run 25 | # fi 26 | # 27 | --------------------------------------------------------------------------------