├── .editorconfig
├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .nvmrc
├── .prettierrc
├── LICENSE
├── README.md
├── __tests__
│   └── app.test.js
├── app
│   ├── app.js
│   ├── server.js
│   └── validator.js
├── ecosystem.config.js
├── package-lock.json
├── package.json
└── script
    └── bootstrap.sh
/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig is awesome: https://EditorConfig.org
2 |
3 | # top-most EditorConfig file
4 | root = true
5 |
6 | [*]
7 | end_of_line = lf
8 | insert_final_newline = true
9 | charset = utf-8
10 |
11 | [*.js]
12 | indent_style = space
13 | indent_size = 4
14 |
15 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # Runs the build (when a build script exists) and the test suite
2 | # on every push and pull request.
3 | name: CI
4 |
5 | on:
6 |   - push
7 |   - pull_request
8 |
9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |
13 |     strategy:
14 |       matrix:
15 |         node-version:
16 |           - 18.x
17 |
18 |     steps:
19 |       - uses: actions/checkout@v3
20 |       - name: Use Node.js ${{ matrix.node-version }}
21 |         uses: actions/setup-node@v3
22 |         with:
23 |           node-version: ${{ matrix.node-version }}
24 |           cache: 'npm'
25 |       - run: npm install
26 |       - run: npm run build --if-present
27 |       - run: npm test
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | users
2 | node_modules
3 | log
4 | package-lock.checksum
5 | vendor
--------------------------------------------------------------------------------
/.nvmrc:
--------------------------------------------------------------------------------
1 | 18
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "semi": true,
3 | "printWidth": 120,
4 | "trailingComma": "none",
5 | "tabWidth": 4
6 | }
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Feedbin, Inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Extract
2 | =======
3 |
4 | Extract just the content from a web page.
5 |
6 | Extract is a wrapper to turn the [Mercury Parser](https://github.com/postlight/parser) into a web service.
7 |
8 | Why?
9 | ----
10 |
11 | Mercury already offers an [API component](https://github.com/postlight/parser-api), meant to be deployed to AWS Lambda. There are a few reasons why this exists as an alternative.
12 |
13 | 1. Deploy elsewhere. Extract is a vanilla Node.js app that is meant to run in a VM and has no platform-specific dependencies.
14 |
15 | 2. Built-in authorization system.
16 |
17 | 3. Performance. In my experience, running it on a VM has been faster than the lambda version.
18 |
19 | Here's a graph where you can see a decrease in average response time around the `17. Feb` mark. This is when Feedbin switched from the lambda hosted version, to extract running on a VPS.
20 |
21 | 
22 |
23 | Installation
24 | ------------
25 |
26 | 1. Install [Node.js](https://nodejs.org/en/) and [npm](https://www.npmjs.com/).
27 |
28 | 2. Clone extract
29 |
30 | ```bash
31 | git clone https://github.com/feedbin/extract.git
32 | ```
33 |
34 | 3. Install the dependencies.
35 |
36 | ```bash
37 | cd extract
38 | npm install
39 | ```
40 |
41 | 4. Run the server
42 |
43 | ```bash
44 | node app/server.js
45 | ```
46 |
47 | Alternatively, extract includes an `ecosystem.config.js` to use with [pm2](https://github.com/Unitech/pm2). You could use this in production.
48 |
49 | ```bash
50 | npm install --global pm2
51 | pm2 start ecosystem.config.js
52 | ```
53 |
54 | Usage
55 | -----
56 |
57 | Extract has a simple, file-based system for creating users and secret keys. This allows users to be added/removed while the system is running. In the `./users` directory, the filename is the username and the file's contents are the secret key. To make a new user, run the following:
58 |
59 | ```
60 | cd extract
61 | mkdir users
62 |
63 | # use your own secret key and username
64 | echo "SECRET_KEY" > users/USERNAME
65 | ```
66 |
67 | Once a username and secret key have been created, you can make a request.
68 |
69 | An example request looks like:
70 |
71 | ```
72 | http://localhost:3000/parser/:username/:signature?base64_url=:base64_url
73 | ```
74 |
75 | The parts that you need are:
76 |
77 | - `username` your username
78 | - `signature` the hexadecimal HMAC-SHA1 signature of the URL you want to parse
79 | - `base64_url` base64 encoded version of the URL you want to parse
80 |
81 | The URL is base64-encoded to avoid any issues in the way different systems encode URLs. It must use the [RFC 4648](https://tools.ietf.org/html/rfc4648#section-5) url-safe variant with no newlines.
82 |
83 | If your platform does not offer a URL safe base64 option, you can replicate it. First create the base64 encoded string. Then replace the following characters:
84 |
85 | - `+` => `-`
86 | - `/` => `_`
87 | - `\n` => `""`
88 |
89 | Here's a sample implementation in ruby. You can use this as a reference for matching your implementation.
90 |
91 | ```ruby
92 | require "uri"
93 | require "openssl"
94 | require "base64"
95 |
96 | username = "username"
97 | secret = "secret"
98 | host = "localhost"
99 | port = 3000
100 | url = "https://feedbin.com/blog/2018/09/11/private-by-default/"
101 |
102 | digest = OpenSSL::Digest.new("sha1")
103 | signature = OpenSSL::HMAC.hexdigest(digest, secret, url)
104 |
105 | base64_url = Base64.urlsafe_encode64(url).gsub("\n", "")
106 |
107 | URI::HTTPS.build({
108 | host: host,
109 | port: port,
110 | path: "/parser/#{username}/#{signature}",
111 | query: "base64_url=#{base64_url}"
112 | }).to_s
113 | ```
114 |
115 | The above example would produce:
116 |
117 | ```
118 | https://localhost:3000/parser/username/e4696f8630bb68c21d77a9629ce8d063d8e5f81c?base64_url=aHR0cHM6Ly9mZWVkYmluLmNvbS9ibG9nLzIwMTgvMDkvMTEvcHJpdmF0ZS1ieS1kZWZhdWx0Lw==
119 | ```
120 |
121 | With the output:
122 |
123 | ```json
124 | {
125 | "title": "Private by Default",
126 | "author": null,
127 | "date_published": "2018-09-11T00:00:00.000Z",
128 | "dek": null,
129 | "lead_image_url": "https://assets.feedbin.com/assets-site/blog/2018-09-11/embed-3f43088538ae5ed7e585c00013adc13a915fd35de31990b3081a085b963ed7dd.png",
130 | "content": "<div>content</div>",
131 | "next_page_url": null,
132 | "url": "https://feedbin.com/blog/2018/09/11/private-by-default/",
133 | "domain": "feedbin.com",
134 | "excerpt": "September 11, 2018 by Ben Ubois I want Feedbin to be the opposite of Big Social. I think people should have the right not to be tracked on the Internet and Feedbin can help facilitate that. Since…",
135 | "word_count": 787,
136 | "direction": "ltr",
137 | "total_pages": 1,
138 | "rendered_pages": 1
139 | }
140 | ```
141 |
--------------------------------------------------------------------------------
/__tests__/app.test.js:
--------------------------------------------------------------------------------
1 | const request = require("supertest")
2 | const app = require("../app/app")
3 | const mkdirp = require('mkdirp')
4 | const fs = require('fs')
5 | const path = require("path")
6 | const hmac = require("crypto-js/hmac-sha1")
7 |
8 | describe("Test the health check", () => {
9 | test("It should respond to GET", (done) => {
10 | request(app).get("/health_check").expect(200).end(done)
11 | })
12 | })
13 |
14 | describe("Test the parser", () => {
15 |
16 | const user = "test"
17 | const password = "test"
18 | const url = "http://example.com"
19 | const signature = hmac(url, password).toString()
20 | const base64_url = new Buffer.from(url).toString("base64")
21 | const file = path.normalize(path.join(__dirname, "..", "users"))
22 | const writeFile = async (dir, path, content) => {
23 | await mkdirp(dir)
24 | fs.writeFileSync(path, content)
25 | }
26 | writeFile(file, path.join(file, user), password)
27 |
28 | test("It should respond to GET", (done) => {
29 | request(app).get(`/parser/${user}/${signature}?base64_url=${base64_url}`).expect(200).end(done)
30 | })
31 |
32 | test("It should fail with invalid user", (done) => {
33 | request(app).get(`/parser/invalid_user/${signature}?base64_url=${base64_url}`).expect(400, {
34 | error: true,
35 | messages: "User does not exist: invalid_user."
36 | }, done)
37 | })
38 |
39 | test("It should fail with invalid signature", (done) => {
40 | request(app).get(`/parser/${user}/invalid_signature?base64_url=${base64_url}`).expect(400, {
41 | error: true,
42 | messages: "Invalid signature."
43 | }, done)
44 | })
45 |
46 | test("It should fail with missing params", (done) => {
47 | request(app).get(`/parser/${user}/${signature}`).expect(400, {
48 | error: true,
49 | messages: "Invalid request. Missing base64_url parameter."
50 | }, done)
51 | })
52 |
53 | })
--------------------------------------------------------------------------------
/app/app.js:
--------------------------------------------------------------------------------
1 | const express = require("express")
2 | const app = express()
3 | const parser = require("@postlight/parser")
4 | const validator = require("./validator")
5 |
6 | function decodeURL(encodedURL) {
7 | return Buffer.from(encodedURL, "base64").toString("utf-8")
8 | }
9 |
10 | function getParams(request) {
11 | const user = request.params.user
12 | const signature = request.params.signature
13 | const base64url = request.query.base64_url.replace(/ /g, "+")
14 | const url = decodeURL(base64url)
15 | return { user, signature, url }
16 | }
17 |
18 | function log(request, extra) {
19 | let output = `[${request.ip}] - ${request.method} ${request.url}`
20 | if (extra) {
21 | output = `${output}: ${extra}`
22 | }
23 | console.log(output)
24 | }
25 |
26 | function errorHandler(request, response, next, error, message) {
27 | log(request, message)
28 | response.status(400).json({
29 | error: true,
30 | messages: message
31 | })
32 | next(error)
33 | }
34 |
35 | app.get("/health_check", (request, response) => {
36 | log(request)
37 | response.send("200 OK")
38 | })
39 |
40 | app.get("/parser/:user/:signature", async (request, response, next) => {
41 | try {
42 | let { user, signature, url } = getParams(request)
43 |
44 | try {
45 | const auth = new validator(user, url, signature)
46 | await auth.validate()
47 | } catch (error) {
48 | errorHandler(request, response, next, error, error.message)
49 | return
50 | }
51 |
52 | try {
53 | let result = await parser.parse(url)
54 | const code = "error" in result ? 400 : 200
55 | log(request)
56 | response.status(code).send(result)
57 | } catch (error) {
58 | errorHandler(request, response, next, error, "Cannot extract this URL.")
59 | return
60 | }
61 | } catch (error) {
62 | errorHandler(request, response, next, error, "Invalid request. Missing base64_url parameter.")
63 | return
64 | }
65 | })
66 |
67 | module.exports = app
68 |
--------------------------------------------------------------------------------
/app/server.js:
--------------------------------------------------------------------------------
1 | const app = require("./app")
2 | const serverPort = process.env.PORT || 3000
3 | const server = app.listen(serverPort, () => {
4 | console.log(`Extract started on port ${serverPort}`)
5 | })
6 |
7 | process.on("SIGINT", () => {
8 | if (process.env.NODE_ENV === "production") {
9 | server.close(function (error) {
10 | console.error("SIGINT received, shutting down")
11 | if (error) {
12 | console.error(err)
13 | process.exit(1)
14 | }
15 | })
16 | } else {
17 | process.exit(0)
18 | }
19 | })
20 |
--------------------------------------------------------------------------------
/app/validator.js:
--------------------------------------------------------------------------------
1 | const hmac = require("crypto-js/hmac-sha1")
2 | const fs = require("fs")
3 | const path = require("path")
4 |
5 | class Validator {
6 | constructor(user, data, signature) {
7 | this.user = user
8 | this.data = data
9 | this.signature = signature
10 | }
11 |
12 | async validate() {
13 | let key
14 | try {
15 | key = await this.key()
16 | } catch (e) {
17 | throw Error(`User does not exist: ${this.user}.`)
18 | }
19 |
20 | if (this.calculateSignature(key) !== this.signature) {
21 | throw Error(`Invalid signature.`)
22 | }
23 | }
24 |
25 | async key() {
26 | return new Promise((resolve, reject) => {
27 | const filepath = path.normalize(path.join(__dirname, "..", "users", this.user))
28 | fs.readFile(filepath, {encoding: "utf-8"}, (error, data) => {
29 | if (error) {
30 | reject(error)
31 | } else {
32 | resolve(data.trim())
33 | }
34 | })
35 | })
36 | }
37 |
38 | calculateSignature(key) {
39 | return hmac(this.data, key).toString()
40 | }
41 | }
42 |
43 | module.exports = Validator
--------------------------------------------------------------------------------
/ecosystem.config.js:
--------------------------------------------------------------------------------
1 | module.exports = { // pm2 process definition for the extract server (README: `pm2 start ecosystem.config.js`)
2 | apps : [{
3 | name: "extract",
4 | script: process.env.PWD + "/app/server.js", // path is built from the PWD environment variable, not from this file's location
5 | instances: "max", // NOTE(review): presumably one process per CPU core - confirm against pm2 docs
6 | cwd: process.env.PWD, // working directory also follows PWD, i.e. wherever pm2 was started from
7 | env: { // NODE_ENV for the default environment
8 | NODE_ENV: "development",
9 | },
10 | env_production: { // NOTE(review): presumably selected with pm2's `--env production` - confirm
11 | NODE_ENV: "production",
12 | }
13 | }]
14 | }
15 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "extract",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "jest"
8 | },
9 | "keywords": [],
10 | "author": "",
11 | "license": "MIT",
12 | "dependencies": {
13 | "@postlight/parser": "^2.2.2",
14 | "crypto-js": "^4.1.1",
15 | "express": "^4.18.1",
16 | "moment-timezone": "0.5.37"
17 | },
18 | "devDependencies": {
19 | "jest": "^29.1.1",
20 | "superagent": "^4.1.0",
21 | "supertest": "^4.0.2",
22 | "mkdirp": "1.0.4"
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/script/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Install the npm dependencies and record the checksum of package-lock.json
4 | # in deploy/package-lock.checksum.
5 |
6 | set -e
7 |
8 | sha=$(shasum package-lock.json)
9 | sha_file="deploy/package-lock.checksum"
10 |
11 | run () {
12 |     echo "Installing dependencies..."
13 |
14 |     # Running twice fixes missing
15 |     # (the original note is truncated; presumably packages that are still
16 |     # missing after the first install pass)
17 |     npm install && npm install
18 |
19 |     # Create the target directory if needed: under `set -e` a failed redirect
20 |     # would otherwise abort the script after a successful install.
21 |     mkdir -p "$(dirname "${sha_file}")"
22 |     echo "${sha}" > "${sha_file}"
23 | }
24 |
25 | run
26 | # Disabled: only reinstall when package-lock.json changed since the last run.
27 | # if test -f "${sha_file}"; then
28 | #   old_sha=$(cat $sha_file)
29 | #   if [[ "${old_sha}" != "${sha}" ]]; then
30 | #     run
31 | #   fi
32 | # else
33 | #   run
34 | # fi
35 | #
36 |
--------------------------------------------------------------------------------