├── .gitattributes ├── .gitignore ├── media ├── articles │ ├── the-onion-1.jpg │ ├── the-onion-2.jpg │ ├── the-onion-3.jpg │ └── wait-but-why-1.png ├── favicon.ico ├── icons │ ├── ai.svg │ ├── body.svg │ ├── code_review.svg │ ├── developer_activity.svg │ └── stripe.svg ├── logo.svg └── promo.jpg ├── package.json ├── readme.md ├── saasify.json ├── src ├── detail.ts ├── extract-article-content.js ├── extract.ts ├── summarize.ts └── types.ts └── yarn.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | fixtures/* linguist-vendored 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # next.js build output 79 | .next 80 | 81 | # nuxt.js build output 82 | .nuxt 83 | 84 | # gatsby files 85 | .cache/ 86 | public 87 | 88 | # vuepress build output 89 | .vuepress/dist 90 | 91 | # Serverless directories 92 | .serverless/ 93 | 94 | # FuseBox cache 95 | .fusebox/ 96 | 97 | # DynamoDB Local files 98 | .dynamodb/ 99 | 100 | # TernJS port file 101 | .tern-port 102 | 103 | # ZEIT now dev cache 104 | .now/ 105 | -------------------------------------------------------------------------------- /media/articles/the-onion-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/articles/the-onion-1.jpg -------------------------------------------------------------------------------- /media/articles/the-onion-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/articles/the-onion-2.jpg -------------------------------------------------------------------------------- /media/articles/the-onion-3.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/articles/the-onion-3.jpg -------------------------------------------------------------------------------- /media/articles/wait-but-why-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/articles/wait-but-why-1.png -------------------------------------------------------------------------------- /media/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/favicon.ico -------------------------------------------------------------------------------- /media/icons/ai.svg: -------------------------------------------------------------------------------- 1 | Artificial intelligence -------------------------------------------------------------------------------- /media/icons/body.svg: -------------------------------------------------------------------------------- 1 | static_page -------------------------------------------------------------------------------- /media/icons/code_review.svg: -------------------------------------------------------------------------------- 1 | code review -------------------------------------------------------------------------------- /media/icons/developer_activity.svg: -------------------------------------------------------------------------------- 1 | developer activity -------------------------------------------------------------------------------- /media/icons/stripe.svg: -------------------------------------------------------------------------------- 1 | stripe payments -------------------------------------------------------------------------------- /media/logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /media/promo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saasify-sh/synopsis/82ae8feb67a15d3bd3f66a971cc23fa36aa1118e/media/promo.jpg -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "synopsis", 3 | "private": true, 4 | "version": "0.1.0", 5 | "description": "Automagical AI-powered summarization for webpages and articles.", 6 | "author": "Saasify ", 7 | "license": "MIT", 8 | "repository": "https://github.com/saasify-sh/synopsis", 9 | "engines": { 10 | "node": ">=10" 11 | }, 12 | "dependencies": { 13 | "is-html": "^2.0.0", 14 | "lodash.pick": "^4.4.0", 15 | "scrapex": "^0.3.0", 16 | "text-summarization": "^1.0.2" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 |

6 | 7 | # synopsis 8 | 9 | > Automagical AI-powered summarization for webpages and articles. 10 | 11 | - Uses state-of-the-art AI to extract the core content of any webpage 12 | - Uses a variety of metrics to generate high-quality extractive text summaries 13 | - Summarizes HTML or text content 14 | - Uses HTML structure as a signal of text importance 15 | - Includes basic abstractive shortening of extracted sentences 16 | - Hosted by [Saasify](https://saasify.sh) 17 | - Thoroughly tested and used in production 18 | 19 | ## Examples 20 | 21 | The following examples all use [HTTPie](https://httpie.org/), a more intuitive alternative to `curl`. 22 | 23 | ### The Onion Example 1 24 | 25 | *Input:* ([article](https://www.theonion.com/fun-toy-banned-because-of-three-stupid-dead-kids-1819565691)) 26 | ```bash 27 | http POST \ 28 | 'https://ssfy.sh/dev/synopsis/summarize' \ 29 | 'url=https://www.theonion.com/fun-toy-banned-because-of-three-stupid-dead-kids-1819565691' 30 | ``` 31 | 32 | *Output:* 33 | ```json 34 | [ 35 | "Fun Toy Banned Because Of Three Stupid Dead Kids", 36 | "So now we have to do a full recall and halt production on what was a really awesome toy.", 37 | "But now I'll never see it again, all because three stupid idiots had to go and wreck everything.\"", 38 | "\"She thought the broken shards were candy.", 39 | "That's what you'd assume after breaking a plastic, inedible toy, right?", 40 | "\"I considered this for a while, but then I decided no. No way.", 41 | "If you're 11 years old, you should know that it's impossible to fly.", 42 | "And poor Wizco's probably going to go bankrupt because of this shit." 43 | ] 44 | ``` 45 | 46 | 47 | 48 | ### The Onion Example 2 49 | 50 | *Input:* ([article](https://local.theonion.com/plan-to-get-laid-at-dragoncon-2001-fails-1819566152)) 51 | ```bash 52 | http POST \ 53 | 'https://ssfy.sh/dev/synopsis/summarize' \ 54 | 'url=https://local.theonion.com/plan-to-get-laid-at-dragoncon-2001-fails-1819566152' 55 | ``` 56 | 57 | *Output:* 58 | ```json 59 | [ 60 | "Plan To Get Laid At DragonCon 2001 Fails", 61 | "\"I know a lot of girls online, but that's not really the same,\" Melcher said.", 62 | "\"I imagined some girl and I talking about the new Lord Of The Rings movie,\" Melcher said.", 63 | "\"I guess girls aren't into dragons and superheroes as much as guys are,\" Melcher said.", 64 | "\"Andy and I went to this Sailor Moon thing because we knew girls would be there,\" Melcher said.", 65 | "\"Make no mistake—we do not like Sailor Moon.", 66 | "The women, however, were only interested in talking about Sailor Moon.", 67 | "\"This one girl asked me if I wrote fan fiction, and I said yes,\" Melcher said.", 68 | "The following night, Melcher attended a party he had heard about in an online chat room." 69 | ] 70 | ``` 71 | 72 | 73 | 74 | ### The Onion Example 3 75 | 76 | *Input:* ([article](https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897)) 77 | ```bash 78 | http POST \ 79 | 'https://ssfy.sh/dev/synopsis/summarize' \ 80 | 'url=https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897' 81 | ``` 82 | 83 | *Output:* 84 | ```json 85 | [ 86 | "Everyone Involved In Pizza's Preparation, Delivery, Buy Extremely High", 87 | "After taking the order, Lindeman relayed it to co-worker and fellow stoner Greg Kanner.", 88 | "At 1 a.m. 
Monday, the pizza came into material being for the first time.", 89 | "\"After all, it's just pizza, right?", 90 | "Also, Bickell and Wang had forgotten to include their apartment number with the order.", 91 | "Fuck!\" Behr later described the prolonged Blount Street search as \"a serious fucking hassle.\"", 92 | "\"They were seriously bitching me out,\" said Lindeman, who was royally baked at the time.", 93 | "\"I was like, 'Dude, just chill, your pizza will be there any sec.'\"", 94 | "Finally, at 3:10 a.m., more than three hours after the order was placed, the pizza reached its destination." 95 | ] 96 | ``` 97 | 98 | 99 | 100 | ### Wait But Why Example 101 | 102 | *Input:* ([article](https://waitbutwhy.com/2015/01/artificial-intelligence-revolution-1.html)) 103 | ```bash 104 | http POST \ 105 | 'https://ssfy.sh/dev/synopsis/summarize' \ 106 | 'url=https://waitbutwhy.com/2015/01/artificial-intelligence-revolution-1.html' 107 | ``` 108 | 109 | *Output:* 110 | ```json 111 | [ 112 | "The AI Revolution: The Road to Superintelligence", 113 | "The Far Future—Coming Soon", 114 | "The Road to Superintelligence", 115 | "What Is AI?", 116 | "Where We Are Now—A World Running on ANI", 117 | "The Road From ANI to AGI", 118 | "Plagiarize the brain.", 119 | "Try to make evolution do what it did before but for us this time.", 120 | "Make this whole thing the computer’s problem, not ours.", 121 | "All This Could Happen Soon" 122 | ] 123 | ``` 124 | 125 | 126 | 127 | ## Metrics 128 | 129 | Replace `/summarize` with `/detail` to see the full metrics for how the input was processed, which gives you a better understanding of why the algorithm deemed certain sentences more important. The score for each sentence combines several signals (see the example call below): 130 | 131 | - tfidf overlap for base relative sentence importance 132 | - html node boosts for semantically important tags such as headings 133 | - listicle boosts for lists like `2) second item` 134 | - penalty for poor readability or really long sentences 135 | 
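For example, to see the detailed scoring for the pizza article from The Onion Example 3 above, call the `/detail` endpoint with the same payload (the URL simply swaps `summarize` for `detail` relative to the examples above):

```bash
http POST \
  'https://ssfy.sh/dev/synopsis/detail' \
  'url=https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897'
```

The response includes the kind of per-sentence scoring structure shown in the example below.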

136 | Here's an example of a sentence's internal structure after normalization, processing, and scoring: 137 | 138 | ```js 139 | { 140 | "index": 8, 141 | "sentence": { 142 | "original": "4. For the cost of 1 highly produced video, you can get a year's worth of videos from Automagical.", 143 | "listItem": 4, 144 | "actual": "For the cost of 1 highly produced video, you can get a year's worth of videos from Automagical.", 145 | "normalized": "for the cost of 1 highly produced video you can get a years worth of videos from automagical", 146 | "tokenized": [ 147 | "cost", 148 | "highly", 149 | "produced", 150 | "video", 151 | "years", 152 | "worth", 153 | "videos", 154 | "automagical" 155 | ] 156 | }, 157 | "liScore": 1, 158 | "nodeScore": 0.7, 159 | "readabilityPenalty": 0, 160 | "tfidfScore": 0.8019447657605553, 161 | "score": 5.601944765760555 162 | } 163 | ``` 164 | 165 | Support my OSS work by following me on Twitter. 166 | -------------------------------------------------------------------------------- /saasify.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "synopsis", 3 | "description": "AI-powered content extraction and summarization for webpages and articles.", 4 | "alias": "synopsis", 5 | "services": [ 6 | { 7 | "src": "./src/summarize.ts", 8 | "examples": [ 9 | { 10 | "name": "The Onion 1", 11 | "description": "Summarizes an Onion article", 12 | "input": { 13 | "url": "https://www.theonion.com/fun-toy-banned-because-of-three-stupid-dead-kids-1819565691" 14 | } 15 | }, 16 | { 17 | "name": "The Onion 2", 18 | "description": "Summarizes an Onion article", 19 | "input": { 20 | "url": "https://local.theonion.com/plan-to-get-laid-at-dragoncon-2001-fails-1819566152" 21 | } 22 | }, 23 | { 24 | "name": "The Onion 3", 25 | "description": "Summarizes an Onion article", 26 | "input": { 27 | "url": "https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897" 28 | } 29 | }, 30 | { 31 | "name": "Wait But Why", 32 | "description": "Summarizes a blog post", 33 | "input": { 34 | "url": "https://waitbutwhy.com/2015/01/artificial-intelligence-revolution-1.html" 35 | } 36 | } 37 | ] 38 | }, 39 | { 40 | "src": "./src/detail.ts", 41 | "examples": [ 42 | { 43 | "name": "The Onion", 44 | "input": { 45 | "url": "https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897" 46 | } 47 | } 48 | ] 49 | }, 50 | { 51 | "src": "./src/extract.ts", 52 | "examples": [ 53 | { 54 | "name": "The Onion", 55 | "input": { 56 | "url": "https://www.theonion.com/everyone-involved-in-pizzas-preparation-delivery-purc-1819564897" 57 | } 58 | } 59 | ] 60 | } 61 | ], 62 | "amountPerBase": 1000, 63 | "amountPerRequest": 2, 64 | "amountPerCompute": 0, 65 | "amountPerBandwidth": 0, 66 | "noAuthRateLimit": { 67 | "requests": true, 68 | "requestsInterval": 3600, 69 | "requestsMaxPerInterval": 10 70 | }, 71 | "saas": { 72 | "name": "synopsis", 73 | "heading": "**Summarize articles automagically.**", 74 | "subheading": "AI-powered content extraction and summarization for webpages and articles.", 75 | "repo": "https://github.com/saasify-sh/synopsis", 76 | "logo": "./media/logo.svg", 77 | "favicon": "./media/favicon.ico", 78 | "features": [ 79 | { 80 | "name": "AI Web Extraction", 81 | "desc": "Our AI-powered engine extracts the main content from any website with ease.", 82 | "icon": 
"media/icons/ai.svg" 83 | }, 84 | { 85 | "name": "Summarization", 86 | "desc": "Easily summarize the main content of any website via URL, raw text, or HTML.", 87 | "icon": "media/icons/body.svg" 88 | }, 89 | { 90 | "name": "Get Going Fast", 91 | "desc": "Try out the REST API for FREE and then sign up with your GitHub account and Stripe.", 92 | "icon": "media/icons/developer_activity.svg" 93 | }, 94 | { 95 | "name": "Cost Effective", 96 | "desc": "Building on top of serverless functions means you'll only pay for the API calls you actually use.", 97 | "icon": "media/icons/stripe.svg" 98 | }, 99 | { 100 | "name": "Open Source", 101 | "desc": "Check out the source on [GitHub](https://github.com/saasify-sh/synopsis)!", 102 | "icon": "media/icons/code_review.svg" 103 | } 104 | ], 105 | "theme": { 106 | "name": "waves", 107 | "backgroundImage": "https://images.unsplash.com/photo-1495020689067-958852a7765e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=3450&q=80", 108 | "buttonStyle": "rounded", 109 | "color": "#2097e2", 110 | "wave": false, 111 | "gradientDark": true, 112 | "codeBlockDark": true 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/detail.ts: -------------------------------------------------------------------------------- 1 | import summarizeImpl = require('text-summarization') 2 | import isHtml = require('is-html') 3 | import * as types from './types' 4 | 5 | const extractArticleContent = require('./extract-article-content') 6 | 7 | /** 8 | * Summarizes the content of any `url` or `input` text. 9 | * 10 | * Must provide either `url` or `input`. 11 | * 12 | * Returns a more verbose description of the summary and intermediary scoring 13 | * for all input content. 14 | * 15 | * @param url - Link to website to summarize. 16 | * @param input - Text or HTML to summarize. 17 | * @param title - Title of `input` content. 18 | * @param numSentences - Optional number of sentences to produce. Default is to 19 | * infer a reasonable number based on the input's length. 20 | * @param minNumSentences - Optional minimum number of sentences to produce. 21 | * @param maxNumSentences - Optional maximum number of sentences to produce. 22 | * @param minImageWidth - Optional minimum image width when considering images in HTML. 23 | * @param minImageHeight - Optional minimum image height when considering images in HTML. 24 | * @param media - Whether or not to consider source media during summarization. 
25 | */ 26 | export default async function detail( 27 | url?: string, 28 | input?: string, 29 | title?: string, 30 | numSentences?: number, 31 | minNumSentences: number = 1, 32 | maxNumSentences: number = 1000, 33 | minImageWidth: number = 400, 34 | minImageHeight: number = 300, 35 | media: boolean = false 36 | ): Promise<types.SummarizationResult> { 37 | const opts: types.SummarizationOptions = { 38 | title, 39 | minNumSentences, 40 | maxNumSentences, 41 | minImageWidth, 42 | minImageHeight, 43 | media, 44 | detailedAll: true 45 | } 46 | 47 | if (url) { 48 | const article = await extractArticleContent(url) 49 | opts.title = article.title 50 | opts.html = article.html 51 | opts.text = article.text 52 | } else if (input) { 53 | const isInputHtml = isHtml(input) 54 | if (isInputHtml) { 55 | opts.html = input 56 | } else { 57 | opts.text = input 58 | } 59 | } else { 60 | throw new Error('must provide either "url" or "input" to process') 61 | } 62 | 63 | const result = await (summarizeImpl(opts) as Promise<types.SummarizationResult>) 64 | // console.log(JSON.stringify(result, null, 2)) 65 | 66 | return result 67 | } 68 | -------------------------------------------------------------------------------- /src/extract-article-content.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const pick = require('lodash.pick') 4 | const { scrape } = require('scrapex') 5 | 6 | module.exports = async function scrapex(url) { 7 | const result = pick(await scrape(url), [ 8 | 'description', 9 | 'title', 10 | 'author', 11 | 'text' 12 | ]) 13 | 14 | return result 15 | } 16 | -------------------------------------------------------------------------------- /src/extract.ts: -------------------------------------------------------------------------------- 1 | const extractArticleContent = require('./extract-article-content') 2 | 3 | /** 4 | * Extracts the main article content from a webpage or article, in addition to lots of useful metadata. 5 | * 6 | * @param url - Link to website to process. 7 | */ 8 | export default async function extract(url: string): Promise<any> { // article shape comes from extract-article-content 9 | const article = await extractArticleContent(url) 10 | 11 | return article 12 | } 13 | -------------------------------------------------------------------------------- /src/summarize.ts: -------------------------------------------------------------------------------- 1 | import summarizeImpl = require('text-summarization') 2 | import isHtml = require('is-html') 3 | 4 | import * as types from './types' 5 | 6 | const extractArticleContent = require('./extract-article-content') 7 | 8 | /** 9 | * Summarizes the content of any `url` or `input` text. 10 | * 11 | * Must provide either `url` or `input`. 12 | * 13 | * Returns the summary as an array of strings / sentences. 14 | * 15 | * @param url - Link to website to summarize. 16 | * @param input - Text or HTML to summarize. 17 | * @param title - Title of `input` content. 18 | * @param numSentences - Optional number of sentences to produce. Default is to 19 | * infer a reasonable number based on the input's length. 20 | * @param minNumSentences - Optional minimum number of sentences to produce. 21 | * @param maxNumSentences - Optional maximum number of sentences to produce. 22 | * @param minImageWidth - Optional minimum image width when considering images in HTML. 23 | * @param minImageHeight - Optional minimum image height when considering images in HTML. 24 | * @param media - Whether or not to consider source media during summarization. 
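 *
 * @example
 * // Illustrative usage sketch: assumes the default export below is imported as
 * // `summarize`; the URL comes from the readme examples.
 * const sentences = await summarize('https://www.theonion.com/fun-toy-banned-because-of-three-stupid-dead-kids-1819565691')
 * console.log(sentences.join('\n'))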
25 | */ 26 | export default async function summarize( 27 | url?: string, 28 | input?: string, 29 | title?: string, 30 | numSentences?: number, 31 | minNumSentences: number = 1, 32 | maxNumSentences: number = 1000, 33 | minImageWidth: number = 400, 34 | minImageHeight: number = 300, 35 | media: boolean = false 36 | ): Promise<string[]> { 37 | const opts: types.SummarizationOptions = { 38 | title, 39 | minNumSentences, 40 | maxNumSentences, 41 | minImageWidth, 42 | minImageHeight, 43 | media 44 | } 45 | 46 | if (url) { 47 | const article = await extractArticleContent(url) 48 | opts.title = article.title 49 | opts.html = article.html 50 | opts.text = article.text 51 | } else if (input) { 52 | const isInputHtml = isHtml(input) 53 | if (isInputHtml) { 54 | opts.html = input 55 | } else { 56 | opts.text = input 57 | } 58 | } else { 59 | throw new Error('must provide either "url" or "input" to process') 60 | } 61 | 62 | const result = await (summarizeImpl(opts) as Promise<types.SummarizationResult>) 63 | // console.log(JSON.stringify(result, null, 2)) 64 | 65 | return result.abstractive || result.extractive 66 | } 67 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | export interface SummarizationSentence { 2 | original: string 3 | listItem: number 4 | actual: string 5 | normalized: string 6 | tokenized: string[] 7 | } 8 | 9 | export interface SummarizationItem { 10 | index: number 11 | sentence: SummarizationSentence 12 | liScore: number 13 | nodeScore: number 14 | readabilityScore: number 15 | attributionScore: number 16 | tfidfScore: number 17 | score: number 18 | } 19 | 20 | export interface SummarizationOptions { 21 | html?: string 22 | text?: string 23 | title?: string 24 | minNumSentences?: number 25 | maxNumSentences?: number 26 | minImageWidth?: number 27 | minImageHeight?: number 28 | media?: boolean 29 | detailedAll?: boolean 30 | } 31 | 32 | export interface SummarizationResult { 33 | title: string 34 | 35 | extractive: string[] 36 | abstractive?: string[] 37 | 38 | topItems?: SummarizationItem[] 39 | items?: SummarizationItem[] 40 | } 41 | --------------------------------------------------------------------------------