├── .gitignore ├── LICENSE ├── README.md ├── background.js ├── contentScript.js ├── icons ├── icon128.png ├── icon16.png └── icon48.png ├── lib ├── turndown-plugin-gfm.js └── turndown.umd.js ├── manifest.json ├── popup.html └── popup.js /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .edge-debug-profile/ 3 | test.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Suyuchen Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArXiv Markdown Parser - Chrome / Edge Extension 2 | 3 | ![Chrome Web Store Version](https://img.shields.io/chrome-web-store/v/pgklmbjeooblkfcgbibhkjpbbhoabbbo) ![GitHub License](https://img.shields.io/github/license/sheryc/arxiv-markdown-parser-plugin) ![GitHub Repo stars](https://img.shields.io/github/stars/sheryc/arxiv-markdown-parser-plugin) < Please leave a 🌟 if you find this plugin useful :D 4 | 5 | **🔥 Update: The plugin is now live on Chrome Web Store: [Chrome Web Store Link](https://chromewebstore.google.com/detail/arxiv-markdown-parser/pgklmbjeooblkfcgbibhkjpbbhoabbbo)** 6 | 7 | **Turn ArXiv Papers into Markdown with One Click** 8 | 9 | Are you tired of wrestling with PDFs when you need to analyze or excerpt research papers? In the age of LLMs, having clean, accessible text is more important than ever. The ArXiv Markdown Parser Chrome extension is built to streamline your research workflow by converting arXiv papers into clean, readable Markdown with a single click. 10 | 11 | > Note: Currently the extension only supports papers with an HTML version. Most of the new papers have an HTML version but some are not. [It's a beta feature of ArXiv](https://info.arxiv.org/about/accessible_HTML.html) and more papers will be supported in the future. 12 | 13 | ## Table of Contents 14 | 15 | - [ArXiv Markdown Parser - Chrome / Edge Extension](#arxiv-markdown-parser---chrome--edge-extension) 16 | - [Table of Contents](#table-of-contents) 17 | - [Overview](#overview) 18 | - [Why Markdown Matters](#why-markdown-matters) 19 | - [Features](#features) 20 | - [Installation](#installation) 21 | - [Prerequisites](#prerequisites) 22 | - [Steps](#steps) 23 | - [Usage](#usage) 24 | - [License](#license) 25 | - [Acknowledgments](#acknowledgments) 26 | 27 | ## Overview 28 | 29 | The ArXiv Markdown Parser is designed to simplify the process of extracting content from arXiv papers. Whether you're conducting literature reviews, performing detailed analyses, or integrating research into LLM workflows, this extension helps you bypass the cumbersome process of PDF extraction. Instead, it converts the entire paper—including equations, tables, figures, and internal references—into well-structured Markdown, making it easier to read, share, and annotate. 30 | 31 | ### Why Markdown Matters 32 | 33 | - **LLM Integration:** Markdown text is much easier to feed into LLMs like ChatGPT. By bypassing the messy PDF extraction process, you ensure that models receive clean, structured input—ideal for summarization, translation, or analysis. 34 | - **Fast Table & Equation Copying:** Research papers often contain complex tables and equations. Converting to Markdown allows you to quickly copy and paste these elements into your notes or LLM prompts without formatting issues. 35 | - **Seamless Collaboration:** Markdown is one of the most popular formats for academic and technical documentation. Its compatibility with version control systems (like Git) makes it perfect for group research settings, collaborative wikis, or shared repositories. 36 | 37 | ## Features 38 | 39 | - **One-Click Conversion:** Simply open any arXiv paper (abs, pdf, or html) and click the extension icon to instantly convert the content into Markdown. 40 | - **Customizable Output:** Choose whether to include a table of contents and references in your Markdown output. 41 | - **Enhanced Research Workflow:** Quickly extract and organize key components of research papers for rapid summarization, annotation, or further analysis. 42 | - **Improved Equation Handling:** Easily obtain LaTeX or Markdown versions of equations for use in your notes, presentations, or LLM prompts. 43 | 44 | ## Installation 45 | 46 | ### Through Chrome Web Store 47 | 48 | The extension is live on [Chrome Web Store](https://chromewebstore.google.com/detail/arxiv-markdown-parser/pgklmbjeooblkfcgbibhkjpbbhoabbbo). I recommend installing from here if you're using Chrome or Edge. 49 | 50 | ### Manual Installation 51 | 52 | #### Prerequisites 53 | 54 | - [Google Chrome](https://www.google.com/chrome/) or any Chromium-based browser. 55 | 56 | #### Steps 57 | 58 | 1. **Clone the Repository:** 59 | 60 | ```bash 61 | git clone https://github.com/sheryc/arxiv-markdown-parser-plugin.git 62 | ``` 63 | 64 | or alternatively, download the zip or tarball file in the latest release and unzip it. 65 | 66 | 2. **Load the Extension in Chrome:** 67 | 68 | - Open Chrome and navigate to `chrome://extensions/`. 69 | - Enable **Developer Mode** (toggle in the top-right corner). 70 | - Click **Load unpacked** and select the directory where you cloned the repository or unzipped the zipfile / tarball. 71 | 72 | 3. **Installation Complete:** 73 | The extension should now appear in your Chrome toolbar. You are ready to convert arXiv papers into Markdown with a single click. 74 | 75 | ## Usage 76 | 77 | 1. **Open an arXiv Paper:** 78 | Navigate to any arXiv paper page (abstract, PDF, or HTML view). 79 | 80 | 2. **Activate the Extension:** 81 | Click on the ArXiv Markdown Parser icon in your browser toolbar. 82 | 83 | 3. **Configure Output Options:** 84 | Choose whether you want to include a table of contents and references in the generated Markdown. 85 | 86 | 4. **Get Your Markdown:** 87 | The extension will instantly convert the paper into Markdown format, including all equations, tables, figures, and internal references as links. 88 | 89 | ## License 90 | 91 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 92 | 93 | ## Acknowledgments 94 | 95 | - **LaTeXML** and **Turndown** for powering the conversion process. 96 | 97 | *Happy Researching!* (=・ω・=) 98 | -------------------------------------------------------------------------------- /background.js: -------------------------------------------------------------------------------- 1 | chrome.runtime.onInstalled.addListener(() => { 2 | console.log("ArXiv Markdown Parser extension installed."); 3 | }); 4 | -------------------------------------------------------------------------------- /contentScript.js: -------------------------------------------------------------------------------- 1 | console.log("[DEBUG] contentScript.js loaded!"); 2 | 3 | async function parseArxiv(arxivId, removeRefs = false, removeTable = false) { 4 | const url = `https://arxiv.org/html/${arxivId}`; 5 | const response = await fetch(url); 6 | if (!response.ok) { 7 | throw new Error(`Failed to fetch arxiv HTML for ${arxivId}. Status: ${response.status}`); 8 | } 9 | const htmlText = await response.text(); 10 | 11 | // parse into DOM 12 | const parser = new DOMParser(); 13 | const doc = parser.parseFromString(htmlText, "text/html"); 14 | 15 | // replace with inline $...$ 16 | convertAllMathMLtoLatex(doc); 17 | 18 | // fix tabular tables 19 | fixTabularTables(doc); 20 | 21 | // setup Turndown with GFM plugin 22 | const turndownService = new TurndownService({ 23 | headingStyle: "atx", 24 | codeBlockStyle: "fenced", 25 | fence: "```", 26 | bulletListMarker: "-", 27 | emDelimiter: "*", 28 | strongDelimiter: "**" 29 | }); 30 | turndownService.use(turndownPluginGfm.gfm); 31 | 32 | turndownService.addRule('mathContentTables', { 33 | filter: function (node) { 34 | return ( 35 | node.nodeName === 'TABLE' && 36 | node.innerHTML.includes('$') && 37 | !/ltx_equationgroup|ltx_eqn_align|ltx_eqn_table/.test(node.className || '') 38 | ) 39 | }, 40 | replacement: function (content, node) { 41 | const rows = Array.from(node.rows); 42 | let markdown = ''; 43 | 44 | rows.forEach((row, rowIndex) => { 45 | const cells = Array.from(row.cells); 46 | 47 | // Preserve cell content including math formatting 48 | markdown += '| ' + cells.map(cell => { 49 | return cell.textContent.trim(); 50 | }).join(' | ') + ' |\n'; 51 | 52 | // Add separator row after first row 53 | if (rowIndex === 0) { 54 | markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n'; 55 | } 56 | }); 57 | 58 | return '\n\n' + markdown + '\n\n'; 59 | } 60 | }); 61 | 62 | // turn LaTeX equation tables into $$ block equations $$ 63 | turndownService.addRule("latexEquationTables", { 64 | filter: function (node) { 65 | if (node.nodeName === "TABLE") { 66 | const cls = node.getAttribute("class") || ""; 67 | // If it has ltx_tabular, we do NOT treat it as an equation 68 | if (/\bltx_tabular\b/.test(cls)) return false; 69 | // If it has ltx_equationgroup, ltx_eqn_align, or ltx_eqn_table => treat as block equation 70 | return /ltx_equationgroup|ltx_eqn_align|ltx_eqn_table/.test(cls); 71 | } 72 | return false; 73 | }, 74 | replacement: function (content, node) { 75 | // We'll use the node's textContent as the equation text 76 | let eqnText = node.textContent.trim(); 77 | eqnText = eqnText.replace(/\s+/g, " "); 78 | eqnText = eqnText.replace(/^\$/, ""); 79 | eqnText = eqnText.replace(/\(\d+\)$/, ""); 80 | eqnText = eqnText.replace(/\$$/, ""); 81 | return `$$ ${eqnText} $$`; 82 | }, 83 | }); 84 | 85 | // convert the DOM to md 86 | let markdown = turndownService.turndown(doc.documentElement.innerHTML); 87 | 88 | // reformat paragraphs vs. table lines 89 | markdown = removeLineBreaksOutsideTables(markdown); 90 | 91 | // unescape double backslashes for correct LaTeX 92 | markdown = unescapeDoubleBackslashes(markdown); 93 | 94 | // Fix any remaining escaped underscores in LaTeX 95 | markdown = fixLatexUnderscores(markdown); 96 | 97 | if (removeRefs) { 98 | markdown = removeReferences(markdown); 99 | } else { 100 | markdown = preserveReferencesLineBreaks(markdown); 101 | } 102 | 103 | if (removeTable) { 104 | markdown = removeContentTable(markdown); 105 | } else { 106 | markdown = reformatTableOfContents(markdown); 107 | } 108 | 109 | return markdown; 110 | } 111 | 112 | function removeAllAttributes(elem) { 113 | // Repeatedly remove the first attribute until none remain 114 | while (elem.attributes && elem.attributes.length > 0) { 115 | elem.removeAttribute(elem.attributes[0].name); 116 | } 117 | } 118 | 119 | function reformatTableOfContents(markdown) { 120 | const paragraphs = markdown.split("\n\n"); 121 | 122 | if (paragraphs.length > 1) { 123 | const tocParagraph = paragraphs[1]; 124 | if (tocParagraph.includes("http") && (tocParagraph.includes("[1") || tocParagraph.includes("[2"))) { 125 | const linkPattern = /(\[[^\]]+\]\([^)]+\))/g; 126 | const links = tocParagraph.match(linkPattern) || []; 127 | 128 | if (links.length > 0) { 129 | const formattedLinks = []; 130 | 131 | for (const link of links) { 132 | const sectionMatch = link.match(/\[(\d+(?:\.\d+)*)\s+([^\]]+)\]/); 133 | 134 | if (sectionMatch) { 135 | const sectionNumber = sectionMatch[1]; // e.g., "1", "2.1" 136 | const parts = sectionNumber.split('.'); 137 | const level = parts.length; 138 | 139 | const indent = ' '.repeat(level - 1); 140 | formattedLinks.push(indent + link); 141 | } else { 142 | formattedLinks.push(link); 143 | } 144 | } 145 | paragraphs[1] = formattedLinks.join('\n'); 146 | } 147 | } 148 | } 149 | 150 | return paragraphs.join("\n\n"); 151 | } 152 | 153 | function preserveReferencesLineBreaks(markdown) { 154 | const refMarkers = ["References ----------", "## References", "### References", "#### References", "###### References"]; 155 | 156 | let refSection = null; 157 | for (const marker of refMarkers) { 158 | const markerIndex = markdown.indexOf(marker); 159 | if (markerIndex !== -1) { 160 | let endIndex = markdown.length; 161 | const nextHeadingMatch = markdown.slice(startIndex).match(/\n\s*#(?!#)/); 162 | if (nextHeadingMatch) { 163 | endIndex = startIndex + nextHeadingMatch.index; 164 | } 165 | 166 | refSection = { 167 | before: markdown.substring(0, markerIndex), 168 | marker: marker, 169 | content: markdown.substring(startIndex, endIndex), 170 | after: markdown.substring(endIndex) 171 | }; 172 | break; 173 | } 174 | } 175 | 176 | if (!refSection) { 177 | return markdown; // No references found 178 | } 179 | 180 | let formattedRefs = refSection.content; 181 | formattedRefs = formattedRefs.replace(/(?!^)\s*-\s+/g, '\n\n- '); 182 | return refSection.before + refSection.marker + formattedRefs + refSection.after; 183 | } 184 | 185 | 186 | function fixTabularTables(root) { 187 | // Select all elements that have class="ltx_tabular" 188 | const tables = root.querySelectorAll("table.ltx_tabular"); 189 | tables.forEach((table) => { 190 | removeAllAttributes(table); 191 | 192 | table.querySelectorAll("tbody, thead, tfoot, tr, td, th").forEach((el) => { 193 | removeAllAttributes(el); 194 | }); 195 | }); 196 | } 197 | 198 | function fixLatexUnderscores(markdown) { 199 | // Fix any remaining escaped underscores in LaTeX expressions 200 | return markdown.replace(/\$([^$]*?)\$/g, function(match, latex) { 201 | // Replace \_ with _ and \^ with ^ 202 | return '$' + latex.replace(/\\_/g, '_').replace(/\\\^/g, '^') + '$'; 203 | }); 204 | } 205 | 206 | function convertAllMathMLtoLatex(root) { 207 | const mathElements = root.querySelectorAll("math"); 208 | mathElements.forEach((math) => { 209 | const annotation = math.querySelector('annotation[encoding="application/x-tex"]'); 210 | if (annotation && annotation.textContent) { 211 | let latexSource = annotation.textContent.trim(); 212 | latexSource = latexSource.replace(/(? element with inline LaTeX delimited by $ signs. 217 | math.replaceWith(`$${latexSource}$`); 218 | console.log(math) 219 | } else { 220 | console.log("No annotation found"); 221 | math.replaceWith(math.textContent); 222 | } 223 | }); 224 | } 225 | 226 | 227 | function removeReferences(markdown) { 228 | const refMarkers = ["References ----------", "###### References"]; 229 | const paragraphs = markdown.split("\n\n"); 230 | 231 | const refIndex = paragraphs.findIndex(para => 232 | refMarkers.some(marker => para.includes(marker)) 233 | ); 234 | 235 | if (refIndex === -1) { 236 | return markdown; 237 | } 238 | 239 | paragraphs.splice(refIndex, 1); 240 | 241 | if (refIndex < paragraphs.length) { 242 | paragraphs.splice(refIndex, 1); 243 | } 244 | 245 | return paragraphs.join("\n\n"); 246 | } 247 | 248 | function removeLineBreaksOutsideTables(markdown) { 249 | let blocks = markdown.split(/\n\s*\n/); 250 | 251 | const processedBlocks = blocks.map((block) => { 252 | const lines = block.split("\n"); 253 | const firstNonBlank = lines.find((l) => l.trim().length > 0); 254 | if (firstNonBlank && firstNonBlank.trim().startsWith("|")) { 255 | return lines.join("\n"); 256 | } else { 257 | return lines.map((l) => l.trim()).join(" "); 258 | } 259 | }); 260 | 261 | return processedBlocks.join("\n\n"); 262 | } 263 | 264 | function unescapeDoubleBackslashes(text) { 265 | let old; 266 | do { 267 | old = text; 268 | text = text.replace(/\\\\/g, "\\"); 269 | } while (text !== old); 270 | return text; 271 | } 272 | 273 | function removeContentTable(markdown) { 274 | const paragraphs = markdown.split("\n\n"); 275 | if (paragraphs.length > 2) { 276 | // Remove the first 2 paragraphs (title + content table) 277 | paragraphs.splice(0, 2); 278 | } 279 | return paragraphs.join("\n\n"); 280 | } 281 | 282 | function removeLastTwoLines(markdown) { 283 | const lines = markdown.split("\n"); 284 | if (lines.length > 2) { 285 | lines.splice(-2, 2); 286 | } 287 | return lines.join("\n"); 288 | } 289 | 290 | (function () { 291 | const arxivId = extractArxivId(window.location.href); 292 | 293 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 294 | if (request.action === "getMarkdown") { 295 | if (!arxivId) { 296 | sendResponse({ 297 | success: false, 298 | markdown: "", 299 | error: "No arXiv ID detected on this page.", 300 | }); 301 | return true; 302 | } 303 | 304 | parseArxiv(arxivId, request.removeRefs) 305 | .then((md) => { 306 | let finalMD = md; 307 | finalMD = removeLastTwoLines(finalMD); 308 | if (request.removeTable) { 309 | finalMD = removeContentTable(finalMD); 310 | } 311 | sendResponse({ success: true, markdown: finalMD }); 312 | }) 313 | .catch((err) => { 314 | sendResponse({ success: false, markdown: "", error: err.toString() }); 315 | }); 316 | 317 | return true; // async 318 | } 319 | }); 320 | 321 | function extractArxivId(url) { 322 | const match = url.match(/arxiv\.org\/(abs|pdf|html)\/([^?#]+)/); 323 | if (!match) return null; 324 | let id = match[2]; 325 | id = id.replace(/\.pdf$/, ""); 326 | id = id.replace(/v\d+$/, ""); 327 | return id; 328 | } 329 | })(); 330 | -------------------------------------------------------------------------------- /icons/icon128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon128.png -------------------------------------------------------------------------------- /icons/icon16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon16.png -------------------------------------------------------------------------------- /icons/icon48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon48.png -------------------------------------------------------------------------------- /lib/turndown-plugin-gfm.js: -------------------------------------------------------------------------------- 1 | var turndownPluginGfm = (function (exports) { 2 | 'use strict'; 3 | 4 | var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/; 5 | 6 | function highlightedCodeBlock (turndownService) { 7 | turndownService.addRule('highlightedCodeBlock', { 8 | filter: function (node) { 9 | var firstChild = node.firstChild; 10 | return ( 11 | node.nodeName === 'DIV' && 12 | highlightRegExp.test(node.className) && 13 | firstChild && 14 | firstChild.nodeName === 'PRE' 15 | ) 16 | }, 17 | replacement: function (content, node, options) { 18 | var className = node.className || ''; 19 | var language = (className.match(highlightRegExp) || [null, ''])[1]; 20 | 21 | return ( 22 | '\n\n' + options.fence + language + '\n' + 23 | node.firstChild.textContent + 24 | '\n' + options.fence + '\n\n' 25 | ) 26 | } 27 | }); 28 | } 29 | 30 | function strikethrough (turndownService) { 31 | turndownService.addRule('strikethrough', { 32 | filter: ['del', 's', 'strike'], 33 | replacement: function (content) { 34 | return '~' + content + '~' 35 | } 36 | }); 37 | } 38 | 39 | var indexOf = Array.prototype.indexOf; 40 | var every = Array.prototype.every; 41 | var rules = {}; 42 | 43 | rules.tableCell = { 44 | filter: ['th', 'td'], 45 | replacement: function (content, node) { 46 | return cell(content, node) 47 | } 48 | }; 49 | 50 | rules.tableRow = { 51 | filter: 'tr', 52 | replacement: function (content, node) { 53 | var borderCells = ''; 54 | var alignMap = { left: ':--', right: '--:', center: ':-:' }; 55 | 56 | if (isHeadingRow(node)) { 57 | for (var i = 0; i < node.childNodes.length; i++) { 58 | var border = '---'; 59 | var align = ( 60 | node.childNodes[i].getAttribute('align') || '' 61 | ).toLowerCase(); 62 | 63 | if (align) border = alignMap[align] || border; 64 | 65 | borderCells += cell(border, node.childNodes[i]); 66 | } 67 | } 68 | return '\n' + content + (borderCells ? '\n' + borderCells : '') 69 | } 70 | }; 71 | 72 | rules.table = { 73 | // Only convert tables with a heading row. 74 | // Tables with no heading row are kept using `keep` (see below). 75 | filter: function (node) { 76 | return node.nodeName === 'TABLE' && isHeadingRow(node.rows[0]) 77 | }, 78 | 79 | replacement: function (content) { 80 | // Ensure there are no blank lines 81 | content = content.replace('\n\n', '\n'); 82 | return '\n\n' + content + '\n\n' 83 | } 84 | }; 85 | 86 | rules.tableSection = { 87 | filter: ['thead', 'tbody', 'tfoot'], 88 | replacement: function (content) { 89 | return content 90 | } 91 | }; 92 | 93 | // A tr is a heading row if: 94 | // - the parent is a THEAD 95 | // - or if its the first child of the TABLE or the first TBODY (possibly 96 | // following a blank THEAD) 97 | // - and every cell is a TH 98 | function isHeadingRow (tr) { 99 | var parentNode = tr.parentNode; 100 | return ( 101 | parentNode.nodeName === 'THEAD' || 102 | ( 103 | parentNode.firstChild === tr && 104 | (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) && 105 | every.call(tr.childNodes, function (n) { return n.nodeName === 'TH' }) 106 | ) 107 | ) 108 | } 109 | 110 | function isFirstTbody (element) { 111 | var previousSibling = element.previousSibling; 112 | return ( 113 | element.nodeName === 'TBODY' && ( 114 | !previousSibling || 115 | ( 116 | previousSibling.nodeName === 'THEAD' && 117 | /^\s*$/i.test(previousSibling.textContent) 118 | ) 119 | ) 120 | ) 121 | } 122 | 123 | function cell (content, node) { 124 | var index = indexOf.call(node.parentNode.childNodes, node); 125 | var prefix = ' '; 126 | if (index === 0) prefix = '| '; 127 | return prefix + content + ' |' 128 | } 129 | 130 | function tables (turndownService) { 131 | turndownService.keep(function (node) { 132 | return node.nodeName === 'TABLE' && !isHeadingRow(node.rows[0]) 133 | }); 134 | for (var key in rules) turndownService.addRule(key, rules[key]); 135 | } 136 | 137 | function taskListItems (turndownService) { 138 | turndownService.addRule('taskListItems', { 139 | filter: function (node) { 140 | return node.type === 'checkbox' && node.parentNode.nodeName === 'LI' 141 | }, 142 | replacement: function (content, node) { 143 | return (node.checked ? '[x]' : '[ ]') + ' ' 144 | } 145 | }); 146 | } 147 | 148 | function gfm (turndownService) { 149 | turndownService.use([ 150 | highlightedCodeBlock, 151 | strikethrough, 152 | tables, 153 | taskListItems 154 | ]); 155 | } 156 | 157 | exports.gfm = gfm; 158 | exports.highlightedCodeBlock = highlightedCodeBlock; 159 | exports.strikethrough = strikethrough; 160 | exports.tables = tables; 161 | exports.taskListItems = taskListItems; 162 | 163 | return exports; 164 | 165 | }({})); -------------------------------------------------------------------------------- /lib/turndown.umd.js: -------------------------------------------------------------------------------- 1 | (function (global, factory) { 2 | typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : 3 | typeof define === 'function' && define.amd ? define(factory) : 4 | (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.TurndownService = factory()); 5 | }(this, (function () { 'use strict'; 6 | 7 | function extend (destination) { 8 | for (var i = 1; i < arguments.length; i++) { 9 | var source = arguments[i]; 10 | for (var key in source) { 11 | if (source.hasOwnProperty(key)) destination[key] = source[key]; 12 | } 13 | } 14 | return destination 15 | } 16 | 17 | function repeat (character, count) { 18 | return Array(count + 1).join(character) 19 | } 20 | 21 | function trimLeadingNewlines (string) { 22 | return string.replace(/^\n*/, '') 23 | } 24 | 25 | function trimTrailingNewlines (string) { 26 | // avoid match-at-end regexp bottleneck, see #370 27 | var indexEnd = string.length; 28 | while (indexEnd > 0 && string[indexEnd - 1] === '\n') indexEnd--; 29 | return string.substring(0, indexEnd) 30 | } 31 | 32 | var blockElements = [ 33 | 'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO', 'BLOCKQUOTE', 'BODY', 'CANVAS', 34 | 'CENTER', 'DD', 'DIR', 'DIV', 'DL', 'DT', 'FIELDSET', 'FIGCAPTION', 'FIGURE', 35 | 'FOOTER', 'FORM', 'FRAMESET', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER', 36 | 'HGROUP', 'HR', 'HTML', 'ISINDEX', 'LI', 'MAIN', 'MENU', 'NAV', 'NOFRAMES', 37 | 'NOSCRIPT', 'OL', 'OUTPUT', 'P', 'PRE', 'SECTION', 'TABLE', 'TBODY', 'TD', 38 | 'TFOOT', 'TH', 'THEAD', 'TR', 'UL' 39 | ]; 40 | 41 | function isBlock (node) { 42 | return is(node, blockElements) 43 | } 44 | 45 | var voidElements = [ 46 | 'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT', 47 | 'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR' 48 | ]; 49 | 50 | function isVoid (node) { 51 | return is(node, voidElements) 52 | } 53 | 54 | function hasVoid (node) { 55 | return has(node, voidElements) 56 | } 57 | 58 | var meaningfulWhenBlankElements = [ 59 | 'A', 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TH', 'TD', 'IFRAME', 'SCRIPT', 60 | 'AUDIO', 'VIDEO' 61 | ]; 62 | 63 | function isMeaningfulWhenBlank (node) { 64 | return is(node, meaningfulWhenBlankElements) 65 | } 66 | 67 | function hasMeaningfulWhenBlank (node) { 68 | return has(node, meaningfulWhenBlankElements) 69 | } 70 | 71 | function is (node, tagNames) { 72 | return tagNames.indexOf(node.nodeName) >= 0 73 | } 74 | 75 | function has (node, tagNames) { 76 | return ( 77 | node.getElementsByTagName && 78 | tagNames.some(function (tagName) { 79 | return node.getElementsByTagName(tagName).length 80 | }) 81 | ) 82 | } 83 | 84 | var rules = {}; 85 | 86 | rules.paragraph = { 87 | filter: 'p', 88 | 89 | replacement: function (content) { 90 | return '\n\n' + content + '\n\n' 91 | } 92 | }; 93 | 94 | rules.lineBreak = { 95 | filter: 'br', 96 | 97 | replacement: function (content, node, options) { 98 | return options.br + '\n' 99 | } 100 | }; 101 | 102 | rules.heading = { 103 | filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], 104 | 105 | replacement: function (content, node, options) { 106 | var hLevel = Number(node.nodeName.charAt(1)); 107 | 108 | if (options.headingStyle === 'setext' && hLevel < 3) { 109 | var underline = repeat((hLevel === 1 ? '=' : '-'), content.length); 110 | return ( 111 | '\n\n' + content + '\n' + underline + '\n\n' 112 | ) 113 | } else { 114 | return '\n\n' + repeat('#', hLevel) + ' ' + content + '\n\n' 115 | } 116 | } 117 | }; 118 | 119 | rules.blockquote = { 120 | filter: 'blockquote', 121 | 122 | replacement: function (content) { 123 | content = content.replace(/^\n+|\n+$/g, ''); 124 | content = content.replace(/^/gm, '> '); 125 | return '\n\n' + content + '\n\n' 126 | } 127 | }; 128 | 129 | rules.list = { 130 | filter: ['ul', 'ol'], 131 | 132 | replacement: function (content, node) { 133 | var parent = node.parentNode; 134 | if (parent.nodeName === 'LI' && parent.lastElementChild === node) { 135 | return '\n' + content 136 | } else { 137 | return '\n\n' + content + '\n\n' 138 | } 139 | } 140 | }; 141 | 142 | rules.listItem = { 143 | filter: 'li', 144 | 145 | replacement: function (content, node, options) { 146 | content = content 147 | .replace(/^\n+/, '') // remove leading newlines 148 | .replace(/\n+$/, '\n') // replace trailing newlines with just a single one 149 | .replace(/\n/gm, '\n '); // indent 150 | var prefix = options.bulletListMarker + ' '; 151 | var parent = node.parentNode; 152 | if (parent.nodeName === 'OL') { 153 | var start = parent.getAttribute('start'); 154 | var index = Array.prototype.indexOf.call(parent.children, node); 155 | prefix = (start ? Number(start) + index : index + 1) + '. '; 156 | } 157 | return ( 158 | prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '') 159 | ) 160 | } 161 | }; 162 | 163 | rules.indentedCodeBlock = { 164 | filter: function (node, options) { 165 | return ( 166 | options.codeBlockStyle === 'indented' && 167 | node.nodeName === 'PRE' && 168 | node.firstChild && 169 | node.firstChild.nodeName === 'CODE' 170 | ) 171 | }, 172 | 173 | replacement: function (content, node, options) { 174 | return ( 175 | '\n\n ' + 176 | node.firstChild.textContent.replace(/\n/g, '\n ') + 177 | '\n\n' 178 | ) 179 | } 180 | }; 181 | 182 | rules.fencedCodeBlock = { 183 | filter: function (node, options) { 184 | return ( 185 | options.codeBlockStyle === 'fenced' && 186 | node.nodeName === 'PRE' && 187 | node.firstChild && 188 | node.firstChild.nodeName === 'CODE' 189 | ) 190 | }, 191 | 192 | replacement: function (content, node, options) { 193 | var className = node.firstChild.getAttribute('class') || ''; 194 | var language = (className.match(/language-(\S+)/) || [null, ''])[1]; 195 | var code = node.firstChild.textContent; 196 | 197 | var fenceChar = options.fence.charAt(0); 198 | var fenceSize = 3; 199 | var fenceInCodeRegex = new RegExp('^' + fenceChar + '{3,}', 'gm'); 200 | 201 | var match; 202 | while ((match = fenceInCodeRegex.exec(code))) { 203 | if (match[0].length >= fenceSize) { 204 | fenceSize = match[0].length + 1; 205 | } 206 | } 207 | 208 | var fence = repeat(fenceChar, fenceSize); 209 | 210 | return ( 211 | '\n\n' + fence + language + '\n' + 212 | code.replace(/\n$/, '') + 213 | '\n' + fence + '\n\n' 214 | ) 215 | } 216 | }; 217 | 218 | rules.horizontalRule = { 219 | filter: 'hr', 220 | 221 | replacement: function (content, node, options) { 222 | return '\n\n' + options.hr + '\n\n' 223 | } 224 | }; 225 | 226 | rules.inlineLink = { 227 | filter: function (node, options) { 228 | return ( 229 | options.linkStyle === 'inlined' && 230 | node.nodeName === 'A' && 231 | node.getAttribute('href') 232 | ) 233 | }, 234 | 235 | replacement: function (content, node) { 236 | var href = node.getAttribute('href'); 237 | if (href) href = href.replace(/([()])/g, '\\$1'); 238 | var title = cleanAttribute(node.getAttribute('title')); 239 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 240 | return '[' + content + '](' + href + title + ')' 241 | } 242 | }; 243 | 244 | rules.referenceLink = { 245 | filter: function (node, options) { 246 | return ( 247 | options.linkStyle === 'referenced' && 248 | node.nodeName === 'A' && 249 | node.getAttribute('href') 250 | ) 251 | }, 252 | 253 | replacement: function (content, node, options) { 254 | var href = node.getAttribute('href'); 255 | var title = cleanAttribute(node.getAttribute('title')); 256 | if (title) title = ' "' + title + '"'; 257 | var replacement; 258 | var reference; 259 | 260 | switch (options.linkReferenceStyle) { 261 | case 'collapsed': 262 | replacement = '[' + content + '][]'; 263 | reference = '[' + content + ']: ' + href + title; 264 | break 265 | case 'shortcut': 266 | replacement = '[' + content + ']'; 267 | reference = '[' + content + ']: ' + href + title; 268 | break 269 | default: 270 | var id = this.references.length + 1; 271 | replacement = '[' + content + '][' + id + ']'; 272 | reference = '[' + id + ']: ' + href + title; 273 | } 274 | 275 | this.references.push(reference); 276 | return replacement 277 | }, 278 | 279 | references: [], 280 | 281 | append: function (options) { 282 | var references = ''; 283 | if (this.references.length) { 284 | references = '\n\n' + this.references.join('\n') + '\n\n'; 285 | this.references = []; // Reset references 286 | } 287 | return references 288 | } 289 | }; 290 | 291 | rules.emphasis = { 292 | filter: ['em', 'i'], 293 | 294 | replacement: function (content, node, options) { 295 | if (!content.trim()) return '' 296 | return options.emDelimiter + content + options.emDelimiter 297 | } 298 | }; 299 | 300 | rules.strong = { 301 | filter: ['strong', 'b'], 302 | 303 | replacement: function (content, node, options) { 304 | if (!content.trim()) return '' 305 | return options.strongDelimiter + content + options.strongDelimiter 306 | } 307 | }; 308 | 309 | rules.code = { 310 | filter: function (node) { 311 | var hasSiblings = node.previousSibling || node.nextSibling; 312 | var isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; 313 | 314 | return node.nodeName === 'CODE' && !isCodeBlock 315 | }, 316 | 317 | replacement: function (content) { 318 | if (!content) return '' 319 | content = content.replace(/\r?\n|\r/g, ' '); 320 | 321 | var extraSpace = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; 322 | var delimiter = '`'; 323 | var matches = content.match(/`+/gm) || []; 324 | while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; 325 | 326 | return delimiter + extraSpace + content + extraSpace + delimiter 327 | } 328 | }; 329 | 330 | rules.image = { 331 | filter: 'img', 332 | 333 | replacement: function (content, node) { 334 | var alt = cleanAttribute(node.getAttribute('alt')); 335 | var src = node.getAttribute('src') || ''; 336 | var title = cleanAttribute(node.getAttribute('title')); 337 | var titlePart = title ? ' "' + title + '"' : ''; 338 | return src ? '![' + alt + ']' + '(' + src + titlePart + ')' : '' 339 | } 340 | }; 341 | 342 | function cleanAttribute (attribute) { 343 | return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '' 344 | } 345 | 346 | /** 347 | * Manages a collection of rules used to convert HTML to Markdown 348 | */ 349 | 350 | function Rules (options) { 351 | this.options = options; 352 | this._keep = []; 353 | this._remove = []; 354 | 355 | this.blankRule = { 356 | replacement: options.blankReplacement 357 | }; 358 | 359 | this.keepReplacement = options.keepReplacement; 360 | 361 | this.defaultRule = { 362 | replacement: options.defaultReplacement 363 | }; 364 | 365 | this.array = []; 366 | for (var key in options.rules) this.array.push(options.rules[key]); 367 | } 368 | 369 | Rules.prototype = { 370 | add: function (key, rule) { 371 | this.array.unshift(rule); 372 | }, 373 | 374 | keep: function (filter) { 375 | this._keep.unshift({ 376 | filter: filter, 377 | replacement: this.keepReplacement 378 | }); 379 | }, 380 | 381 | remove: function (filter) { 382 | this._remove.unshift({ 383 | filter: filter, 384 | replacement: function () { 385 | return '' 386 | } 387 | }); 388 | }, 389 | 390 | forNode: function (node) { 391 | if (node.isBlank) return this.blankRule 392 | var rule; 393 | 394 | if ((rule = findRule(this.array, node, this.options))) return rule 395 | if ((rule = findRule(this._keep, node, this.options))) return rule 396 | if ((rule = findRule(this._remove, node, this.options))) return rule 397 | 398 | return this.defaultRule 399 | }, 400 | 401 | forEach: function (fn) { 402 | for (var i = 0; i < this.array.length; i++) fn(this.array[i], i); 403 | } 404 | }; 405 | 406 | function findRule (rules, node, options) { 407 | for (var i = 0; i < rules.length; i++) { 408 | var rule = rules[i]; 409 | if (filterValue(rule, node, options)) return rule 410 | } 411 | return void 0 412 | } 413 | 414 | function filterValue (rule, node, options) { 415 | var filter = rule.filter; 416 | if (typeof filter === 'string') { 417 | if (filter === node.nodeName.toLowerCase()) return true 418 | } else if (Array.isArray(filter)) { 419 | if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true 420 | } else if (typeof filter === 'function') { 421 | if (filter.call(rule, node, options)) return true 422 | } else { 423 | throw new TypeError('`filter` needs to be a string, array, or function') 424 | } 425 | } 426 | 427 | /** 428 | * The collapseWhitespace function is adapted from collapse-whitespace 429 | * by Luc Thevenard. 430 | * 431 | * The MIT License (MIT) 432 | * 433 | * Copyright (c) 2014 Luc Thevenard 434 | * 435 | * Permission is hereby granted, free of charge, to any person obtaining a copy 436 | * of this software and associated documentation files (the "Software"), to deal 437 | * in the Software without restriction, including without limitation the rights 438 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 439 | * copies of the Software, and to permit persons to whom the Software is 440 | * furnished to do so, subject to the following conditions: 441 | * 442 | * The above copyright notice and this permission notice shall be included in 443 | * all copies or substantial portions of the Software. 444 | * 445 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 446 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 447 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 448 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 449 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 450 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 451 | * THE SOFTWARE. 452 | */ 453 | 454 | /** 455 | * collapseWhitespace(options) removes extraneous whitespace from an the given element. 456 | * 457 | * @param {Object} options 458 | */ 459 | function collapseWhitespace (options) { 460 | var element = options.element; 461 | var isBlock = options.isBlock; 462 | var isVoid = options.isVoid; 463 | var isPre = options.isPre || function (node) { 464 | return node.nodeName === 'PRE' 465 | }; 466 | 467 | if (!element.firstChild || isPre(element)) return 468 | 469 | var prevText = null; 470 | var keepLeadingWs = false; 471 | 472 | var prev = null; 473 | var node = next(prev, element, isPre); 474 | 475 | while (node !== element) { 476 | if (node.nodeType === 3 || node.nodeType === 4) { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE 477 | var text = node.data.replace(/[ \r\n\t]+/g, ' '); 478 | 479 | if ((!prevText || / $/.test(prevText.data)) && 480 | !keepLeadingWs && text[0] === ' ') { 481 | text = text.substr(1); 482 | } 483 | 484 | // `text` might be empty at this point. 485 | if (!text) { 486 | node = remove(node); 487 | continue 488 | } 489 | 490 | node.data = text; 491 | 492 | prevText = node; 493 | } else if (node.nodeType === 1) { // Node.ELEMENT_NODE 494 | if (isBlock(node) || node.nodeName === 'BR') { 495 | if (prevText) { 496 | prevText.data = prevText.data.replace(/ $/, ''); 497 | } 498 | 499 | prevText = null; 500 | keepLeadingWs = false; 501 | } else if (isVoid(node) || isPre(node)) { 502 | // Avoid trimming space around non-block, non-BR void elements and inline PRE. 503 | prevText = null; 504 | keepLeadingWs = true; 505 | } else if (prevText) { 506 | // Drop protection if set previously. 507 | keepLeadingWs = false; 508 | } 509 | } else { 510 | node = remove(node); 511 | continue 512 | } 513 | 514 | var nextNode = next(prev, node, isPre); 515 | prev = node; 516 | node = nextNode; 517 | } 518 | 519 | if (prevText) { 520 | prevText.data = prevText.data.replace(/ $/, ''); 521 | if (!prevText.data) { 522 | remove(prevText); 523 | } 524 | } 525 | } 526 | 527 | /** 528 | * remove(node) removes the given node from the DOM and returns the 529 | * next node in the sequence. 530 | * 531 | * @param {Node} node 532 | * @return {Node} node 533 | */ 534 | function remove (node) { 535 | var next = node.nextSibling || node.parentNode; 536 | 537 | node.parentNode.removeChild(node); 538 | 539 | return next 540 | } 541 | 542 | /** 543 | * next(prev, current, isPre) returns the next node in the sequence, given the 544 | * current and previous nodes. 545 | * 546 | * @param {Node} prev 547 | * @param {Node} current 548 | * @param {Function} isPre 549 | * @return {Node} 550 | */ 551 | function next (prev, current, isPre) { 552 | if ((prev && prev.parentNode === current) || isPre(current)) { 553 | return current.nextSibling || current.parentNode 554 | } 555 | 556 | return current.firstChild || current.nextSibling || current.parentNode 557 | } 558 | 559 | /* 560 | * Set up window for Node.js 561 | */ 562 | 563 | var root = (typeof window !== 'undefined' ? window : {}); 564 | 565 | /* 566 | * Parsing HTML strings 567 | */ 568 | 569 | function canParseHTMLNatively () { 570 | var Parser = root.DOMParser; 571 | var canParse = false; 572 | 573 | // Adapted from https://gist.github.com/1129031 574 | // Firefox/Opera/IE throw errors on unsupported types 575 | try { 576 | // WebKit returns null on unsupported types 577 | if (new Parser().parseFromString('', 'text/html')) { 578 | canParse = true; 579 | } 580 | } catch (e) {} 581 | 582 | return canParse 583 | } 584 | 585 | function createHTMLParser () { 586 | var Parser = function () {}; 587 | 588 | { 589 | var domino = require('@mixmark-io/domino'); 590 | Parser.prototype.parseFromString = function (string) { 591 | return domino.createDocument(string) 592 | }; 593 | } 594 | return Parser 595 | } 596 | 597 | var HTMLParser = canParseHTMLNatively() ? root.DOMParser : createHTMLParser(); 598 | 599 | function RootNode (input, options) { 600 | var root; 601 | if (typeof input === 'string') { 602 | var doc = htmlParser().parseFromString( 603 | // DOM parsers arrange elements in the and . 604 | // Wrapping in a custom element ensures elements are reliably arranged in 605 | // a single element. 606 | '' + input + '', 607 | 'text/html' 608 | ); 609 | root = doc.getElementById('turndown-root'); 610 | } else { 611 | root = input.cloneNode(true); 612 | } 613 | collapseWhitespace({ 614 | element: root, 615 | isBlock: isBlock, 616 | isVoid: isVoid, 617 | isPre: options.preformattedCode ? isPreOrCode : null 618 | }); 619 | 620 | return root 621 | } 622 | 623 | var _htmlParser; 624 | function htmlParser () { 625 | _htmlParser = _htmlParser || new HTMLParser(); 626 | return _htmlParser 627 | } 628 | 629 | function isPreOrCode (node) { 630 | return node.nodeName === 'PRE' || node.nodeName === 'CODE' 631 | } 632 | 633 | function Node (node, options) { 634 | node.isBlock = isBlock(node); 635 | node.isCode = node.nodeName === 'CODE' || node.parentNode.isCode; 636 | node.isBlank = isBlank(node); 637 | node.flankingWhitespace = flankingWhitespace(node, options); 638 | return node 639 | } 640 | 641 | function isBlank (node) { 642 | return ( 643 | !isVoid(node) && 644 | !isMeaningfulWhenBlank(node) && 645 | /^\s*$/i.test(node.textContent) && 646 | !hasVoid(node) && 647 | !hasMeaningfulWhenBlank(node) 648 | ) 649 | } 650 | 651 | function flankingWhitespace (node, options) { 652 | if (node.isBlock || (options.preformattedCode && node.isCode)) { 653 | return { leading: '', trailing: '' } 654 | } 655 | 656 | var edges = edgeWhitespace(node.textContent); 657 | 658 | // abandon leading ASCII WS if left-flanked by ASCII WS 659 | if (edges.leadingAscii && isFlankedByWhitespace('left', node, options)) { 660 | edges.leading = edges.leadingNonAscii; 661 | } 662 | 663 | // abandon trailing ASCII WS if right-flanked by ASCII WS 664 | if (edges.trailingAscii && isFlankedByWhitespace('right', node, options)) { 665 | edges.trailing = edges.trailingNonAscii; 666 | } 667 | 668 | return { leading: edges.leading, trailing: edges.trailing } 669 | } 670 | 671 | function edgeWhitespace (string) { 672 | var m = string.match(/^(([ \t\r\n]*)(\s*))(?:(?=\S)[\s\S]*\S)?((\s*?)([ \t\r\n]*))$/); 673 | return { 674 | leading: m[1], // whole string for whitespace-only strings 675 | leadingAscii: m[2], 676 | leadingNonAscii: m[3], 677 | trailing: m[4], // empty for whitespace-only strings 678 | trailingNonAscii: m[5], 679 | trailingAscii: m[6] 680 | } 681 | } 682 | 683 | function isFlankedByWhitespace (side, node, options) { 684 | var sibling; 685 | var regExp; 686 | var isFlanked; 687 | 688 | if (side === 'left') { 689 | sibling = node.previousSibling; 690 | regExp = / $/; 691 | } else { 692 | sibling = node.nextSibling; 693 | regExp = /^ /; 694 | } 695 | 696 | if (sibling) { 697 | if (sibling.nodeType === 3) { 698 | isFlanked = regExp.test(sibling.nodeValue); 699 | } else if (options.preformattedCode && sibling.nodeName === 'CODE') { 700 | isFlanked = false; 701 | } else if (sibling.nodeType === 1 && !isBlock(sibling)) { 702 | isFlanked = regExp.test(sibling.textContent); 703 | } 704 | } 705 | return isFlanked 706 | } 707 | 708 | var reduce = Array.prototype.reduce; 709 | var escapes = [ 710 | [/\\/g, '\\\\'], 711 | [/\*/g, '\\*'], 712 | [/^-/g, '\\-'], 713 | [/^\+ /g, '\\+ '], 714 | [/^(=+)/g, '\\$1'], 715 | [/^(#{1,6}) /g, '\\$1 '], 716 | [/`/g, '\\`'], 717 | [/^~~~/g, '\\~~~'], 718 | [/\[/g, '\\['], 719 | [/\]/g, '\\]'], 720 | [/^>/g, '\\>'], 721 | [/_/g, '\\_'], 722 | [/^(\d+)\. /g, '$1\\. '] 723 | ]; 724 | 725 | function TurndownService (options) { 726 | if (!(this instanceof TurndownService)) return new TurndownService(options) 727 | 728 | var defaults = { 729 | rules: rules, 730 | headingStyle: 'setext', 731 | hr: '* * *', 732 | bulletListMarker: '*', 733 | codeBlockStyle: 'indented', 734 | fence: '```', 735 | emDelimiter: '_', 736 | strongDelimiter: '**', 737 | linkStyle: 'inlined', 738 | linkReferenceStyle: 'full', 739 | br: ' ', 740 | preformattedCode: false, 741 | blankReplacement: function (content, node) { 742 | return node.isBlock ? '\n\n' : '' 743 | }, 744 | keepReplacement: function (content, node) { 745 | return node.isBlock ? '\n\n' + node.outerHTML + '\n\n' : node.outerHTML 746 | }, 747 | defaultReplacement: function (content, node) { 748 | return node.isBlock ? '\n\n' + content + '\n\n' : content 749 | } 750 | }; 751 | this.options = extend({}, defaults, options); 752 | this.rules = new Rules(this.options); 753 | } 754 | 755 | TurndownService.prototype = { 756 | /** 757 | * The entry point for converting a string or DOM node to Markdown 758 | * @public 759 | * @param {String|HTMLElement} input The string or DOM node to convert 760 | * @returns A Markdown representation of the input 761 | * @type String 762 | */ 763 | 764 | turndown: function (input) { 765 | if (!canConvert(input)) { 766 | throw new TypeError( 767 | input + ' is not a string, or an element/document/fragment node.' 768 | ) 769 | } 770 | 771 | if (input === '') return '' 772 | 773 | var output = process.call(this, new RootNode(input, this.options)); 774 | return postProcess.call(this, output) 775 | }, 776 | 777 | /** 778 | * Add one or more plugins 779 | * @public 780 | * @param {Function|Array} plugin The plugin or array of plugins to add 781 | * @returns The Turndown instance for chaining 782 | * @type Object 783 | */ 784 | 785 | use: function (plugin) { 786 | if (Array.isArray(plugin)) { 787 | for (var i = 0; i < plugin.length; i++) this.use(plugin[i]); 788 | } else if (typeof plugin === 'function') { 789 | plugin(this); 790 | } else { 791 | throw new TypeError('plugin must be a Function or an Array of Functions') 792 | } 793 | return this 794 | }, 795 | 796 | /** 797 | * Adds a rule 798 | * @public 799 | * @param {String} key The unique key of the rule 800 | * @param {Object} rule The rule 801 | * @returns The Turndown instance for chaining 802 | * @type Object 803 | */ 804 | 805 | addRule: function (key, rule) { 806 | this.rules.add(key, rule); 807 | return this 808 | }, 809 | 810 | /** 811 | * Keep a node (as HTML) that matches the filter 812 | * @public 813 | * @param {String|Array|Function} filter The unique key of the rule 814 | * @returns The Turndown instance for chaining 815 | * @type Object 816 | */ 817 | 818 | keep: function (filter) { 819 | this.rules.keep(filter); 820 | return this 821 | }, 822 | 823 | /** 824 | * Remove a node that matches the filter 825 | * @public 826 | * @param {String|Array|Function} filter The unique key of the rule 827 | * @returns The Turndown instance for chaining 828 | * @type Object 829 | */ 830 | 831 | remove: function (filter) { 832 | this.rules.remove(filter); 833 | return this 834 | }, 835 | 836 | /** 837 | * Escapes Markdown syntax 838 | * @public 839 | * @param {String} string The string to escape 840 | * @returns A string with Markdown syntax escaped 841 | * @type String 842 | */ 843 | 844 | escape: function (string) { 845 | return escapes.reduce(function (accumulator, escape) { 846 | return accumulator.replace(escape[0], escape[1]) 847 | }, string) 848 | } 849 | }; 850 | 851 | /** 852 | * Reduces a DOM node down to its Markdown string equivalent 853 | * @private 854 | * @param {HTMLElement} parentNode The node to convert 855 | * @returns A Markdown representation of the node 856 | * @type String 857 | */ 858 | 859 | function process (parentNode) { 860 | var self = this; 861 | return reduce.call(parentNode.childNodes, function (output, node) { 862 | node = new Node(node, self.options); 863 | 864 | var replacement = ''; 865 | if (node.nodeType === 3) { 866 | replacement = node.isCode ? node.nodeValue : self.escape(node.nodeValue); 867 | } else if (node.nodeType === 1) { 868 | replacement = replacementForNode.call(self, node); 869 | } 870 | 871 | return join(output, replacement) 872 | }, '') 873 | } 874 | 875 | /** 876 | * Appends strings as each rule requires and trims the output 877 | * @private 878 | * @param {String} output The conversion output 879 | * @returns A trimmed version of the ouput 880 | * @type String 881 | */ 882 | 883 | function postProcess (output) { 884 | var self = this; 885 | this.rules.forEach(function (rule) { 886 | if (typeof rule.append === 'function') { 887 | output = join(output, rule.append(self.options)); 888 | } 889 | }); 890 | 891 | return output.replace(/^[\t\r\n]+/, '').replace(/[\t\r\n\s]+$/, '') 892 | } 893 | 894 | /** 895 | * Converts an element node to its Markdown equivalent 896 | * @private 897 | * @param {HTMLElement} node The node to convert 898 | * @returns A Markdown representation of the node 899 | * @type String 900 | */ 901 | 902 | function replacementForNode (node) { 903 | var rule = this.rules.forNode(node); 904 | var content = process.call(this, node); 905 | var whitespace = node.flankingWhitespace; 906 | if (whitespace.leading || whitespace.trailing) content = content.trim(); 907 | return ( 908 | whitespace.leading + 909 | rule.replacement(content, node, this.options) + 910 | whitespace.trailing 911 | ) 912 | } 913 | 914 | /** 915 | * Joins replacement to the current output with appropriate number of new lines 916 | * @private 917 | * @param {String} output The current conversion output 918 | * @param {String} replacement The string to append to the output 919 | * @returns Joined output 920 | * @type String 921 | */ 922 | 923 | function join (output, replacement) { 924 | var s1 = trimTrailingNewlines(output); 925 | var s2 = trimLeadingNewlines(replacement); 926 | var nls = Math.max(output.length - s1.length, replacement.length - s2.length); 927 | var separator = '\n\n'.substring(0, nls); 928 | 929 | return s1 + separator + s2 930 | } 931 | 932 | /** 933 | * Determines whether an input can be converted 934 | * @private 935 | * @param {String|HTMLElement} input Describe this parameter 936 | * @returns Describe what it returns 937 | * @type String|Object|Array|Boolean|Number 938 | */ 939 | 940 | function canConvert (input) { 941 | return ( 942 | input != null && ( 943 | typeof input === 'string' || 944 | (input.nodeType && ( 945 | input.nodeType === 1 || input.nodeType === 9 || input.nodeType === 11 946 | )) 947 | ) 948 | ) 949 | } 950 | 951 | return TurndownService; 952 | 953 | }))); -------------------------------------------------------------------------------- /manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ArXiv Markdown Parser", 3 | "description": "Fetches arXiv paper in HTML and converts it to Markdown with LaTeX inlined.", 4 | "version": "1.1", 5 | "manifest_version": 3, 6 | "icons": { 7 | "16": "icons/icon16.png", 8 | "48": "icons/icon48.png", 9 | "128": "icons/icon128.png" 10 | }, 11 | "permissions": [ 12 | "activeTab" 13 | ], 14 | "action": { 15 | "default_popup": "popup.html", 16 | "default_icon": { 17 | "16": "icons/icon16.png", 18 | "48": "icons/icon48.png", 19 | "128": "icons/icon128.png" 20 | }, 21 | "default_title": "ArXiv Markdown Parser" 22 | }, 23 | "background": { 24 | "service_worker": "background.js" 25 | }, 26 | "content_scripts": [ 27 | { 28 | "matches": [ 29 | "*://arxiv.org/abs/*", 30 | "*://arxiv.org/pdf/*", 31 | "*://arxiv.org/html/*" 32 | ], 33 | "js": [ 34 | "lib/turndown.umd.js", 35 | "lib/turndown-plugin-gfm.js", 36 | "contentScript.js" 37 | ], 38 | "run_at": "document_end" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ArXiv Parser 6 | 26 | 27 | 28 |

ArXiv Markdown Parser

29 | 30 | 31 | 32 | 36 | 40 | 41 |
42 |
43 | 44 |
45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /popup.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function () { 2 | const convertBtn = document.getElementById("convertBtn"); 3 | const markdownOutput = document.getElementById("markdownOutput"); 4 | const errorDiv = document.getElementById("error"); 5 | 6 | convertBtn.addEventListener("click", function () { 7 | errorDiv.textContent = ""; 8 | markdownOutput.value = "Loading..."; 9 | 10 | const removeTable = document.getElementById("removeTable").checked; 11 | const removeRefs = document.getElementById("removeRefs").checked; 12 | 13 | chrome.tabs.query({ active: true, currentWindow: true }, function (tabs) { 14 | if (!tabs || !tabs.length) { 15 | errorDiv.textContent = "No active tab found."; 16 | return; 17 | } 18 | const activeTab = tabs[0]; 19 | chrome.tabs.sendMessage( 20 | activeTab.id, 21 | { 22 | action: "getMarkdown", 23 | removeTable: removeTable, 24 | removeRefs: removeRefs 25 | }, 26 | function (response) { 27 | if (!response) { 28 | errorDiv.textContent = 29 | "No response (are you sure this is an arXiv page?)"; 30 | markdownOutput.value = ""; 31 | return; 32 | } 33 | if (response.success) { 34 | markdownOutput.value = response.markdown; 35 | } else { 36 | errorDiv.textContent = response.error || "Unknown error"; 37 | markdownOutput.value = ""; 38 | } 39 | } 40 | ); 41 | }); 42 | }); 43 | }); 44 | --------------------------------------------------------------------------------