├── .gitignore
├── LICENSE
├── README.md
├── background.js
├── contentScript.js
├── icons
    ├── icon128.png
    ├── icon16.png
    └── icon48.png
├── lib
    ├── turndown-plugin-gfm.js
    └── turndown.umd.js
├── manifest.json
├── popup.html
└── popup.js


/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | .edge-debug-profile/
3 | test.md


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Suyuchen Wang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ArXiv Markdown Parser - Chrome / Edge Extension
 2 | 
 3 | ![Chrome Web Store Version](https://img.shields.io/chrome-web-store/v/pgklmbjeooblkfcgbibhkjpbbhoabbbo) ![GitHub License](https://img.shields.io/github/license/sheryc/arxiv-markdown-parser-plugin) ![GitHub Repo stars](https://img.shields.io/github/stars/sheryc/arxiv-markdown-parser-plugin) < Please leave a 🌟 if you find this plugin useful :D
 4 | 
 5 | **🔥 Update: The plugin is now live on Chrome Web Store: [Chrome Web Store Link](https://chromewebstore.google.com/detail/arxiv-markdown-parser/pgklmbjeooblkfcgbibhkjpbbhoabbbo)**
 6 | 
 7 | **Turn ArXiv Papers into Markdown with One Click**
 8 | 
 9 | Are you tired of wrestling with PDFs when you need to analyze or excerpt research papers? In the age of LLMs, having clean, accessible text is more important than ever. The ArXiv Markdown Parser Chrome extension is built to streamline your research workflow by converting arXiv papers into clean, readable Markdown with a single click.
10 | 
11 | > Note: Currently the extension only supports papers with an HTML version. Most of the new papers have an HTML version, but some do not. [It's a beta feature of ArXiv](https://info.arxiv.org/about/accessible_HTML.html), and more papers will be supported in the future.
12 | 
13 | ## Table of Contents
14 | 
15 | - [ArXiv Markdown Parser - Chrome / Edge Extension](#arxiv-markdown-parser---chrome--edge-extension)
16 |   - [Table of Contents](#table-of-contents)
17 |   - [Overview](#overview)
18 |     - [Why Markdown Matters](#why-markdown-matters)
19 |   - [Features](#features)
20 |   - [Installation](#installation)
21 |     - [Prerequisites](#prerequisites)
22 |     - [Steps](#steps)
23 |   - [Usage](#usage)
24 |   - [License](#license)
25 |   - [Acknowledgments](#acknowledgments)
26 | 
27 | ## Overview
28 | 
29 | The ArXiv Markdown Parser is designed to simplify the process of extracting content from arXiv papers. Whether you're conducting literature reviews, performing detailed analyses, or integrating research into LLM workflows, this extension helps you bypass the cumbersome process of PDF extraction. Instead, it converts the entire paper—including equations, tables, figures, and internal references—into well-structured Markdown, making it easier to read, share, and annotate.
30 | 
31 | ### Why Markdown Matters
32 | 
33 | - **LLM Integration:** Markdown text is much easier to feed into LLMs like ChatGPT. By bypassing the messy PDF extraction process, you ensure that models receive clean, structured input—ideal for summarization, translation, or analysis.
34 | - **Fast Table & Equation Copying:** Research papers often contain complex tables and equations. Converting to Markdown allows you to quickly copy and paste these elements into your notes or LLM prompts without formatting issues.
35 | - **Seamless Collaboration:** Markdown is one of the most popular formats for academic and technical documentation. Its compatibility with version control systems (like Git) makes it perfect for group research settings, collaborative wikis, or shared repositories.
36 | 
37 | ## Features
38 | 
39 | - **One-Click Conversion:** Simply open any arXiv paper (abs, pdf, or html) and click the extension icon to instantly convert the content into Markdown.
40 | - **Customizable Output:** Choose whether to include a table of contents and references in your Markdown output.
41 | - **Enhanced Research Workflow:** Quickly extract and organize key components of research papers for rapid summarization, annotation, or further analysis.
42 | - **Improved Equation Handling:** Easily obtain LaTeX or Markdown versions of equations for use in your notes, presentations, or LLM prompts.
43 | 
44 | ## Installation
45 | 
46 | ### Through Chrome Web Store
47 | 
48 | The extension is live on [Chrome Web Store](https://chromewebstore.google.com/detail/arxiv-markdown-parser/pgklmbjeooblkfcgbibhkjpbbhoabbbo). I recommend installing from here if you're using Chrome or Edge.
49 | 
50 | ### Manual Installation
51 | 
52 | #### Prerequisites
53 | 
54 | - [Google Chrome](https://www.google.com/chrome/) or any Chromium-based browser.
55 | 
56 | #### Steps
57 | 
58 | 1. **Clone the Repository:**
59 | 
60 |    ```bash
61 |    git clone https://github.com/sheryc/arxiv-markdown-parser-plugin.git
62 |    ```
63 | 
64 |    or alternatively, download the zip or tarball file in the latest release and unzip it.
65 | 
66 | 2. **Load the Extension in Chrome:**
67 | 
68 |    - Open Chrome and navigate to `chrome://extensions/`.
69 |    - Enable **Developer Mode** (toggle in the top-right corner).
70 |    - Click **Load unpacked** and select the directory where you cloned the repository or unzipped the zipfile / tarball.
71 | 
72 | 3. **Installation Complete:**  
73 |    The extension should now appear in your Chrome toolbar. You are ready to convert arXiv papers into Markdown with a single click.
74 | 
75 | ## Usage
76 | 
77 | 1. **Open an arXiv Paper:**  
78 |    Navigate to any arXiv paper page (abstract, PDF, or HTML view).
79 | 
80 | 2. **Activate the Extension:**  
81 |    Click on the ArXiv Markdown Parser icon in your browser toolbar.
82 | 
83 | 3. **Configure Output Options:**  
84 |    Choose whether you want to include a table of contents and references in the generated Markdown.
85 | 
86 | 4. **Get Your Markdown:**  
87 |    The extension will instantly convert the paper into Markdown format, including all equations, tables, figures, and internal references as links.
88 | 
89 | ## License
90 | 
91 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
92 | 
93 | ## Acknowledgments
94 | 
95 | - **LaTeXML** and **Turndown** for powering the conversion process.
96 | 
97 | *Happy Researching!* (=・ω・=)
98 | 


--------------------------------------------------------------------------------
/background.js:
--------------------------------------------------------------------------------
/**
 * Writes a confirmation message to the console.
 * Registered below as the `chrome.runtime.onInstalled` listener.
 */
function logInstallation() {
  console.log("ArXiv Markdown Parser extension installed.");
}

chrome.runtime.onInstalled.addListener(logInstallation);
4 |   


--------------------------------------------------------------------------------
/contentScript.js:
--------------------------------------------------------------------------------
  1 | console.log("[DEBUG] contentScript.js loaded!");
  2 | 
  3 | async function parseArxiv(arxivId, removeRefs = false, removeTable = false) {
  4 |   const url = `https://arxiv.org/html/${arxivId}`;
  5 |   const response = await fetch(url);
  6 |   if (!response.ok) {
  7 |     throw new Error(`Failed to fetch arxiv HTML for ${arxivId}. Status: ${response.status}`);
  8 |   }
  9 |   const htmlText = await response.text();
 10 | 
 11 |   // parse into DOM
 12 |   const parser = new DOMParser();
 13 |   const doc = parser.parseFromString(htmlText, "text/html");
 14 | 
 15 |   // replace <math> with inline $...$
 16 |   convertAllMathMLtoLatex(doc);
 17 | 
 18 |   // fix tabular tables
 19 |   fixTabularTables(doc);
 20 | 
 21 |   // setup Turndown with GFM plugin
 22 |   const turndownService = new TurndownService({
 23 |     headingStyle: "atx",
 24 |     codeBlockStyle: "fenced",
 25 |     fence: "```",
 26 |     bulletListMarker: "-",
 27 |     emDelimiter: "*",
 28 |     strongDelimiter: "**"
 29 |   });
 30 |   turndownService.use(turndownPluginGfm.gfm);
 31 | 
 32 |   turndownService.addRule('mathContentTables', {
 33 |     filter: function (node) {
 34 |       return (
 35 |         node.nodeName === 'TABLE' && 
 36 |         node.innerHTML.includes('$') && 
 37 |         !/ltx_equationgroup|ltx_eqn_align|ltx_eqn_table/.test(node.className || '')
 38 |       )
 39 |     },
 40 |     replacement: function (content, node) {
 41 |       const rows = Array.from(node.rows);
 42 |       let markdown = '';
 43 |       
 44 |       rows.forEach((row, rowIndex) => {
 45 |         const cells = Array.from(row.cells);
 46 |         
 47 |         // Preserve cell content including math formatting
 48 |         markdown += '| ' + cells.map(cell => {
 49 |           return cell.textContent.trim();
 50 |         }).join(' | ') + ' |\n';
 51 |         
 52 |         // Add separator row after first row
 53 |         if (rowIndex === 0) {
 54 |           markdown += '| ' + cells.map(() => '---').join(' | ') + ' |\n';
 55 |         }
 56 |       });
 57 |       
 58 |       return '\n\n' + markdown + '\n\n';
 59 |     }
 60 |   });
 61 | 
 62 |   // turn LaTeX equation tables into $$ block equations $$
 63 |   turndownService.addRule("latexEquationTables", {
 64 |     filter: function (node) {
 65 |       if (node.nodeName === "TABLE") {
 66 |         const cls = node.getAttribute("class") || "";
 67 |         // If it has ltx_tabular, we do NOT treat it as an equation
 68 |         if (/\bltx_tabular\b/.test(cls)) return false;
 69 |         // If it has ltx_equationgroup, ltx_eqn_align, or ltx_eqn_table => treat as block equation
 70 |         return /ltx_equationgroup|ltx_eqn_align|ltx_eqn_table/.test(cls);
 71 |       }
 72 |       return false;
 73 |     },
 74 |     replacement: function (content, node) {
 75 |       // We'll use the node's textContent as the equation text
 76 |       let eqnText = node.textContent.trim();
 77 |       eqnText = eqnText.replace(/\s+/g, " ");
 78 |       eqnText = eqnText.replace(/^\$/, "");
 79 |       eqnText = eqnText.replace(/\(\d+\)$/, "");
 80 |       eqnText = eqnText.replace(/\$$/, "");
 81 |       return `$$ ${eqnText} $$`;
 82 |     },
 83 |   });
 84 | 
 85 |   // convert the DOM to md
 86 |   let markdown = turndownService.turndown(doc.documentElement.innerHTML);
 87 | 
 88 |   // reformat paragraphs vs. table lines
 89 |   markdown = removeLineBreaksOutsideTables(markdown);
 90 | 
 91 |   // unescape double backslashes for correct LaTeX
 92 |   markdown = unescapeDoubleBackslashes(markdown);
 93 | 
 94 |   // Fix any remaining escaped underscores in LaTeX
 95 |   markdown = fixLatexUnderscores(markdown);
 96 | 
 97 |   if (removeRefs) {
 98 |     markdown = removeReferences(markdown);
 99 |   } else {
100 |     markdown = preserveReferencesLineBreaks(markdown);
101 |   }
102 | 
103 |   if (removeTable) {
104 |     markdown = removeContentTable(markdown);
105 |   } else {
106 |     markdown = reformatTableOfContents(markdown);
107 |   }
108 |   
109 |   return markdown;
110 | }
111 | 
/**
 * Strips every attribute from an element, in place.
 * Does nothing when the element has no `attributes` collection.
 *
 * @param {Element} elem - Element to clean.
 */
function removeAllAttributes(elem) {
  // Re-read `elem.attributes` on every pass and always drop the first entry
  // until the collection is empty.
  for (let attrs = elem.attributes; attrs && attrs.length > 0; attrs = elem.attributes) {
    elem.removeAttribute(attrs[0].name);
  }
}
118 | 
/**
 * Puts each table-of-contents link on its own line, indented by section depth.
 *
 * The second paragraph (split on blank lines) is treated as the table of
 * contents only when it contains "http" and either "[1" or "[2". Links whose
 * label starts with a dotted section number ("2.1 Title") are indented two
 * spaces per nesting level; other links are kept without indentation.
 *
 * @param {string} markdown - Markdown document.
 * @returns {string} The document with the contents paragraph reformatted, or
 *   the input unchanged when no such paragraph is detected.
 */
function reformatTableOfContents(markdown) {
  const paragraphs = markdown.split("\n\n");
  if (paragraphs.length <= 1) {
    return markdown;
  }

  const toc = paragraphs[1];
  const looksLikeToc =
    toc.includes("http") && (toc.includes("[1") || toc.includes("[2"));
  if (!looksLikeToc) {
    return markdown;
  }

  const links = toc.match(/(\[[^\]]+\]\([^)]+\))/g) || [];
  if (links.length === 0) {
    return markdown;
  }

  paragraphs[1] = links
    .map((link) => {
      const numbered = link.match(/\[(\d+(?:\.\d+)*)\s+([^\]]+)\]/);
      if (!numbered) {
        return link;
      }
      // "2.1" has two dot-separated parts, i.e. nesting level 2.
      const depth = numbered[1].split('.').length;
      return '  '.repeat(depth - 1) + link;
    })
    .join('\n');

  return paragraphs.join("\n\n");
}
152 | 
/**
 * Restores list formatting inside the references section.
 *
 * The first marker from `refMarkers` that occurs in the document identifies
 * the section. Its body runs from the end of the marker text up to the next
 * level-1 heading (a line starting with a single `#`), or to the end of the
 * document when there is none. Inside that body every
 * "whitespace, hyphen, whitespace" sequence is rewritten to start a new list
 * item on its own paragraph (`\n\n-   `). Text before and after the section is
 * returned untouched.
 *
 * @param {string} markdown - Markdown document.
 * @returns {string} The document with the references body reformatted, or the
 *   input unchanged when no marker is found.
 */
function preserveReferencesLineBreaks(markdown) {
  const refMarkers = ["References ----------", "## References", "### References", "#### References", "###### References"];

  for (const marker of refMarkers) {
    const markerIndex = markdown.indexOf(marker);
    if (markerIndex === -1) {
      continue;
    }

    // The body begins right after the marker text. This is the only offset for
    // which `before + marker + content + after` reassembles the document.
    const startIndex = markerIndex + marker.length;

    // Only a level-1 heading ends the section: `#(?!#)` rejects `##` and deeper.
    const nextHeadingMatch = markdown.slice(startIndex).match(/\n\s*#(?!#)/);
    const endIndex = nextHeadingMatch
      ? startIndex + nextHeadingMatch.index
      : markdown.length;

    const before = markdown.substring(0, markerIndex);
    const content = markdown.substring(startIndex, endIndex);
    const after = markdown.substring(endIndex);

    // `(?!^)` keeps the very first character of the body from starting a match.
    const formattedRefs = content.replace(/(?!^)\s*-\s+/g, '\n\n-   ');
    return before + marker + formattedRefs + after;
  }

  return markdown; // No references found
}
184 | 
185 | 
/**
 * Removes all attributes from every `table.ltx_tabular` under `root`, and from
 * each of its tbody/thead/tfoot/tr/td/th descendants.
 *
 * @param {Document|Element} root - Node whose descendants are searched.
 */
function fixTabularTables(root) {
  const TABLE_PART_SELECTOR = "tbody, thead, tfoot, tr, td, th";

  for (const table of root.querySelectorAll("table.ltx_tabular")) {
    removeAllAttributes(table);
    for (const part of table.querySelectorAll(TABLE_PART_SELECTOR)) {
      removeAllAttributes(part);
    }
  }
}
197 | 
/**
 * Un-escapes `\_` and `\^` inside `$...$` spans.
 *
 * A span is the shortest run of non-`$` characters between two `$` signs;
 * text outside such spans is left as is.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Text with `\_` -> `_` and `\^` -> `^` inside each span.
 */
function fixLatexUnderscores(markdown) {
  const unescapeScripts = (latex) =>
    latex.replace(/\\_/g, '_').replace(/\\\^/g, '^');

  return markdown.replace(
    /\$([^$]*?)\$/g,
    (_whole, latex) => '$' + unescapeScripts(latex) + '$'
  );
}
205 | 
/**
 * Replaces every `<math>` element under `root` with inline LaTeX text.
 *
 * The LaTeX is read from the element's
 * `annotation[encoding="application/x-tex"]` child and cleaned up:
 *   - `%` characters not preceded by a backslash are dropped,
 *   - `\_` and `\^` lose their backslash,
 *   - a backslash directly before `[` or `]` is dropped.
 * The result is wrapped in `$...$`. When there is no annotation, or nothing
 * but whitespace is left after clean-up, the element is replaced by its own
 * text content instead, so that a bare `$$` is never emitted.
 *
 * @param {Document|Element} root - Node whose `<math>` descendants are replaced.
 */
function convertAllMathMLtoLatex(root) {
  root.querySelectorAll("math").forEach((math) => {
    const annotation = math.querySelector('annotation[encoding="application/x-tex"]');
    const rawLatex = annotation && annotation.textContent ? annotation.textContent.trim() : "";

    const latexSource = rawLatex
      .replace(/(?<!\\)%/g, "")
      .replace(/\\([_^])/g, "$1")
      .replace(/\\(?=[\[\]])/g, "");

    if (latexSource.trim() === "") {
      console.log("No annotation found");
      math.replaceWith(math.textContent);
      return;
    }

    // Replace the <math> element with inline LaTeX delimited by $ signs.
    math.replaceWith(`$${latexSource}$`);
  });
}
225 | 
226 | 
/**
 * Removes the references section from a Markdown document.
 *
 * The document is split on blank lines. The first paragraph containing one of
 * the reference markers is removed together with the paragraph that follows
 * it (if any). The marker list mirrors the one used by
 * `preserveReferencesLineBreaks`, so both code paths recognise the same
 * headings.
 *
 * @param {string} markdown - Markdown document.
 * @returns {string} The document without the references heading and the
 *   paragraph after it, or the input unchanged when no marker is found.
 */
function removeReferences(markdown) {
  const refMarkers = [
    "References ----------",
    "## References",
    "### References",
    "#### References",
    "###### References",
  ];
  const paragraphs = markdown.split("\n\n");

  const refIndex = paragraphs.findIndex((para) =>
    refMarkers.some((marker) => para.includes(marker))
  );

  if (refIndex === -1) {
    return markdown;
  }

  // Drop the heading paragraph and, when present, the paragraph after it.
  // `splice` clamps the count at the end of the array.
  paragraphs.splice(refIndex, 2);

  return paragraphs.join("\n\n");
}
247 | 
/**
 * Joins the lines of every non-table block into a single line.
 *
 * Blocks are separated by blank (whitespace-only) lines. A block whose first
 * non-blank line starts with `|` is treated as a table and kept verbatim; in
 * any other block each line is trimmed and the lines are joined with one
 * space. Blocks are re-joined with exactly one blank line between them.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} The reflowed text.
 */
function removeLineBreaksOutsideTables(markdown) {
  const isTableBlock = (lines) => {
    const firstContentLine = lines.find((line) => line.trim().length > 0);
    return Boolean(firstContentLine) && firstContentLine.trim().startsWith("|");
  };

  return markdown
    .split(/\n\s*\n/)
    .map((block) => {
      const lines = block.split("\n");
      return isTableBlock(lines)
        ? block
        : lines.map((line) => line.trim()).join(" ");
    })
    .join("\n\n");
}
263 | 
/**
 * Collapses every run of two or more consecutive backslashes into a single
 * backslash.
 *
 * Replacing `\\` pairs with `\` until nothing changes reduces any run of
 * backslashes to length one, so one pass over whole runs gives the same text.
 *
 * @param {string} text - Input text.
 * @returns {string} Text in which no two backslashes are adjacent.
 */
function unescapeDoubleBackslashes(text) {
  return text.replace(/\\{2,}/g, "\\");
}
272 | 
/**
 * Drops the first two blank-line-separated paragraphs (title + content table).
 * Documents with two paragraphs or fewer are returned unchanged.
 *
 * @param {string} markdown - Markdown document.
 * @returns {string} The document without its first two paragraphs.
 */
function removeContentTable(markdown) {
  const paragraphs = markdown.split("\n\n");
  return paragraphs.length > 2 ? paragraphs.slice(2).join("\n\n") : markdown;
}
281 | 
/**
 * Drops the last two lines of the text.
 * Text with two lines or fewer is returned unchanged.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} The text without its final two lines.
 */
function removeLastTwoLines(markdown) {
  const lines = markdown.split("\n");
  return lines.length > 2 ? lines.slice(0, -2).join("\n") : markdown;
}
289 | 
(function () {
  /**
   * Pulls the paper identifier out of an arxiv.org abs/pdf/html URL.
   * A trailing ".pdf" and then a trailing "v<digits>" suffix are stripped.
   *
   * @param {string} url - Page URL.
   * @returns {string|null} The identifier, or null when the URL does not match.
   */
  function extractArxivId(url) {
    const match = url.match(/arxiv\.org\/(abs|pdf|html)\/([^?#]+)/);
    if (!match) {
      return null;
    }
    return match[2].replace(/\.pdf$/, "").replace(/v\d+$/, "");
  }

  // Computed once, when the script runs.
  const arxivId = extractArxivId(window.location.href);

  /**
   * Answers "getMarkdown" requests with the converted paper.
   * Any other action is ignored (the handler returns undefined).
   */
  function handleMessage(request, sender, sendResponse) {
    if (request.action !== "getMarkdown") {
      return;
    }

    if (!arxivId) {
      sendResponse({
        success: false,
        markdown: "",
        error: "No arXiv ID detected on this page.",
      });
      return true;
    }

    parseArxiv(arxivId, request.removeRefs)
      .then((md) => {
        const trimmed = removeLastTwoLines(md);
        const finalMD = request.removeTable ? removeContentTable(trimmed) : trimmed;
        sendResponse({ success: true, markdown: finalMD });
      })
      .catch((err) => {
        sendResponse({ success: false, markdown: "", error: err.toString() });
      });

    return true; // async
  }

  chrome.runtime.onMessage.addListener(handleMessage);
})();
330 | 


--------------------------------------------------------------------------------
/icons/icon128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon128.png


--------------------------------------------------------------------------------
/icons/icon16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon16.png


--------------------------------------------------------------------------------
/icons/icon48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sheryc/arxiv-markdown-parser-plugin/a6d7a41ba59d75cfc673f664e1fb4b022fd3e7cc/icons/icon48.png


--------------------------------------------------------------------------------
/lib/turndown-plugin-gfm.js:
--------------------------------------------------------------------------------
  1 | var turndownPluginGfm = (function (exports) {
  2 |     'use strict';
  3 |     
  4 |     var highlightRegExp = /highlight-(?:text|source)-([a-z0-9]+)/;
  5 |     
  6 |     function highlightedCodeBlock (turndownService) {
  7 |       turndownService.addRule('highlightedCodeBlock', {
  8 |         filter: function (node) {
  9 |           var firstChild = node.firstChild;
 10 |           return (
 11 |             node.nodeName === 'DIV' &&
 12 |             highlightRegExp.test(node.className) &&
 13 |             firstChild &&
 14 |             firstChild.nodeName === 'PRE'
 15 |           )
 16 |         },
 17 |         replacement: function (content, node, options) {
 18 |           var className = node.className || '';
 19 |           var language = (className.match(highlightRegExp) || [null, ''])[1];
 20 |     
 21 |           return (
 22 |             '\n\n' + options.fence + language + '\n' +
 23 |             node.firstChild.textContent +
 24 |             '\n' + options.fence + '\n\n'
 25 |           )
 26 |         }
 27 |       });
 28 |     }
 29 |     
 30 |     function strikethrough (turndownService) {
 31 |       turndownService.addRule('strikethrough', {
 32 |         filter: ['del', 's', 'strike'],
 33 |         replacement: function (content) {
 34 |           return '~' + content + '~'
 35 |         }
 36 |       });
 37 |     }
 38 |     
 39 |     var indexOf = Array.prototype.indexOf;
 40 |     var every = Array.prototype.every;
 41 |     var rules = {};
 42 |     
 43 |     rules.tableCell = {
 44 |       filter: ['th', 'td'],
 45 |       replacement: function (content, node) {
 46 |         return cell(content, node)
 47 |       }
 48 |     };
 49 |     
 50 |     rules.tableRow = {
 51 |       filter: 'tr',
 52 |       replacement: function (content, node) {
 53 |         var borderCells = '';
 54 |         var alignMap = { left: ':--', right: '--:', center: ':-:' };
 55 |     
 56 |         if (isHeadingRow(node)) {
 57 |           for (var i = 0; i < node.childNodes.length; i++) {
 58 |             var border = '---';
 59 |             var align = (
 60 |               node.childNodes[i].getAttribute('align') || ''
 61 |             ).toLowerCase();
 62 |     
 63 |             if (align) border = alignMap[align] || border;
 64 |     
 65 |             borderCells += cell(border, node.childNodes[i]);
 66 |           }
 67 |         }
 68 |         return '\n' + content + (borderCells ? '\n' + borderCells : '')
 69 |       }
 70 |     };
 71 |     
 72 |     rules.table = {
 73 |       // Only convert tables with a heading row.
 74 |       // Tables with no heading row are kept using `keep` (see below).
 75 |       filter: function (node) {
 76 |         return node.nodeName === 'TABLE' && isHeadingRow(node.rows[0])
 77 |       },
 78 |     
 79 |       replacement: function (content) {
 80 |         // Ensure there are no blank lines
 81 |         content = content.replace('\n\n', '\n');
 82 |         return '\n\n' + content + '\n\n'
 83 |       }
 84 |     };
 85 |     
 86 |     rules.tableSection = {
 87 |       filter: ['thead', 'tbody', 'tfoot'],
 88 |       replacement: function (content) {
 89 |         return content
 90 |       }
 91 |     };
 92 |     
 93 |     // A tr is a heading row if:
 94 |     // - the parent is a THEAD
 95 |     // - or if its the first child of the TABLE or the first TBODY (possibly
 96 |     //   following a blank THEAD)
 97 |     // - and every cell is a TH
 98 |     function isHeadingRow (tr) {
 99 |       var parentNode = tr.parentNode;
100 |       return (
101 |         parentNode.nodeName === 'THEAD' ||
102 |         (
103 |           parentNode.firstChild === tr &&
104 |           (parentNode.nodeName === 'TABLE' || isFirstTbody(parentNode)) &&
105 |           every.call(tr.childNodes, function (n) { return n.nodeName === 'TH' })
106 |         )
107 |       )
108 |     }
109 |     
110 |     function isFirstTbody (element) {
111 |       var previousSibling = element.previousSibling;
112 |       return (
113 |         element.nodeName === 'TBODY' && (
114 |           !previousSibling ||
115 |           (
116 |             previousSibling.nodeName === 'THEAD' &&
117 |             /^\s*$/i.test(previousSibling.textContent)
118 |           )
119 |         )
120 |       )
121 |     }
122 |     
123 |     function cell (content, node) {
124 |       var index = indexOf.call(node.parentNode.childNodes, node);
125 |       var prefix = ' ';
126 |       if (index === 0) prefix = '| ';
127 |       return prefix + content + ' |'
128 |     }
129 |     
130 |     function tables (turndownService) {
131 |       turndownService.keep(function (node) {
132 |         return node.nodeName === 'TABLE' && !isHeadingRow(node.rows[0])
133 |       });
134 |       for (var key in rules) turndownService.addRule(key, rules[key]);
135 |     }
136 |     
137 |     function taskListItems (turndownService) {
138 |       turndownService.addRule('taskListItems', {
139 |         filter: function (node) {
140 |           return node.type === 'checkbox' && node.parentNode.nodeName === 'LI'
141 |         },
142 |         replacement: function (content, node) {
143 |           return (node.checked ? '[x]' : '[ ]') + ' '
144 |         }
145 |       });
146 |     }
147 |     
148 |     function gfm (turndownService) {
149 |       turndownService.use([
150 |         highlightedCodeBlock,
151 |         strikethrough,
152 |         tables,
153 |         taskListItems
154 |       ]);
155 |     }
156 |     
157 |     exports.gfm = gfm;
158 |     exports.highlightedCodeBlock = highlightedCodeBlock;
159 |     exports.strikethrough = strikethrough;
160 |     exports.tables = tables;
161 |     exports.taskListItems = taskListItems;
162 |     
163 |     return exports;
164 |     
165 |     }({}));


--------------------------------------------------------------------------------
/lib/turndown.umd.js:
--------------------------------------------------------------------------------
  1 | (function (global, factory) {
  2 |     typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
  3 |     typeof define === 'function' && define.amd ? define(factory) :
  4 |     (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.TurndownService = factory());
  5 |   }(this, (function () { 'use strict';
  6 |   
  7 |     function extend (destination) {
  8 |       for (var i = 1; i < arguments.length; i++) {
  9 |         var source = arguments[i];
 10 |         for (var key in source) {
 11 |           if (source.hasOwnProperty(key)) destination[key] = source[key];
 12 |         }
 13 |       }
 14 |       return destination
 15 |     }
 16 |   
 17 |     function repeat (character, count) {
 18 |       return Array(count + 1).join(character)
 19 |     }
 20 |   
 21 |     function trimLeadingNewlines (string) {
 22 |       return string.replace(/^\n*/, '')
 23 |     }
 24 |   
 25 |     function trimTrailingNewlines (string) {
 26 |       // avoid match-at-end regexp bottleneck, see #370
 27 |       var indexEnd = string.length;
 28 |       while (indexEnd > 0 && string[indexEnd - 1] === '\n') indexEnd--;
 29 |       return string.substring(0, indexEnd)
 30 |     }
 31 |   
 32 |     var blockElements = [
 33 |       'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO', 'BLOCKQUOTE', 'BODY', 'CANVAS',
 34 |       'CENTER', 'DD', 'DIR', 'DIV', 'DL', 'DT', 'FIELDSET', 'FIGCAPTION', 'FIGURE',
 35 |       'FOOTER', 'FORM', 'FRAMESET', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER',
 36 |       'HGROUP', 'HR', 'HTML', 'ISINDEX', 'LI', 'MAIN', 'MENU', 'NAV', 'NOFRAMES',
 37 |       'NOSCRIPT', 'OL', 'OUTPUT', 'P', 'PRE', 'SECTION', 'TABLE', 'TBODY', 'TD',
 38 |       'TFOOT', 'TH', 'THEAD', 'TR', 'UL'
 39 |     ];
 40 |   
 41 |     function isBlock (node) {
 42 |       return is(node, blockElements)
 43 |     }
 44 |   
 45 |     var voidElements = [
 46 |       'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT',
 47 |       'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR'
 48 |     ];
 49 |   
 50 |     function isVoid (node) {
 51 |       return is(node, voidElements)
 52 |     }
 53 |   
 54 |     function hasVoid (node) {
 55 |       return has(node, voidElements)
 56 |     }
 57 |   
 58 |     var meaningfulWhenBlankElements = [
 59 |       'A', 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TH', 'TD', 'IFRAME', 'SCRIPT',
 60 |       'AUDIO', 'VIDEO'
 61 |     ];
 62 |   
 63 |     function isMeaningfulWhenBlank (node) {
 64 |       return is(node, meaningfulWhenBlankElements)
 65 |     }
 66 |   
 67 |     function hasMeaningfulWhenBlank (node) {
 68 |       return has(node, meaningfulWhenBlankElements)
 69 |     }
 70 |   
 71 |     function is (node, tagNames) {
 72 |       return tagNames.indexOf(node.nodeName) >= 0
 73 |     }
 74 |   
 75 |     function has (node, tagNames) {
 76 |       return (
 77 |         node.getElementsByTagName &&
 78 |         tagNames.some(function (tagName) {
 79 |           return node.getElementsByTagName(tagName).length
 80 |         })
 81 |       )
 82 |     }
 83 |   
 84 |     var rules = {};
 85 |   
 86 |     rules.paragraph = {
 87 |       filter: 'p',
 88 |   
 89 |       replacement: function (content) {
 90 |         return '\n\n' + content + '\n\n'
 91 |       }
 92 |     };
 93 |   
 94 |     rules.lineBreak = {
 95 |       filter: 'br',
 96 |   
 97 |       replacement: function (content, node, options) {
 98 |         return options.br + '\n'
 99 |       }
100 |     };
101 |   
102 |     rules.heading = {
103 |       filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
104 |   
105 |       replacement: function (content, node, options) {
106 |         var hLevel = Number(node.nodeName.charAt(1));
107 |   
108 |         if (options.headingStyle === 'setext' && hLevel < 3) {
109 |           var underline = repeat((hLevel === 1 ? '=' : '-'), content.length);
110 |           return (
111 |             '\n\n' + content + '\n' + underline + '\n\n'
112 |           )
113 |         } else {
114 |           return '\n\n' + repeat('#', hLevel) + ' ' + content + '\n\n'
115 |         }
116 |       }
117 |     };
118 |   
119 |     rules.blockquote = {
120 |       filter: 'blockquote',
121 |   
122 |       replacement: function (content) {
123 |         content = content.replace(/^\n+|\n+$/g, '');
124 |         content = content.replace(/^/gm, '> ');
125 |         return '\n\n' + content + '\n\n'
126 |       }
127 |     };
128 |   
129 |     rules.list = {
130 |       filter: ['ul', 'ol'],
131 |   
132 |       replacement: function (content, node) {
133 |         var parent = node.parentNode;
134 |         if (parent.nodeName === 'LI' && parent.lastElementChild === node) {
135 |           return '\n' + content
136 |         } else {
137 |           return '\n\n' + content + '\n\n'
138 |         }
139 |       }
140 |     };
141 |   
142 |     rules.listItem = {
143 |       filter: 'li',
144 |   
145 |       replacement: function (content, node, options) {
146 |         content = content
147 |           .replace(/^\n+/, '') // remove leading newlines
148 |           .replace(/\n+$/, '\n') // replace trailing newlines with just a single one
149 |           .replace(/\n/gm, '\n    '); // indent
150 |         var prefix = options.bulletListMarker + '   ';
151 |         var parent = node.parentNode;
152 |         if (parent.nodeName === 'OL') {
153 |           var start = parent.getAttribute('start');
154 |           var index = Array.prototype.indexOf.call(parent.children, node);
155 |           prefix = (start ? Number(start) + index : index + 1) + '.  ';
156 |         }
157 |         return (
158 |           prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '')
159 |         )
160 |       }
161 |     };
162 |   
163 |     rules.indentedCodeBlock = {
164 |       filter: function (node, options) {
165 |         return (
166 |           options.codeBlockStyle === 'indented' &&
167 |           node.nodeName === 'PRE' &&
168 |           node.firstChild &&
169 |           node.firstChild.nodeName === 'CODE'
170 |         )
171 |       },
172 |   
173 |       replacement: function (content, node, options) {
174 |         return (
175 |           '\n\n    ' +
176 |           node.firstChild.textContent.replace(/\n/g, '\n    ') +
177 |           '\n\n'
178 |         )
179 |       }
180 |     };
181 |   
182 |     rules.fencedCodeBlock = {
183 |       filter: function (node, options) {
184 |         return (
185 |           options.codeBlockStyle === 'fenced' &&
186 |           node.nodeName === 'PRE' &&
187 |           node.firstChild &&
188 |           node.firstChild.nodeName === 'CODE'
189 |         )
190 |       },
191 |   
192 |       replacement: function (content, node, options) {
193 |         var className = node.firstChild.getAttribute('class') || '';
194 |         var language = (className.match(/language-(\S+)/) || [null, ''])[1];
195 |         var code = node.firstChild.textContent;
196 |   
197 |         var fenceChar = options.fence.charAt(0);
198 |         var fenceSize = 3;
199 |         var fenceInCodeRegex = new RegExp('^' + fenceChar + '{3,}', 'gm');
200 |   
201 |         var match;
202 |         while ((match = fenceInCodeRegex.exec(code))) {
203 |           if (match[0].length >= fenceSize) {
204 |             fenceSize = match[0].length + 1;
205 |           }
206 |         }
207 |   
208 |         var fence = repeat(fenceChar, fenceSize);
209 |   
210 |         return (
211 |           '\n\n' + fence + language + '\n' +
212 |           code.replace(/\n$/, '') +
213 |           '\n' + fence + '\n\n'
214 |         )
215 |       }
216 |     };
217 |   
218 |     rules.horizontalRule = {
219 |       filter: 'hr',
220 |   
221 |       replacement: function (content, node, options) {
222 |         return '\n\n' + options.hr + '\n\n'
223 |       }
224 |     };
225 |   
226 |     rules.inlineLink = {
227 |       filter: function (node, options) {
228 |         return (
229 |           options.linkStyle === 'inlined' &&
230 |           node.nodeName === 'A' &&
231 |           node.getAttribute('href')
232 |         )
233 |       },
234 |   
235 |       replacement: function (content, node) {
236 |         var href = node.getAttribute('href');
237 |         if (href) href = href.replace(/([()])/g, '\\$1');
238 |         var title = cleanAttribute(node.getAttribute('title'));
239 |         if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
240 |         return '[' + content + '](' + href + title + ')'
241 |       }
242 |     };
243 |   
244 |     rules.referenceLink = {
245 |       filter: function (node, options) {
246 |         return (
247 |           options.linkStyle === 'referenced' &&
248 |           node.nodeName === 'A' &&
249 |           node.getAttribute('href')
250 |         )
251 |       },
252 |   
253 |       replacement: function (content, node, options) {
254 |         var href = node.getAttribute('href');
255 |         var title = cleanAttribute(node.getAttribute('title'));
256 |         if (title) title = ' "' + title + '"';
257 |         var replacement;
258 |         var reference;
259 |   
260 |         switch (options.linkReferenceStyle) {
261 |           case 'collapsed':
262 |             replacement = '[' + content + '][]';
263 |             reference = '[' + content + ']: ' + href + title;
264 |             break
265 |           case 'shortcut':
266 |             replacement = '[' + content + ']';
267 |             reference = '[' + content + ']: ' + href + title;
268 |             break
269 |           default:
270 |             var id = this.references.length + 1;
271 |             replacement = '[' + content + '][' + id + ']';
272 |             reference = '[' + id + ']: ' + href + title;
273 |         }
274 |   
275 |         this.references.push(reference);
276 |         return replacement
277 |       },
278 |   
279 |       references: [],
280 |   
281 |       append: function (options) {
282 |         var references = '';
283 |         if (this.references.length) {
284 |           references = '\n\n' + this.references.join('\n') + '\n\n';
285 |           this.references = []; // Reset references
286 |         }
287 |         return references
288 |       }
289 |     };
290 |   
291 |     rules.emphasis = {
292 |       filter: ['em', 'i'],
293 |   
294 |       replacement: function (content, node, options) {
295 |         if (!content.trim()) return ''
296 |         return options.emDelimiter + content + options.emDelimiter
297 |       }
298 |     };
299 |   
300 |     rules.strong = {
301 |       filter: ['strong', 'b'],
302 |   
303 |       replacement: function (content, node, options) {
304 |         if (!content.trim()) return ''
305 |         return options.strongDelimiter + content + options.strongDelimiter
306 |       }
307 |     };
308 |   
309 |     rules.code = {
310 |       filter: function (node) {
311 |         var hasSiblings = node.previousSibling || node.nextSibling;
312 |         var isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;
313 |   
314 |         return node.nodeName === 'CODE' && !isCodeBlock
315 |       },
316 |   
317 |       replacement: function (content) {
318 |         if (!content) return ''
319 |         content = content.replace(/\r?\n|\r/g, ' ');
320 |   
321 |         var extraSpace = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';
322 |         var delimiter = '`';
323 |         var matches = content.match(/`+/gm) || [];
324 |         while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';
325 |   
326 |         return delimiter + extraSpace + content + extraSpace + delimiter
327 |       }
328 |     };
329 |   
330 |     rules.image = {
331 |       filter: 'img',
332 |   
333 |       replacement: function (content, node) {
334 |         var alt = cleanAttribute(node.getAttribute('alt'));
335 |         var src = node.getAttribute('src') || '';
336 |         var title = cleanAttribute(node.getAttribute('title'));
337 |         var titlePart = title ? ' "' + title + '"' : '';
338 |         return src ? '![' + alt + ']' + '(' + src + titlePart + ')' : ''
339 |       }
340 |     };
341 |   
342 |     function cleanAttribute (attribute) {
343 |       return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''
344 |     }
345 |   
346 |     /**
347 |      * Manages a collection of rules used to convert HTML to Markdown
348 |      */
349 |   
350 |     function Rules (options) {
351 |       this.options = options;
352 |       this._keep = [];
353 |       this._remove = [];
354 |   
355 |       this.blankRule = {
356 |         replacement: options.blankReplacement
357 |       };
358 |   
359 |       this.keepReplacement = options.keepReplacement;
360 |   
361 |       this.defaultRule = {
362 |         replacement: options.defaultReplacement
363 |       };
364 |   
365 |       this.array = [];
366 |       for (var key in options.rules) this.array.push(options.rules[key]);
367 |     }
368 |   
369 |     Rules.prototype = {
370 |       add: function (key, rule) {
371 |         this.array.unshift(rule);
372 |       },
373 |   
374 |       keep: function (filter) {
375 |         this._keep.unshift({
376 |           filter: filter,
377 |           replacement: this.keepReplacement
378 |         });
379 |       },
380 |   
381 |       remove: function (filter) {
382 |         this._remove.unshift({
383 |           filter: filter,
384 |           replacement: function () {
385 |             return ''
386 |           }
387 |         });
388 |       },
389 |   
390 |       forNode: function (node) {
391 |         if (node.isBlank) return this.blankRule
392 |         var rule;
393 |   
394 |         if ((rule = findRule(this.array, node, this.options))) return rule
395 |         if ((rule = findRule(this._keep, node, this.options))) return rule
396 |         if ((rule = findRule(this._remove, node, this.options))) return rule
397 |   
398 |         return this.defaultRule
399 |       },
400 |   
401 |       forEach: function (fn) {
402 |         for (var i = 0; i < this.array.length; i++) fn(this.array[i], i);
403 |       }
404 |     };
405 |   
406 |     function findRule (rules, node, options) {
407 |       for (var i = 0; i < rules.length; i++) {
408 |         var rule = rules[i];
409 |         if (filterValue(rule, node, options)) return rule
410 |       }
411 |       return void 0
412 |     }
413 |   
414 |     function filterValue (rule, node, options) {
415 |       var filter = rule.filter;
416 |       if (typeof filter === 'string') {
417 |         if (filter === node.nodeName.toLowerCase()) return true
418 |       } else if (Array.isArray(filter)) {
419 |         if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true
420 |       } else if (typeof filter === 'function') {
421 |         if (filter.call(rule, node, options)) return true
422 |       } else {
423 |         throw new TypeError('`filter` needs to be a string, array, or function')
424 |       }
425 |     }
426 |   
427 |     /**
428 |      * The collapseWhitespace function is adapted from collapse-whitespace
429 |      * by Luc Thevenard.
430 |      *
431 |      * The MIT License (MIT)
432 |      *
433 |      * Copyright (c) 2014 Luc Thevenard <lucthevenard@gmail.com>
434 |      *
435 |      * Permission is hereby granted, free of charge, to any person obtaining a copy
436 |      * of this software and associated documentation files (the "Software"), to deal
437 |      * in the Software without restriction, including without limitation the rights
438 |      * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
439 |      * copies of the Software, and to permit persons to whom the Software is
440 |      * furnished to do so, subject to the following conditions:
441 |      *
442 |      * The above copyright notice and this permission notice shall be included in
443 |      * all copies or substantial portions of the Software.
444 |      *
445 |      * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
446 |      * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
447 |      * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
448 |      * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
449 |      * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
450 |      * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
451 |      * THE SOFTWARE.
452 |      */
453 |   
    /**
     * collapseWhitespace(options) removes extraneous whitespace from the given element.
     *
     * Walks the subtree of `options.element` in document order (via `next`),
     * rewriting text nodes in place and removing nodes that end up empty.
     *
     * @param {Object} options
     * @param {Node} options.element root of the subtree to process
     * @param {Function} options.isBlock predicate: is the node a block element
     * @param {Function} options.isVoid predicate: is the node a void element
     * @param {Function} [options.isPre] predicate for preformatted nodes;
     *   defaults to a check for nodeName 'PRE'
     */
    function collapseWhitespace (options) {
      var element = options.element;
      var isBlock = options.isBlock;
      var isVoid = options.isVoid;
      var isPre = options.isPre || function (node) {
        return node.nodeName === 'PRE'
      };

      // Nothing to do for an empty element or a preformatted root.
      if (!element.firstChild || isPre(element)) return

      // Last text node kept so far in the current inline run (null after a
      // block, BR, void or preformatted element).
      var prevText = null;
      // When true, a leading space on the next text node is preserved.
      var keepLeadingWs = false;

      var prev = null;
      var node = next(prev, element, isPre);

      // The walk is finished once traversal climbs back up to the root.
      while (node !== element) {
        if (node.nodeType === 3 || node.nodeType === 4) { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE
          // Collapse every run of spaces, CRs, LFs and tabs into one space.
          var text = node.data.replace(/[ \r\n\t]+/g, ' ');

          // Drop the leading space when there is no preceding text in this run
          // or the preceding text already ends with a space.
          if ((!prevText || / $/.test(prevText.data)) &&
              !keepLeadingWs && text[0] === ' ') {
            text = text.substr(1);
          }

          // `text` might be empty at this point.
          if (!text) {
            node = remove(node);
            continue
          }

          node.data = text;

          prevText = node;
        } else if (node.nodeType === 1) { // Node.ELEMENT_NODE
          if (isBlock(node) || node.nodeName === 'BR') {
            // A block or line break ends the inline run: trim the trailing
            // space of the text that precedes it.
            if (prevText) {
              prevText.data = prevText.data.replace(/ $/, '');
            }

            prevText = null;
            keepLeadingWs = false;
          } else if (isVoid(node) || isPre(node)) {
            // Avoid trimming space around non-block, non-BR void elements and inline PRE.
            prevText = null;
            keepLeadingWs = true;
          } else if (prevText) {
            // Drop protection if set previously.
            keepLeadingWs = false;
          }
        } else {
          // Any node that is not text, CDATA or an element is removed.
          node = remove(node);
          continue
        }

        var nextNode = next(prev, node, isPre);
        prev = node;
        node = nextNode;
      }

      // Trim the trailing space of the final text node and remove the node if
      // nothing is left.
      if (prevText) {
        prevText.data = prevText.data.replace(/ $/, '');
        if (!prevText.data) {
          remove(prevText);
        }
      }
    }
526 |   
527 |     /**
528 |      * remove(node) removes the given node from the DOM and returns the
529 |      * next node in the sequence.
530 |      *
531 |      * @param {Node} node
532 |      * @return {Node} node
533 |      */
534 |     function remove (node) {
535 |       var next = node.nextSibling || node.parentNode;
536 |   
537 |       node.parentNode.removeChild(node);
538 |   
539 |       return next
540 |     }
541 |   
542 |     /**
543 |      * next(prev, current, isPre) returns the next node in the sequence, given the
544 |      * current and previous nodes.
545 |      *
546 |      * @param {Node} prev
547 |      * @param {Node} current
548 |      * @param {Function} isPre
549 |      * @return {Node}
550 |      */
551 |     function next (prev, current, isPre) {
552 |       if ((prev && prev.parentNode === current) || isPre(current)) {
553 |         return current.nextSibling || current.parentNode
554 |       }
555 |   
556 |       return current.firstChild || current.nextSibling || current.parentNode
557 |     }
558 |   
559 |     /*
560 |      * Set up window for Node.js
561 |      */
562 |   
563 |     var root = (typeof window !== 'undefined' ? window : {});
564 |   
565 |     /*
566 |      * Parsing HTML strings
567 |      */
568 |   
569 |     function canParseHTMLNatively () {
570 |       var Parser = root.DOMParser;
571 |       var canParse = false;
572 |   
573 |       // Adapted from https://gist.github.com/1129031
574 |       // Firefox/Opera/IE throw errors on unsupported types
575 |       try {
576 |         // WebKit returns null on unsupported types
577 |         if (new Parser().parseFromString('', 'text/html')) {
578 |           canParse = true;
579 |         }
580 |       } catch (e) {}
581 |   
582 |       return canParse
583 |     }
584 |   
585 |     function createHTMLParser () {
586 |       var Parser = function () {};
587 |   
588 |       {
589 |         var domino = require('@mixmark-io/domino');
590 |         Parser.prototype.parseFromString = function (string) {
591 |           return domino.createDocument(string)
592 |         };
593 |       }
594 |       return Parser
595 |     }
596 |   
597 |     var HTMLParser = canParseHTMLNatively() ? root.DOMParser : createHTMLParser();
598 |   
599 |     function RootNode (input, options) {
600 |       var root;
601 |       if (typeof input === 'string') {
602 |         var doc = htmlParser().parseFromString(
603 |           // DOM parsers arrange elements in the <head> and <body>.
604 |           // Wrapping in a custom element ensures elements are reliably arranged in
605 |           // a single element.
606 |           '<x-turndown id="turndown-root">' + input + '</x-turndown>',
607 |           'text/html'
608 |         );
609 |         root = doc.getElementById('turndown-root');
610 |       } else {
611 |         root = input.cloneNode(true);
612 |       }
613 |       collapseWhitespace({
614 |         element: root,
615 |         isBlock: isBlock,
616 |         isVoid: isVoid,
617 |         isPre: options.preformattedCode ? isPreOrCode : null
618 |       });
619 |   
620 |       return root
621 |     }
622 |   
623 |     var _htmlParser;
624 |     function htmlParser () {
625 |       _htmlParser = _htmlParser || new HTMLParser();
626 |       return _htmlParser
627 |     }
628 |   
629 |     function isPreOrCode (node) {
630 |       return node.nodeName === 'PRE' || node.nodeName === 'CODE'
631 |     }
632 |   
633 |     function Node (node, options) {
634 |       node.isBlock = isBlock(node);
635 |       node.isCode = node.nodeName === 'CODE' || node.parentNode.isCode;
636 |       node.isBlank = isBlank(node);
637 |       node.flankingWhitespace = flankingWhitespace(node, options);
638 |       return node
639 |     }
640 |   
641 |     function isBlank (node) {
642 |       return (
643 |         !isVoid(node) &&
644 |         !isMeaningfulWhenBlank(node) &&
645 |         /^\s*$/i.test(node.textContent) &&
646 |         !hasVoid(node) &&
647 |         !hasMeaningfulWhenBlank(node)
648 |       )
649 |     }
650 |   
651 |     function flankingWhitespace (node, options) {
652 |       if (node.isBlock || (options.preformattedCode && node.isCode)) {
653 |         return { leading: '', trailing: '' }
654 |       }
655 |   
656 |       var edges = edgeWhitespace(node.textContent);
657 |   
658 |       // abandon leading ASCII WS if left-flanked by ASCII WS
659 |       if (edges.leadingAscii && isFlankedByWhitespace('left', node, options)) {
660 |         edges.leading = edges.leadingNonAscii;
661 |       }
662 |   
663 |       // abandon trailing ASCII WS if right-flanked by ASCII WS
664 |       if (edges.trailingAscii && isFlankedByWhitespace('right', node, options)) {
665 |         edges.trailing = edges.trailingNonAscii;
666 |       }
667 |   
668 |       return { leading: edges.leading, trailing: edges.trailing }
669 |     }
670 |   
671 |     function edgeWhitespace (string) {
672 |       var m = string.match(/^(([ \t\r\n]*)(\s*))(?:(?=\S)[\s\S]*\S)?((\s*?)([ \t\r\n]*))$/);
673 |       return {
674 |         leading: m[1], // whole string for whitespace-only strings
675 |         leadingAscii: m[2],
676 |         leadingNonAscii: m[3],
677 |         trailing: m[4], // empty for whitespace-only strings
678 |         trailingNonAscii: m[5],
679 |         trailingAscii: m[6]
680 |       }
681 |     }
682 |   
683 |     function isFlankedByWhitespace (side, node, options) {
684 |       var sibling;
685 |       var regExp;
686 |       var isFlanked;
687 |   
688 |       if (side === 'left') {
689 |         sibling = node.previousSibling;
690 |         regExp = / $/;
691 |       } else {
692 |         sibling = node.nextSibling;
693 |         regExp = /^ /;
694 |       }
695 |   
696 |       if (sibling) {
697 |         if (sibling.nodeType === 3) {
698 |           isFlanked = regExp.test(sibling.nodeValue);
699 |         } else if (options.preformattedCode && sibling.nodeName === 'CODE') {
700 |           isFlanked = false;
701 |         } else if (sibling.nodeType === 1 && !isBlock(sibling)) {
702 |           isFlanked = regExp.test(sibling.textContent);
703 |         }
704 |       }
705 |       return isFlanked
706 |     }
707 |   
    // Borrowed so array-like collections (e.g. `childNodes`) can be reduced.
    var reduce = Array.prototype.reduce;
    // [pattern, replacement] pairs applied in order by
    // TurndownService.prototype.escape. Patterns anchored with `^` carry no
    // `m` flag, so they only match at the very start of the string passed in.
    var escapes = [
      [/\\/g, '\\\\'], // backslash (first, so later escapes are not doubled)
      [/\*/g, '\\*'], // asterisk
      [/^-/g, '\\-'], // leading hyphen
      [/^\+ /g, '\\+ '], // leading "+ "
      [/^(=+)/g, '\\$1'], // leading run of "="
      [/^(#{1,6}) /g, '\\$1 '], // leading 1-6 "#" followed by a space
      [/`/g, '\\`'], // backtick
      [/^~~~/g, '\\~~~'], // leading "~~~"
      [/\[/g, '\\['], // opening square bracket
      [/\]/g, '\\]'], // closing square bracket
      [/^>/g, '\\>'], // leading ">"
      [/_/g, '\\_'], // underscore
      [/^(\d+)\. /g, '$1\\. '] // leading "<digits>. "
    ];
724 |   
725 |     function TurndownService (options) {
726 |       if (!(this instanceof TurndownService)) return new TurndownService(options)
727 |   
728 |       var defaults = {
729 |         rules: rules,
730 |         headingStyle: 'setext',
731 |         hr: '* * *',
732 |         bulletListMarker: '*',
733 |         codeBlockStyle: 'indented',
734 |         fence: '```',
735 |         emDelimiter: '_',
736 |         strongDelimiter: '**',
737 |         linkStyle: 'inlined',
738 |         linkReferenceStyle: 'full',
739 |         br: '  ',
740 |         preformattedCode: false,
741 |         blankReplacement: function (content, node) {
742 |           return node.isBlock ? '\n\n' : ''
743 |         },
744 |         keepReplacement: function (content, node) {
745 |           return node.isBlock ? '\n\n' + node.outerHTML + '\n\n' : node.outerHTML
746 |         },
747 |         defaultReplacement: function (content, node) {
748 |           return node.isBlock ? '\n\n' + content + '\n\n' : content
749 |         }
750 |       };
751 |       this.options = extend({}, defaults, options);
752 |       this.rules = new Rules(this.options);
753 |     }
754 |   
755 |     TurndownService.prototype = {
756 |       /**
757 |        * The entry point for converting a string or DOM node to Markdown
758 |        * @public
759 |        * @param {String|HTMLElement} input The string or DOM node to convert
760 |        * @returns A Markdown representation of the input
761 |        * @type String
762 |        */
763 |   
764 |       turndown: function (input) {
765 |         if (!canConvert(input)) {
766 |           throw new TypeError(
767 |             input + ' is not a string, or an element/document/fragment node.'
768 |           )
769 |         }
770 |   
771 |         if (input === '') return ''
772 |   
773 |         var output = process.call(this, new RootNode(input, this.options));
774 |         return postProcess.call(this, output)
775 |       },
776 |   
777 |       /**
778 |        * Add one or more plugins
779 |        * @public
780 |        * @param {Function|Array} plugin The plugin or array of plugins to add
781 |        * @returns The Turndown instance for chaining
782 |        * @type Object
783 |        */
784 |   
785 |       use: function (plugin) {
786 |         if (Array.isArray(plugin)) {
787 |           for (var i = 0; i < plugin.length; i++) this.use(plugin[i]);
788 |         } else if (typeof plugin === 'function') {
789 |           plugin(this);
790 |         } else {
791 |           throw new TypeError('plugin must be a Function or an Array of Functions')
792 |         }
793 |         return this
794 |       },
795 |   
796 |       /**
797 |        * Adds a rule
798 |        * @public
799 |        * @param {String} key The unique key of the rule
800 |        * @param {Object} rule The rule
801 |        * @returns The Turndown instance for chaining
802 |        * @type Object
803 |        */
804 |   
805 |       addRule: function (key, rule) {
806 |         this.rules.add(key, rule);
807 |         return this
808 |       },
809 |   
810 |       /**
811 |        * Keep a node (as HTML) that matches the filter
812 |        * @public
813 |        * @param {String|Array|Function} filter The unique key of the rule
814 |        * @returns The Turndown instance for chaining
815 |        * @type Object
816 |        */
817 |   
818 |       keep: function (filter) {
819 |         this.rules.keep(filter);
820 |         return this
821 |       },
822 |   
823 |       /**
824 |        * Remove a node that matches the filter
825 |        * @public
826 |        * @param {String|Array|Function} filter The unique key of the rule
827 |        * @returns The Turndown instance for chaining
828 |        * @type Object
829 |        */
830 |   
831 |       remove: function (filter) {
832 |         this.rules.remove(filter);
833 |         return this
834 |       },
835 |   
836 |       /**
837 |        * Escapes Markdown syntax
838 |        * @public
839 |        * @param {String} string The string to escape
840 |        * @returns A string with Markdown syntax escaped
841 |        * @type String
842 |        */
843 |   
844 |       escape: function (string) {
845 |         return escapes.reduce(function (accumulator, escape) {
846 |           return accumulator.replace(escape[0], escape[1])
847 |         }, string)
848 |       }
849 |     };
850 |   
851 |     /**
852 |      * Reduces a DOM node down to its Markdown string equivalent
853 |      * @private
854 |      * @param {HTMLElement} parentNode The node to convert
855 |      * @returns A Markdown representation of the node
856 |      * @type String
857 |      */
858 |   
859 |     function process (parentNode) {
860 |       var self = this;
861 |       return reduce.call(parentNode.childNodes, function (output, node) {
862 |         node = new Node(node, self.options);
863 |   
864 |         var replacement = '';
865 |         if (node.nodeType === 3) {
866 |           replacement = node.isCode ? node.nodeValue : self.escape(node.nodeValue);
867 |         } else if (node.nodeType === 1) {
868 |           replacement = replacementForNode.call(self, node);
869 |         }
870 |   
871 |         return join(output, replacement)
872 |       }, '')
873 |     }
874 |   
875 |     /**
876 |      * Appends strings as each rule requires and trims the output
877 |      * @private
878 |      * @param {String} output The conversion output
879 |      * @returns A trimmed version of the ouput
880 |      * @type String
881 |      */
882 |   
883 |     function postProcess (output) {
884 |       var self = this;
885 |       this.rules.forEach(function (rule) {
886 |         if (typeof rule.append === 'function') {
887 |           output = join(output, rule.append(self.options));
888 |         }
889 |       });
890 |   
891 |       return output.replace(/^[\t\r\n]+/, '').replace(/[\t\r\n\s]+$/, '')
892 |     }
893 |   
894 |     /**
895 |      * Converts an element node to its Markdown equivalent
896 |      * @private
897 |      * @param {HTMLElement} node The node to convert
898 |      * @returns A Markdown representation of the node
899 |      * @type String
900 |      */
901 |   
902 |     function replacementForNode (node) {
903 |       var rule = this.rules.forNode(node);
904 |       var content = process.call(this, node);
905 |       var whitespace = node.flankingWhitespace;
906 |       if (whitespace.leading || whitespace.trailing) content = content.trim();
907 |       return (
908 |         whitespace.leading +
909 |         rule.replacement(content, node, this.options) +
910 |         whitespace.trailing
911 |       )
912 |     }
913 |   
914 |     /**
915 |      * Joins replacement to the current output with appropriate number of new lines
916 |      * @private
917 |      * @param {String} output The current conversion output
918 |      * @param {String} replacement The string to append to the output
919 |      * @returns Joined output
920 |      * @type String
921 |      */
922 |   
923 |     function join (output, replacement) {
924 |       var s1 = trimTrailingNewlines(output);
925 |       var s2 = trimLeadingNewlines(replacement);
926 |       var nls = Math.max(output.length - s1.length, replacement.length - s2.length);
927 |       var separator = '\n\n'.substring(0, nls);
928 |   
929 |       return s1 + separator + s2
930 |     }
931 |   
932 |     /**
933 |      * Determines whether an input can be converted
934 |      * @private
935 |      * @param {String|HTMLElement} input Describe this parameter
936 |      * @returns Describe what it returns
937 |      * @type String|Object|Array|Boolean|Number
938 |      */
939 |   
940 |     function canConvert (input) {
941 |       return (
942 |         input != null && (
943 |           typeof input === 'string' ||
944 |           (input.nodeType && (
945 |             input.nodeType === 1 || input.nodeType === 9 || input.nodeType === 11
946 |           ))
947 |         )
948 |       )
949 |     }
950 |   
951 |     return TurndownService;
952 |   
953 |   })));


--------------------------------------------------------------------------------
/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "ArXiv Markdown Parser",
 3 |     "description": "Fetches arXiv paper in HTML and converts it to Markdown with LaTeX inlined.",
 4 |     "version": "1.1",
 5 |     "manifest_version": 3,
 6 |     "icons": {
 7 |       "16": "icons/icon16.png",
 8 |       "48": "icons/icon48.png",
 9 |       "128": "icons/icon128.png"
10 |     },
11 |     "permissions": [
12 |       "activeTab"
13 |     ],
14 |     "action": {
15 |       "default_popup": "popup.html",
16 |       "default_icon": {
17 |         "16": "icons/icon16.png",
18 |         "48": "icons/icon48.png",
19 |         "128": "icons/icon128.png"
20 |       },
21 |       "default_title": "ArXiv Markdown Parser"
22 |     },
23 |     "background": {
24 |       "service_worker": "background.js"
25 |     },
26 |     "content_scripts": [
27 |       {
28 |         "matches": [
29 |           "*://arxiv.org/abs/*",
30 |           "*://arxiv.org/pdf/*",
31 |           "*://arxiv.org/html/*"
32 |         ],
33 |         "js": [
34 |           "lib/turndown.umd.js", 
35 |           "lib/turndown-plugin-gfm.js",
36 |           "contentScript.js"
37 |         ],
38 |         "run_at": "document_end"
39 |       }
40 |     ]
41 |   }
42 |   


--------------------------------------------------------------------------------
/popup.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="utf-8" />
 5 |     <title>ArXiv Parser</title>
 6 |     <style>
 7 |       body {
 8 |         min-width: 300px;
 9 |         min-height: 150px;
10 |         font-family: sans-serif;
11 |         margin: 10px;
12 |       }
13 |       textarea {
14 |         width: 100%;
15 |         height: 200px;
16 |       }
17 |       button {
18 |         margin: 4px 0;
19 |         padding: 6px 10px;
20 |       }
21 |       label {
22 |         display: block;
23 |         margin-top: 8px;
24 |       }
25 |     </style>
26 |   </head>
27 |   <body>
28 |     <h3>ArXiv Markdown Parser</h3>
29 |     <button id="convertBtn">Get Markdown</button>
30 | 
31 |     <!-- New checkboxes -->
32 |     <label>
33 |       <input type="checkbox" id="removeTable" />
34 |       Remove content table
35 |     </label>
36 |     <label>
37 |       <input type="checkbox" id="removeRefs" />
38 |       Remove references
39 |     </label>
40 | 
41 |     <div id="error" style="color: red; margin-top: 10px;"></div>
42 |     <div id="markdownContainer" style="margin-top: 10px;">
43 |       <textarea id="markdownOutput"></textarea>
44 |     </div>
45 |     <script src="popup.js"></script>
46 |   </body>
47 | </html>
48 | 


--------------------------------------------------------------------------------
/popup.js:
--------------------------------------------------------------------------------
 1 | document.addEventListener("DOMContentLoaded", function () {
 2 |   const convertBtn = document.getElementById("convertBtn");
 3 |   const markdownOutput = document.getElementById("markdownOutput");
 4 |   const errorDiv = document.getElementById("error");
 5 | 
 6 |   convertBtn.addEventListener("click", function () {
 7 |     errorDiv.textContent = "";
 8 |     markdownOutput.value = "Loading...";
 9 | 
10 |     const removeTable = document.getElementById("removeTable").checked;
11 |     const removeRefs = document.getElementById("removeRefs").checked;
12 | 
13 |     chrome.tabs.query({ active: true, currentWindow: true }, function (tabs) {
14 |       if (!tabs || !tabs.length) {
15 |         errorDiv.textContent = "No active tab found.";
16 |         return;
17 |       }
18 |       const activeTab = tabs[0];
19 |       chrome.tabs.sendMessage(
20 |         activeTab.id,
21 |         {
22 |           action: "getMarkdown",
23 |           removeTable: removeTable,
24 |           removeRefs: removeRefs
25 |         },
26 |         function (response) {
27 |           if (!response) {
28 |             errorDiv.textContent =
29 |               "No response (are you sure this is an arXiv page?)";
30 |             markdownOutput.value = "";
31 |             return;
32 |           }
33 |           if (response.success) {
34 |             markdownOutput.value = response.markdown;
35 |           } else {
36 |             errorDiv.textContent = response.error || "Unknown error";
37 |             markdownOutput.value = "";
38 |           }
39 |         }
40 |       );
41 |     });
42 |   });
43 | });
44 | 


--------------------------------------------------------------------------------