├── testdata ├── task_fmt.txt ├── fuzz │ └── Fuzz │ │ ├── 5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147 │ │ ├── e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148 │ │ ├── 900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046 │ │ ├── 6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce │ │ ├── 99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906 │ │ ├── b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f │ │ ├── 4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc │ │ ├── e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9 │ │ ├── 38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71 │ │ └── ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0 ├── heading_fmt.txt ├── code_fmt.txt ├── smart.txt ├── emoji.txt ├── linkref_fmt.txt ├── table_fmt.txt ├── headings.txt ├── del.txt ├── footnote.txt ├── spec2txtar.go ├── task.txt ├── gfm_smart.txt ├── cmark2txtar.go ├── table.txt ├── gfm_regress.txt ├── basic_fmt.txt ├── gfm_ext.txt └── autoext.txt ├── go.mod ├── go.sum ├── README.md ├── block.go ├── doc.go ├── htmltags.go ├── emoji2gist.go ├── LICENSE ├── quote.go ├── mdfmt └── main.go ├── table_test.go ├── entity2go.go ├── md2html └── main.go ├── emoji2go.go ├── fuzz_test.go ├── break.go ├── htmlesc.go ├── lex.go ├── line.go ├── footnote.go ├── print.go ├── big_test.go ├── para.go ├── heading.go ├── code.go ├── parse.go ├── table.go ├── list.go ├── md_test.go └── html.go /testdata/task_fmt.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"TaskList": true} 3 | -- gfm279.md -- 4 | - [ ] foo 5 | - [x] bar 6 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | 
string("*[_*]()\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("\\\\\nr\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("*[a*r*]()\n") 3 | -------------------------------------------------------------------------------- /testdata/heading_fmt.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"HeadingID": true} 3 | -- 1 -- 4 | # H {# id } 5 | -- want -- 6 | # H {#id} 7 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("!][![[]()]()]()\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("- e\n\n o\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("- a\n > b\n ` `\n c'= ```\n; d\n") 3 | -------------------------------------------------------------------------------- 
/testdata/fuzz/Fuzz/4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("![foo](/\x10rl \"title%(/url \"titlU%(/url \"\")\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("1. a\n\n \x05\x05\x05\x05\x052n b\n\n 3. c\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("![([foo]![([oo][i.)](u.\u007ffoo][i1)](u1.)](uri3)\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("1. 
foo\n\n ```\n bar\n ``\n\n ` baz\n\n > bo\n\n `>m\n") 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module rsc.io/markdown 2 | 3 | go 1.22.0 4 | 5 | require ( 6 | github.com/yuin/goldmark v1.6.0 // for testing only 7 | golang.org/x/text v0.3.7 8 | golang.org/x/tools v0.1.5 9 | ) 10 | -------------------------------------------------------------------------------- /testdata/code_fmt.txt: -------------------------------------------------------------------------------- 1 | -- 1 -- 2 | `x` 3 | -- want -- 4 | `x` 5 | -- 2 -- 6 | ```x``` 7 | -- want -- 8 | `x` 9 | -- 3 -- 10 | ```` `x` ```` 11 | -- want -- 12 | `` `x` `` 13 | -- 4 -- 14 | `````a ``` b`` ````` 15 | -- want -- 16 | ````a ``` b`` ```` 17 | -------------------------------------------------------------------------------- /testdata/smart.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"SmartQuote": true} 3 | -- 1.md -- 4 | 'hello' 5 | -- 1.html -- 6 |

‘hello’

7 | -- 2.md -- 8 | my'hello' 9 | -- 2.html -- 10 |

my’hello’

11 | -- 3.md -- 12 | [my]'hello' 13 | -- 3.html -- 14 |

[my]’hello’

15 | -------------------------------------------------------------------------------- /testdata/emoji.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"Emoji": true} 3 | -- 1.md -- 4 | emojis 5 | :+1: 6 | :100: 7 | :1st_place_medal: 8 | :negative_squared_cross_mark: 9 | :wales: 10 | :south_georgia_south_sandwich_islands: 11 | :woman_facepalming: 12 | end 13 | -- 1.html -- 14 |

emojis 15 | 👍 16 | 💯 17 | 🥇 18 | ❎ 19 | 🏴󠁧󠁢󠁷󠁬󠁳󠁿 20 | 🇬🇸 21 | 🤦‍♀️ 22 | end

23 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68= 2 | github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 3 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= 4 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 5 | golang.org/x/tools v0.1.5 h1:ouewzE6p+/VEB31YYnTbEJdi8pFqKp4P4n85vwo3DHA= 6 | golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Package markdown is a Commonmark-compliant Markdown parser and 2 | HTML generator. It does not have many bells and whistles, but it does 3 | expose the parsed syntax in an easy-to-use form. 4 | 5 | Work in progress. 6 | 7 | TODO: 8 | - documentation 9 | - make Format always print valid markdown, 10 | even when the tree was constructed manually and may 11 | not correspond to something Parse would return. 12 | - footnote support 13 | - possibly math support 14 | - would it be simpler to have a lexer generated from regexps? 15 | -------------------------------------------------------------------------------- /testdata/linkref_fmt.txt: -------------------------------------------------------------------------------- 1 | Tests for rendering a document's link references in markdown. 2 | -- simple -- 3 | A document. 4 | 5 | [foo]: u 6 | -- want -- 7 | A document. 8 | 9 | [foo]: u 10 | -- sorted -- 11 | A document. 12 | 13 | [foo]: u1 14 | [bar]: u2 15 | -- want -- 16 | A document. 17 | 18 | [bar]: u2 19 | [foo]: u1 20 | -- interleaved -- 21 | First. 22 | 23 | [foo]: u1 24 | Second. 25 | 26 | [bar]: u2 27 | -- want -- 28 | First. 
29 | 30 | Second. 31 | 32 | [bar]: u2 33 | [foo]: u1 34 | -- titles -- 35 | A document. 36 | 37 | [r1]: u1 (title1) 38 | [r2]: u2 "title2" 39 | [r3]: u3 'title3' 40 | -------------------------------------------------------------------------------- /block.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | // Block is implemented by: 8 | // 9 | // CodeBlock 10 | // Document 11 | // Empty 12 | // HTMLBlock 13 | // Heading 14 | // Item 15 | // List 16 | // Paragraph 17 | // Quote 18 | // Text 19 | // ThematicBreak 20 | type Block interface { 21 | Block() 22 | Pos() Position 23 | printHTML(p *printer) 24 | printMarkdown(p *printer) 25 | } 26 | 27 | type Position struct { 28 | StartLine int 29 | EndLine int 30 | } 31 | 32 | func (p Position) Pos() Position { 33 | return p 34 | } 35 | -------------------------------------------------------------------------------- /testdata/table_fmt.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"Table": true} 3 | -- padded -- 4 | |foo|bar|baz| 5 | |--|--|--| 6 | |1|2|3| 7 | |a|b|c| 8 | -- want -- 9 | | foo | bar | baz | 10 | | --- | --- | --- | 11 | | 1 | 2 | 3 | 12 | | a | b | c | 13 | -- aligned -- 14 | |foo|bär|baz| 15 | |:--|:-:|--:| 16 | |1|2|3| 17 | |a|b|c| 18 | -- want -- 19 | | foo | bär | baz | 20 | | :-- | :-: | --: | 21 | | 1 | 2 | 3 | 22 | | a | b | c | 23 | -- with_normalized_inline -- 24 | |[foo](u1 )| 25 | |---| 26 | |1| 27 | |a| 28 | -- want -- 29 | | [foo](u1) | 30 | | --------- | 31 | | 1 | 32 | | a | 33 | -- indented -- 34 | - item 1 35 | 36 | | col1 | col2 | 37 | | ---- | ---- | 38 | | 1 | 2 | 39 | -- bigvalues -- 40 | | foo | bar | baz | 41 | | --- | -------- | --- | 42 | | 1 | 22345678 | 3 | 43 | | a | b | c | 44 | 
-------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | type Document struct { 8 | Position 9 | Blocks []Block 10 | Links map[string]*Link 11 | } 12 | 13 | func (*Document) Block() {} 14 | 15 | func (b *Document) printHTML(p *printer) { 16 | for _, c := range b.Blocks { 17 | c.printHTML(p) 18 | } 19 | } 20 | 21 | func (b *Document) printMarkdown(p *printer) { 22 | printMarkdownBlocks(b.Blocks, p) 23 | 24 | // Terminate with a single newline. 25 | text := p.buf.Bytes() 26 | w := len(text) 27 | for w > 0 && text[w-1] == '\n' { 28 | w-- 29 | } 30 | p.buf.Truncate(w) 31 | if w > 0 { 32 | p.nl() 33 | } 34 | 35 | // Add link reference definitions. 36 | if len(b.Links) > 0 { 37 | if p.buf.Len() > 0 { 38 | p.nl() 39 | } 40 | printLinks(p, b.Links) 41 | } 42 | } 43 | 44 | func printMarkdownBlocks(bs []Block, p *printer) { 45 | for bn, b := range bs { 46 | if bn > 0 { 47 | p.nl() // end block 48 | if p.loose > 0 { 49 | p.nl() 50 | } 51 | } 52 | b.printMarkdown(p) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /testdata/headings.txt: -------------------------------------------------------------------------------- 1 | Goldmark fails on 11 because it doesn't like slashes or spaces in ids. 2 | -- parser.json -- 3 | {"HeadingID": true} 4 | -- 1.md -- 5 | # Heading 6 | -- 1.html -- 7 |

Heading

8 | -- 2.md -- 9 | # Heading ### 10 | -- 2.html -- 11 |

Heading

12 | -- 3.md -- 13 | # Heading {#id} 14 | -- 3.html -- 15 |

Heading

16 | -- 4.md -- 17 | # Heading {#id} ## 18 | -- 4.html -- 19 |

Heading

20 | -- 5.md -- 21 | # Heading {#id} more 22 | -- 5.html -- 23 |

Heading {#id} more

24 | -- 6.md -- 25 | # Heading {nope} 26 | -- 6.html -- 27 |

Heading {nope}

28 | -- 7.md -- 29 | # Heading {uhuh 30 | -- 7.html -- 31 |

Heading {uhuh

32 | -- 8.md -- 33 | # {#no} Heading 34 | -- 8.html -- 35 |

{#no} Heading

36 | -- 9.md -- 37 | # Heading {#id1} {#id2} 38 | -- 9.html -- 39 |

Heading {#id1}

40 | -- 10.md -- 41 | # Heading {#id1} {#id2 42 | -- 10.html -- 43 |

Heading {#id1} {#id2

44 | -- 11.md -- 45 | # Heading {#a/b c} 46 | -- 11.html -- 47 |

Heading

48 | -- 12.md -- 49 | # {} 50 | -- 12.html -- 51 |

{}

52 | -- 13.md -- 53 | # {#} 54 | -- 13.html -- 55 |

{#}

56 | -------------------------------------------------------------------------------- /htmltags.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | // htmlTags lists the known HTML tags for HTML block type 6. 8 | // See https://spec.commonmark.org/0.31.2/#html-blocks. 9 | var htmlTags = []string{ 10 | "address", 11 | "article", 12 | "aside", 13 | "base", 14 | "basefont", 15 | "blockquote", 16 | "body", 17 | "caption", 18 | "center", 19 | "col", 20 | "colgroup", 21 | "dd", 22 | "details", 23 | "dialog", 24 | "dir", 25 | "div", 26 | "dl", 27 | "dt", 28 | "fieldset", 29 | "figcaption", 30 | "figure", 31 | "footer", 32 | "form", 33 | "frame", 34 | "frameset", 35 | "h1", 36 | "h2", 37 | "h3", 38 | "h4", 39 | "h5", 40 | "h6", 41 | "head", 42 | "header", 43 | "hr", 44 | "html", 45 | "iframe", 46 | "legend", 47 | "li", 48 | "link", 49 | "main", 50 | "menu", 51 | "menuitem", 52 | "nav", 53 | "noframes", 54 | "ol", 55 | "optgroup", 56 | "option", 57 | "p", 58 | "param", 59 | "section", 60 | "source", 61 | "summary", 62 | "table", 63 | "tbody", 64 | "td", 65 | "tfoot", 66 | "th", 67 | "thead", 68 | "title", 69 | "tr", 70 | "track", 71 | "ul", 72 | } 73 | -------------------------------------------------------------------------------- /emoji2gist.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build ignore 6 | 7 | package main 8 | 9 | import ( 10 | "bytes" 11 | "encoding/json" 12 | "flag" 13 | "fmt" 14 | "io" 15 | "log" 16 | "net/http" 17 | "os" 18 | "sort" 19 | ) 20 | 21 | var outfile = flag.String("o", "", "write output to `file`") 22 | 23 | func main() { 24 | log.SetFlags(0) 25 | log.SetPrefix("emoji2gist: ") 26 | flag.Parse() 27 | 28 | resp, err := http.Get("https://api.github.com/emojis") 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | if resp.StatusCode != 200 { 33 | log.Fatal(resp.Status) 34 | } 35 | data, err := io.ReadAll(resp.Body) 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | 40 | list := make(map[string]string) 41 | err = json.Unmarshal(data, &list) 42 | if err != nil { 43 | log.Fatal(err) 44 | } 45 | 46 | var names []string 47 | for name := range list { 48 | names = append(names, name) 49 | } 50 | sort.Strings(names) 51 | 52 | var buf bytes.Buffer 53 | fmt.Fprintf(&buf, "code | emoji\n-|-\n") 54 | for _, name := range names { 55 | fmt.Fprintf(&buf, "`%s` | :%s:\n", name, name) 56 | } 57 | 58 | if *outfile != "" { 59 | if err := os.WriteFile(*outfile, buf.Bytes(), 0666); err != nil { 60 | log.Fatal(err) 61 | } 62 | } else { 63 | os.Stdout.Write(buf.Bytes()) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /testdata/del.txt: -------------------------------------------------------------------------------- 1 | Strikethrough tests. 2 | 3 | gfm* from https://github.github.com/gfm/#strikethrough-extension- 4 | (version 0.29-gfm (2019-04-06)) 5 | 6 | Others by hand, guessing based on GitHub behavior. 7 | 8 | -- parser.json -- 9 | {"Strikethrough": true} 10 | -- gfm491.md -- 11 | ~~Hi~~ Hello, ~there~ world! 12 | -- gfm491.html -- 13 |

Hi Hello, there world!

14 | -- gfm492.md -- 15 | This ~~has a 16 | 17 | new paragraph~~. 18 | -- gfm492.html -- 19 |

This ~~has a

20 |

new paragraph~~.

21 | -- gfm493.md -- 22 | This will ~~~not~~~ strike. 23 | -- gfm493.html -- 24 |

This will ~~~not~~~ strike.

25 | -- 1.md -- 26 | 5*6*78 27 | 5_6_78 28 | 5~6~78 29 | -- 1.html -- 30 |

5678 31 | 5_6_78 32 | 5678

33 | -- 2.md -- 34 | ~~Hi~~ Hello, ~~there~~ world! 35 | 5~~6~~78 36 | -- 2.html -- 37 |

Hi Hello, there world! 38 | 5678

39 | -- 3.md -- 40 | ~~___`this`___~~ 41 | -- 3.html -- 42 |

this

43 | -- 4.md -- 44 | ~~***`this`***~~ 45 | -- 4.html -- 46 |

this

47 | -- 5.md -- 48 | ~~*this*~~ 49 | -- 5.html -- 50 |

this

51 | -- 6.md -- 52 | ~~_this_~~ 53 | -- 6.html -- 54 |

this

55 | -- 7.md -- 56 | ~~___this___~~ 57 | -- 7.html -- 58 |

this

59 | -- 8.md -- 60 | ~~__this__~~ 61 | -- 8.html -- 62 |

this

63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /testdata/footnote.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"Footnote": true} 3 | -- 1.md -- 4 | Here is a simple footnote[^1][^4]. 5 | 6 | A footnote can also[^3] have multiple lines[^4]. 7 | 8 | [^1]: My reference. 9 | [^4]: To add line breaks within a footnote, prefix new lines with 2 spaces. 10 | This is a second line. 11 | -- 1.html -- 12 |

Here is a simple footnote12.

13 |

A footnote can also[^3] have multiple lines2.

14 |
Footnotes
15 |
    16 |
  1. 17 |

    My reference. 18 |

    19 |
  2. 20 |
  3. 21 |

    To add line breaks within a footnote, prefix new lines with 2 spaces. 22 | This is a second line. 23 | 24 |

    25 |
  4. 26 |
27 | -- 2.md -- 28 | Footnote[^abc]. 29 | 30 | [^aBc]: Hi. 31 | -- 2.html -- 32 |

Footnote1.

33 |
Footnotes
34 |
    35 |
  1. 36 |

    Hi. 37 |

    38 |
  2. 39 |
40 | -- 3.md -- 41 | Footnote[^aBc]. 42 | 43 | [^abC]: Hi. 44 | -- 3.html -- 45 |

Footnote1.

46 |
Footnotes
47 |
    48 |
  1. 49 |

    Hi. 50 |

    51 |
  2. 52 |
53 | -------------------------------------------------------------------------------- /quote.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | // A Quote is a [Block] representing a [block quote]. 8 | // 9 | // [block quote]: https://spec.commonmark.org/0.31.2/#block-quotes 10 | type Quote struct { 11 | Position 12 | Blocks []Block // content of quote 13 | } 14 | 15 | func (*Quote) Block() {} 16 | 17 | func (b *Quote) printHTML(p *printer) { 18 | p.html("
\n") 19 | for _, c := range b.Blocks { 20 | c.printHTML(p) 21 | } 22 | p.html("
\n") 23 | } 24 | 25 | func (b *Quote) printMarkdown(p *printer) { 26 | p.maybeQuoteNL('>') 27 | p.WriteString("> ") 28 | defer p.pop(p.push("> ")) 29 | printMarkdownBlocks(b.Blocks, p) 30 | } 31 | 32 | // A quoteBuildier is a [blockBuilder] for a block quote. 33 | type quoteBuilder struct{} 34 | 35 | // startBlockQuote is a [starter] for a [Quote]. 36 | func startBlockQuote(p *parser, s line) (line, bool) { 37 | line, ok := trimQuote(s) 38 | if !ok { 39 | return s, false 40 | } 41 | p.addBlock(new(quoteBuilder)) 42 | return line, true 43 | } 44 | 45 | func trimQuote(s line) (line, bool) { 46 | t := s 47 | t.trimSpace(0, 3, false) 48 | if !t.trim('>') { 49 | return s, false 50 | } 51 | t.trimSpace(0, 1, true) 52 | return t, true 53 | } 54 | 55 | func (b *quoteBuilder) extend(p *parser, s line) (line, bool) { 56 | return trimQuote(s) 57 | } 58 | 59 | func (b *quoteBuilder) build(p *parser) Block { 60 | return &Quote{p.pos(), p.blocks()} 61 | } 62 | -------------------------------------------------------------------------------- /mdfmt/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Mdfmt reformats Markdown data. 6 | // 7 | // Usage: 8 | // 9 | // mdfmt [-w] [file...] 10 | // 11 | // Mdfmt reads the named files, or else standard input, as Markdown documents 12 | // and then reprints the same Markdown documents to standard output. 13 | // 14 | // The -w flag specifies to rewrite the files in place. 
15 | package main 16 | 17 | import ( 18 | "flag" 19 | "fmt" 20 | "io" 21 | "log" 22 | "os" 23 | 24 | "rsc.io/markdown" 25 | ) 26 | 27 | var ( 28 | wflag = flag.Bool("w", false, "write reformatted Markdown back to input files") 29 | exit = 0 30 | ) 31 | 32 | func usage() { 33 | fmt.Fprintf(os.Stderr, "usage: mdfmt [-w] [file...]\n") 34 | flag.PrintDefaults() 35 | os.Exit(2) 36 | } 37 | 38 | func main() { 39 | log.SetPrefix("mdfmt: ") 40 | log.SetFlags(0) 41 | flag.Usage = usage 42 | flag.Parse() 43 | 44 | if flag.NArg() == 0 { 45 | data, err := io.ReadAll(os.Stdin) 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | convert(data, "") 50 | } else { 51 | for _, file := range flag.Args() { 52 | data, err := os.ReadFile(file) 53 | if err != nil { 54 | log.Print(err) 55 | exit = 1 56 | continue 57 | } 58 | convert(data, file) 59 | } 60 | } 61 | os.Exit(exit) 62 | } 63 | 64 | func convert(data []byte, file string) { 65 | var p markdown.Parser 66 | doc := p.Parse(string(data)) 67 | out := []byte(markdown.Format(doc)) 68 | if *wflag && file != "" { 69 | if err := os.WriteFile(file, out, 0666); err != nil { 70 | log.Print(err) 71 | exit = 1 72 | return 73 | } 74 | } else { 75 | os.Stdout.Write(out) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /testdata/spec2txtar.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | // go run spec2txtar.go https://spec.commonmark.org/0.30/spec.json > spec0.30.txt 6 | 7 | package main 8 | 9 | import ( 10 | "encoding/json" 11 | "flag" 12 | "fmt" 13 | "io" 14 | "log" 15 | "net/http" 16 | "os" 17 | "strings" 18 | 19 | "golang.org/x/tools/txtar" 20 | ) 21 | 22 | type specCase struct { 23 | Name string 24 | Markdown string 25 | HTML string 26 | Example int 27 | } 28 | 29 | func main() { 30 | log.SetFlags(0) 31 | log.SetPrefix("spec2txtar: ") 32 | flag.Usage = func() { 33 | fmt.Fprintf(os.Stderr, "usage: spec2txtar url\n") 34 | os.Exit(2) 35 | } 36 | flag.Parse() 37 | if flag.NArg() != 1 { 38 | flag.Usage() 39 | } 40 | url := flag.Arg(0) 41 | 42 | resp, err := http.Get(url) 43 | if err != nil { 44 | log.Fatal(err) 45 | } 46 | if resp.StatusCode != 200 { 47 | log.Fatal(resp.Status) 48 | } 49 | data, err := io.ReadAll(resp.Body) 50 | if err != nil { 51 | log.Fatal(err) 52 | } 53 | 54 | var spec []specCase 55 | err = json.Unmarshal(data, &spec) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | 60 | a := &txtar.Archive{ 61 | Comment: []byte("// go run spec2txtar.go " + url + "\n"), 62 | } 63 | for _, cas := range spec { 64 | name := fmt.Sprintf("%d", cas.Example) 65 | a.Files = append(a.Files, 66 | txtar.File{ 67 | Name: name + ".md", 68 | Data: []byte(encode(cas.Markdown)), 69 | }, 70 | txtar.File{ 71 | Name: name + ".html", 72 | Data: []byte(encode(cas.HTML)), 73 | }, 74 | ) 75 | } 76 | 77 | os.Stdout.Write(txtar.Format(a)) 78 | } 79 | 80 | func encode(s string) string { 81 | s = strings.ReplaceAll(s, " \n", " ^J\n") 82 | s = strings.ReplaceAll(s, "\t\n", "\t^J\n") 83 | if s != "" && !strings.HasSuffix(s, "\n") { 84 | s += "^D\n" 85 | } 86 | return s 87 | } 88 | -------------------------------------------------------------------------------- /table_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | var tableCountTests = []struct { 12 | row string 13 | n int 14 | }{ 15 | {"|", 1}, 16 | {"|x|", 1}, 17 | {"||", 1}, 18 | {"| |", 1}, 19 | {"| | |", 2}, 20 | {"| | Foo | Bar |", 3}, 21 | {"| | Foo | Bar |", 3}, 22 | {"", 1}, 23 | {"|a|b", 2}, 24 | {"|a| ", 1}, 25 | {" |b", 1}, 26 | {"a|b", 2}, 27 | {`x\|y`, 1}, 28 | {`x\\|y`, 1}, 29 | {`x\\\|y`, 1}, 30 | {`x\\\\|y`, 1}, 31 | {`x\\\\\|y`, 1}, 32 | {`| 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8 |`, 1}, 33 | } 34 | 35 | func TestTableCount(t *testing.T) { 36 | for _, tt := range tableCountTests { 37 | n := tableCount(tableTrimOuter(tt.row)) 38 | if n != tt.n { 39 | t.Errorf("tableCount(%#q) = %d, want %d", tt.row, n, tt.n) 40 | } 41 | } 42 | } 43 | 44 | func TestPad(t *testing.T) { 45 | testCases := []struct { 46 | raw, align string 47 | w int 48 | 49 | want string 50 | }{ 51 | {"foo", "center", 8, " foo "}, 52 | {"foo", "center", 6, " foo "}, 53 | {"foo", "center", 5, " foo "}, 54 | {"föó", "center", 5, " föó "}, 55 | {"foo", "center", 4, "foo "}, 56 | {"foo", "center", 3, "foo"}, 57 | 58 | {"foo", "left", 8, "foo "}, 59 | {"foo", "right", 8, " foo"}, 60 | {"foo", "", 8, "foo "}, 61 | 62 | {"foo", "left", 6, "foo "}, 63 | {"foo", "right", 6, " foo"}, 64 | {"foo", "", 6, "foo "}, 65 | 66 | {"foo", "left", 5, "foo "}, 67 | {"foo", "right", 5, " foo"}, 68 | {"foo", "", 5, "foo "}, 69 | 70 | {"foo", "left", 4, "foo "}, 71 | {"foo", "right", 4, " foo"}, 72 | {"foo", "", 4, "foo "}, 73 | 74 | {"foo", "left", 3, "foo"}, 75 | {"foo", "right", 3, "foo"}, 76 | {"foo", "", 3, "foo"}, 77 | } 78 | 79 | for _, tc := range testCases { 80 | in := tc.raw 81 | a := tc.align 82 | w := tc.w 83 | want := tc.want 84 | var p printer 85 | pad(&p, in, a, w) 86 | h := p.buf.String() 87 | if h != want { 88 | t.Errorf("\npad(%s, %s, %d)\n have %q\n want 
%q", in, a, w, h, want) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /entity2go.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build ignore 6 | 7 | package main 8 | 9 | import ( 10 | "bytes" 11 | "encoding/json" 12 | "flag" 13 | "fmt" 14 | "go/format" 15 | "io" 16 | "log" 17 | "net/http" 18 | "os" 19 | "sort" 20 | "strings" 21 | ) 22 | 23 | var outfile = flag.String("o", "", "write output to `file`") 24 | 25 | func main() { 26 | log.SetFlags(0) 27 | log.SetPrefix("entity2go: ") 28 | flag.Parse() 29 | 30 | resp, err := http.Get("https://html.spec.whatwg.org/entities.json") 31 | if err != nil { 32 | log.Fatal(err) 33 | } 34 | if resp.StatusCode != 200 { 35 | log.Fatal(resp.Status) 36 | } 37 | data, err := io.ReadAll(resp.Body) 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | 42 | list := make(map[string]struct { 43 | Codepoints []rune 44 | }) 45 | err = json.Unmarshal(data, &list) 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | 50 | var names []string 51 | for name := range list { 52 | names = append(names, name) 53 | } 54 | sort.Strings(names) 55 | 56 | var buf bytes.Buffer 57 | buf.WriteString(hdr) 58 | fmt.Fprintf(&buf, "var htmlEntity = map[string]string{\n") 59 | for _, name := range names { 60 | if !strings.HasSuffix(name, ";") { 61 | continue 62 | } 63 | fmt.Fprintf(&buf, "\t%q: \"", name) 64 | for _, r := range list[name].Codepoints { 65 | if r <= 0xFFFF { 66 | fmt.Fprintf(&buf, "\\u%04x", r) 67 | } else { 68 | fmt.Fprintf(&buf, "\\U%08x", r) 69 | } 70 | } 71 | fmt.Fprintf(&buf, "\",\n") 72 | } 73 | fmt.Fprintf(&buf, "}\n") 74 | 75 | src, err := format.Source(buf.Bytes()) 76 | if err != nil { 77 | log.Fatalf("reformatting output: %v", err) 78 | } 79 | 80 | if *outfile != 
"" { 81 | if err := os.WriteFile(*outfile, src, 0666); err != nil { 82 | log.Fatal(err) 83 | } 84 | } else { 85 | os.Stdout.Write(buf.Bytes()) 86 | } 87 | } 88 | 89 | var hdr = `// Copyright 2023 The Go Authors. All rights reserved. 90 | // Use of this source code is governed by a BSD-style 91 | // license that can be found in the LICENSE file. 92 | 93 | //go:generate go run entity2go.go -o entity.go 94 | 95 | package markdown 96 | 97 | // htmlEntity maps known HTML entity sequences to their meanings. 98 | ` 99 | -------------------------------------------------------------------------------- /md2html/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Md2html converts Markdown to HTML. 6 | // 7 | // Usage: 8 | // 9 | // md2html [file...] 10 | // 11 | // Md2html reads the named files, or else standard input, as Markdown documents 12 | // and then prints the corresponding HTML to standard output. 13 | package main 14 | 15 | import ( 16 | "bytes" 17 | "flag" 18 | "io/ioutil" 19 | "log" 20 | "os" 21 | "unicode/utf8" 22 | 23 | "rsc.io/markdown" 24 | ) 25 | 26 | func main() { 27 | flag.Parse() 28 | args := flag.Args() 29 | if len(args) == 0 { 30 | do(os.Stdin) 31 | } else { 32 | for _, arg := range args { 33 | f, err := os.Open(arg) 34 | if err != nil { 35 | log.Fatal(err) 36 | } 37 | do(f) 38 | f.Close() 39 | } 40 | } 41 | } 42 | 43 | func do(f *os.File) { 44 | data, err := ioutil.ReadAll(f) 45 | if err != nil { 46 | log.Fatal(err) 47 | } 48 | os.Stdout.WriteString(toHTML(data)) 49 | } 50 | 51 | // toHTML converts Markdown to HTML. 
52 | func toHTML(md []byte) string { 53 | var p markdown.Parser 54 | p.Table = true 55 | return markdown.ToHTML(p.Parse(string(replaceTabs(md)))) 56 | } 57 | 58 | // replaceTabs replaces all tabs in text with spaces up to a 4-space tab stop. 59 | // 60 | // In Markdown, tabs used for indentation are required to be interpreted as 61 | // 4-space tab stops. See https://spec.commonmark.org/0.30/#tabs. 62 | // Go also renders nicely and more compactly on the screen with 4-space 63 | // tab stops, while browsers often use 8-space. 64 | // Make the Go code consistently compact across browsers, 65 | // all while staying Markdown-compatible, by expanding to 4-space tab stops. 66 | // 67 | // This function does not handle multi-codepoint Unicode sequences correctly. 68 | func replaceTabs(text []byte) []byte { 69 | var buf bytes.Buffer 70 | col := 0 71 | for len(text) > 0 { 72 | r, size := utf8.DecodeRune(text) 73 | text = text[size:] 74 | 75 | switch r { 76 | case '\n': 77 | buf.WriteByte('\n') 78 | col = 0 79 | 80 | case '\t': 81 | buf.WriteByte(' ') 82 | col++ 83 | for col%4 != 0 { 84 | buf.WriteByte(' ') 85 | col++ 86 | } 87 | 88 | default: 89 | buf.WriteRune(r) 90 | col++ 91 | } 92 | } 93 | return buf.Bytes() 94 | } 95 | -------------------------------------------------------------------------------- /testdata/task.txt: -------------------------------------------------------------------------------- 1 | Task list items tests. 2 | 3 | gfm* from https://github.github.com/gfm/#task-list-items-extension- 4 | (version 0.29-gfm (2019-04-06)) 5 | 6 | Others by hand, guessing based on GitHub behavior. 
7 | 8 | -- parser.json -- 9 | {"TaskList": true} 10 | -- gfm279.md -- 11 | - [ ] foo 12 | - [x] bar 13 | -- gfm279.html -- 14 | 18 | -- gfm280.md -- 19 | - [x] foo 20 | - [ ] bar 21 | - [x] baz 22 | - [ ] bim 23 | -- gfm280.html -- 24 | 33 | -- spaces.md -- 34 | - [ ] foo 35 | - [x] bar 36 | - [ ]quux 37 | -- spaces.html -- 38 | 43 | -- wxyz.md -- 44 | - [w] woolloomooloo 45 | - [x] foo 46 | - [y] bar 47 | - [z] baz 48 | -- wxyz.html -- 49 | 55 | -- X.md -- 56 | - [x] foo 57 | - [X] bar 58 | - [ ] baz 59 | -- X.html -- 60 | 65 | -- 1.md -- 66 | - [x] foo 67 | - 68 | - [x] bar 69 | - hello 70 | - > quote 71 | - *emph* 72 | -- 1.html -- 73 | 85 | -- 2.md -- 86 | - [x] foo 87 | - 88 | - [x] bar 89 | 90 | - hello 91 | -- 2.html -- 92 | 104 | -------------------------------------------------------------------------------- /testdata/gfm_smart.txt: -------------------------------------------------------------------------------- 1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/smart_punct.txt 2 | -- parser.json -- 3 | {"SmartQuote": true, "SmartDash": true, "SmartDot": true} 4 | -- 1.md -- 5 | "Hello," said the spider. 6 | "'Shelob' is my name." 7 | -- 1.html -- 8 |

“Hello,” said the spider. 9 | “‘Shelob’ is my name.”

10 | -- 2.md -- 11 | 'A', 'B', and 'C' are letters. 12 | -- 2.html -- 13 |

‘A’, ‘B’, and ‘C’ are letters.

14 | -- 3.md -- 15 | 'Oak,' 'elm,' and 'beech' are names of trees. 16 | So is 'pine.' 17 | -- 3.html -- 18 |

‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. 19 | So is ‘pine.’

20 | -- 4.md -- 21 | 'He said, "I want to go."' 22 | -- 4.html -- 23 |

‘He said, “I want to go.”’

24 | -- 5.md -- 25 | Were you alive in the 70's? 26 | -- 5.html -- 27 |

Were you alive in the 70’s?

28 | -- 6.md -- 29 | Here is some quoted '`code`' and a "[quoted link](url)". 30 | -- 6.html -- 31 |

Here is some quoted ‘code’ and a “quoted link”.

32 | -- 7.md -- 33 | 'tis the season to be 'jolly' 34 | -- 7.html -- 35 |

’tis the season to be ‘jolly’

36 | -- 8.md -- 37 | 'We'll use Jane's boat and John's truck,' Jenna said. 38 | -- 8.html -- 39 |

‘We’ll use Jane’s boat and John’s truck,’ Jenna said.

40 | -- 9.md -- 41 | "A paragraph with no closing quote. 42 | 43 | "Second paragraph by same speaker, in fiction." 44 | -- 9.html -- 45 |

“A paragraph with no closing quote.

46 |

“Second paragraph by same speaker, in fiction.”

47 | -- 10.md -- 48 | [a]'s b' 49 | -- 10.html -- 50 |

[a]’s b’

51 | -- 11.md -- 52 | \"This is not smart.\" 53 | This isn\'t either. 54 | 5\'8\" 55 | -- 11.html -- 56 |

"This is not smart." 57 | This isn't either. 58 | 5'8"

59 | -- 12.md -- 60 | Some dashes: em---em 61 | en--en 62 | em --- em 63 | en -- en 64 | 2--3 65 | -- 12.html -- 66 |

Some dashes: em—em 67 | en–en 68 | em — em 69 | en – en 70 | 2–3

71 | -- 13.md -- 72 | one- 73 | two-- 74 | three--- 75 | four---- 76 | five----- 77 | six------ 78 | seven------- 79 | eight-------- 80 | nine--------- 81 | thirteen-------------. 82 | -- 13.html -- 83 |

one- 84 | two– 85 | three— 86 | four–– 87 | five—– 88 | six—— 89 | seven—–– 90 | eight–––– 91 | nine——— 92 | thirteen———––.

93 | -- 14.md -- 94 | Escaped hyphens: \-- \-\-\-. 95 | -- 14.html -- 96 |

Escaped hyphens: -- ---.

97 | -- 15.md -- 98 | Ellipses...and...and.... 99 | -- 15.html -- 100 |

Ellipses…and…and….

101 | -- 16.md -- 102 | No ellipses\.\.\. 103 | -- 16.html -- 104 |

No ellipses...

105 | -------------------------------------------------------------------------------- /testdata/cmark2txtar.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "strings" 13 | 14 | "golang.org/x/tools/txtar" 15 | "rsc.io/markdown" 16 | ) 17 | 18 | var parsers = map[string]string{ 19 | "example autolink": `{"AutoLinkText": true, "AutoLinkAssumeHTTP": true}`, 20 | "example disabled": `{"TaskListItems": true}`, 21 | "example strikethrough": `{"Strikethrough": true}`, 22 | "example table": `{"Table": true}`, 23 | } 24 | 25 | func main() { 26 | log.SetFlags(0) 27 | log.SetPrefix("cmark2txtar: ") 28 | flag.Usage = func() { 29 | fmt.Fprintf(os.Stderr, "usage: cmark2txtar file\n") 30 | os.Exit(2) 31 | } 32 | flag.Parse() 33 | if flag.NArg() != 1 { 34 | flag.Usage() 35 | } 36 | file := flag.Arg(0) 37 | 38 | data, err := os.ReadFile(file) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | 43 | a := &txtar.Archive{ 44 | Comment: []byte("// go run cmark2txtar.go " + file + "\n"), 45 | } 46 | 47 | var p markdown.Parser 48 | doc := p.Parse(string(data)) 49 | n := 0 50 | for _, b := range doc.Blocks { 51 | var in, out []string 52 | b, ok := b.(*markdown.CodeBlock) 53 | if !ok || !strings.HasPrefix(b.Info, "example") { 54 | continue 55 | } 56 | for i := 0; i < len(b.Text); i++ { 57 | if b.Text[i] == "." { 58 | in, out = b.Text[:i], b.Text[i+1:] 59 | goto Found 60 | } 61 | } 62 | log.Fatalf("did not find . 
in pre block:\n%s", strings.Join(b.Text, "\n")) 63 | Found: 64 | parserChange := false 65 | if b.Info != "example" { 66 | js, ok := parsers[b.Info] 67 | if !ok { 68 | log.Printf("skipping %s", b.Info) 69 | continue 70 | } 71 | parserChange = true 72 | a.Files = append(a.Files, txtar.File{Name: "parser.json", Data: []byte(js)}) 73 | } 74 | n++ 75 | name := fmt.Sprintf("%d", n) 76 | a.Files = append(a.Files, 77 | txtar.File{ 78 | Name: name + ".md", 79 | Data: []byte(encode(join(in))), 80 | }, 81 | txtar.File{ 82 | Name: name + ".html", 83 | Data: []byte(encode(join(out))), 84 | }, 85 | ) 86 | if parserChange { 87 | a.Files = append(a.Files, txtar.File{Name: "parser.json", Data: []byte(`{}`)}) 88 | } 89 | } 90 | 91 | os.Stdout.Write(txtar.Format(a)) 92 | } 93 | 94 | func encode(s string) string { 95 | s = strings.ReplaceAll(s, " \n", " ^J\n") 96 | s = strings.ReplaceAll(s, "\t\n", "\t^J\n") 97 | if s != "" && !strings.HasSuffix(s, "\n") { 98 | s += "^D\n" 99 | } 100 | return s 101 | } 102 | 103 | func join(s []string) string { 104 | if len(s) == 0 { 105 | return "" 106 | } 107 | x := strings.Join(s, "\n") + "\n" 108 | x = strings.ReplaceAll(x, "→", "\t") 109 | return x 110 | } 111 | -------------------------------------------------------------------------------- /emoji2go.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | //go:build ignore 6 | 7 | package main 8 | 9 | import ( 10 | "bytes" 11 | "encoding/json" 12 | "flag" 13 | "fmt" 14 | "go/format" 15 | "io" 16 | "log" 17 | "net/http" 18 | "os" 19 | "regexp" 20 | "sort" 21 | "strconv" 22 | "strings" 23 | ) 24 | 25 | var outfile = flag.String("o", "", "write output to `file`") 26 | 27 | func get(url string) []byte { 28 | resp, err := http.Get(url) 29 | if err != nil { 30 | log.Fatal(err) 31 | } 32 | if resp.StatusCode != 200 { 33 | log.Fatal(resp.Status) 34 | } 35 | data, err := io.ReadAll(resp.Body) 36 | if err != nil { 37 | log.Fatal(err) 38 | } 39 | return data 40 | } 41 | 42 | var gemojiRE = regexp.MustCompile(`]*>`) 43 | 44 | func main() { 45 | log.SetFlags(0) 46 | log.SetPrefix("emoji2go: ") 47 | flag.Parse() 48 | 49 | emojiJSON := get("https://api.github.com/emojis") 50 | list := make(map[string]string) 51 | err := json.Unmarshal(emojiJSON, &list) 52 | if err != nil { 53 | log.Fatal(err) 54 | } 55 | 56 | var names []string 57 | for name := range list { 58 | names = append(names, name) 59 | } 60 | sort.Strings(names) 61 | 62 | emojiHTML := string(get("https://gist.github.com/rsc/316bc98c066ad111973634d435203aac")) 63 | 64 | bad := false 65 | var buf bytes.Buffer 66 | buf.WriteString(hdr) 67 | fmt.Fprintf(&buf, "var emoji = map[string]string{\n") 68 | n := 0 69 | for _, name := range names { 70 | n = max(n, len(name)) 71 | _, val, ok := strings.Cut(emojiHTML, ""+name+"\n") 72 | if !ok { 73 | log.Printf("gist missing :%s:", name) 74 | bad = true 75 | continue 76 | } 77 | val, _, ok = strings.Cut(val, "") 78 | if !ok { 79 | log.Printf("gist missing :%s:", name) 80 | bad = true 81 | continue 82 | } 83 | val = gemojiRE.ReplaceAllString(val, "") 84 | if strings.Contains(val, "<") { 85 | log.Printf("skipping %s: non-unicode: %s", name, val) 86 | continue 87 | } 88 | fmt.Fprintf(&buf, "\t%q: %s,\n", name, strconv.QuoteToASCII(val)) 89 | } 90 | fmt.Fprintf(&buf, "}\n\n") 91 | 92 | fmt.Fprintf(&buf, "const maxEmojiLen = %d\n", 
n) 93 | 94 | if bad { 95 | os.Exit(1) 96 | } 97 | 98 | src, err := format.Source(buf.Bytes()) 99 | if err != nil { 100 | log.Fatalf("reformatting output: %v", err) 101 | } 102 | 103 | if *outfile != "" { 104 | if err := os.WriteFile(*outfile, src, 0666); err != nil { 105 | log.Fatal(err) 106 | } 107 | } else { 108 | os.Stdout.Write(src) 109 | } 110 | } 111 | 112 | var hdr = `// Copyright 2023 The Go Authors. All rights reserved. 113 | // Use of this source code is governed by a BSD-style 114 | // license that can be found in the LICENSE file. 115 | 116 | //go:generate go run emoji2go.go -o emoji.go 117 | 118 | package markdown 119 | 120 | // emoji maps known emoji names to their UTF-8 emoji forms. 121 | ` 122 | -------------------------------------------------------------------------------- /fuzz_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package markdown 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "net/url" 11 | "path/filepath" 12 | "strings" 13 | "testing" 14 | "unicode/utf8" 15 | 16 | "golang.org/x/tools/txtar" 17 | ) 18 | 19 | func FuzzGoldmark(f *testing.F) { 20 | if !*goldmarkFlag { 21 | f.Skip("-goldmark not set") 22 | } 23 | files, err := filepath.Glob("testdata/*.txt") 24 | if err != nil { 25 | f.Fatal(err) 26 | } 27 | for _, file := range files { 28 | if strings.HasSuffix(file, "to_markdown.txt") { 29 | continue 30 | } 31 | a, err := txtar.ParseFile(file) 32 | if err != nil { 33 | f.Fatal(err) 34 | } 35 | for i := 0; i+2 <= len(a.Files); { 36 | if a.Files[i].Name == "parser.json" { 37 | i++ 38 | continue 39 | } 40 | md := a.Files[i] 41 | html := a.Files[i+1] 42 | i += 2 43 | name := strings.TrimSuffix(md.Name, ".md") 44 | if name != strings.TrimSuffix(html.Name, ".html") { 45 | f.Fatalf("mismatched file pair: %s and %s", md.Name, html.Name) 46 | } 47 | f.Add(decode(string(md.Data))) 48 | } 49 | } 50 | f.Fuzz(func(t *testing.T, s string) { 51 | // Too many corner cases involving non-terminated lines. 52 | if !strings.HasSuffix(s, "\n") { 53 | s += "\n" 54 | } 55 | // Goldmark does not convert \r to \n. 56 | s = strings.ReplaceAll(s, "\r", "\n") 57 | // Goldmark treats \v as isUnicodeSpace for deciding emphasis. 58 | // Not unreasonable, but not what the spec says. 
59 | s = strings.ReplaceAll(s, "\v", "\f") 60 | if !utf8.ValidString(s) { 61 | s = string([]rune(s)) // coerce to valid UTF8 62 | } 63 | var parsers = []Parser{ 64 | {}, 65 | {HeadingID: true}, 66 | {Strikethrough: true}, 67 | {TaskList: true}, 68 | {HeadingID: true, Strikethrough: true, TaskList: true}, 69 | } 70 | for i, p := range parsers { 71 | if t.Failed() { 72 | break 73 | } 74 | t.Run(fmt.Sprintf("p%d", i), func(t *testing.T) { 75 | doc, corner := p.parse(s) 76 | if corner { 77 | return 78 | } 79 | out := ToHTML(doc) 80 | 81 | gm := goldmarkParser(&p) 82 | var buf bytes.Buffer 83 | if err := gm.Convert([]byte(s), &buf); err != nil { 84 | t.Fatal(err) 85 | } 86 | if buf.Len() > 0 && buf.Bytes()[buf.Len()-1] != '\n' { 87 | buf.WriteByte('\n') 88 | } 89 | gout := buf.String() 90 | 91 | // Goldmark uses
,
, and . 92 | // Goldmark also escapes | as %7C. 93 | // Apply rewrites to out as well as gout to handle these appearing 94 | // as literals in the input. 95 | canon := func(s string) string { 96 | s = strings.ReplaceAll(s, " />", ">") 97 | s = strings.ReplaceAll(s, "%7C", "|") 98 | return s 99 | } 100 | out = canon(out) 101 | gout = canon(gout) 102 | 103 | if out != gout { 104 | q := strings.ReplaceAll(url.QueryEscape(s), "+", "%20") 105 | t.Fatalf("in: %q\nparse:\n%s\nout: %q\ngout: %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", s, dump(doc), out, gout, q, q) 106 | } 107 | }) 108 | } 109 | }) 110 | } 111 | -------------------------------------------------------------------------------- /break.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | // A ThematicBreak is a [Block] representing a [thematic break], 8 | // usually displayed as a horizontal rule (
tag). 9 | // 10 | // [thematic break]: https://spec.commonmark.org/0.31.2/#thematic-breaks 11 | type ThematicBreak struct { 12 | Position 13 | } 14 | 15 | func (*ThematicBreak) Block() {} 16 | 17 | func (b *ThematicBreak) printHTML(p *printer) { 18 | p.html("
\n") 19 | } 20 | 21 | func (b *ThematicBreak) printMarkdown(p *printer) { 22 | p.maybeNL() 23 | p.md("***") 24 | } 25 | 26 | // startThematicBreak is a [starter] for a [ThematicBreak]. 27 | func startThematicBreak(p *parser, s line) (line, bool) { 28 | if !trimThematicBreak(&s) { 29 | return s, false 30 | } 31 | p.doneBlock(&ThematicBreak{Position{p.lineno, p.lineno}}) 32 | return line{}, true 33 | } 34 | 35 | // trimThematicBreak attempts to trim a thematic break from s, 36 | // reporting whether it was successful. 37 | // See https://spec.commonmark.org/0.31.2/#thematic-breaks. 38 | func trimThematicBreak(s *line) bool { 39 | t := *s // work on a copy so that a failed match leaves *s untouched 40 | t.trimSpace(0, 3, false) 41 | c := t.peek() 42 | if c != '-' && c != '_' && c != '*' { 43 | return false 44 | } 45 | for i := 0; ; i++ { 46 | if !t.trim(c) { 47 | if i < 3 { 48 | return false 49 | } 50 | break 51 | } 52 | t.skipSpace() 53 | } 54 | if !t.eof() { 55 | return false 56 | } 57 | *s = line{} 58 | return true 59 | } 60 | 61 | // A HardBreak is an Inline representing a hard line break (
tag). 62 | type HardBreak struct{} 63 | 64 | func (*HardBreak) Inline() {} 65 | 66 | func (x *HardBreak) printHTML(p *printer) { 67 | p.html("
\n") 68 | } 69 | 70 | func (x *HardBreak) printMarkdown(p *printer) { 71 | p.md(`\`) 72 | p.nl() 73 | } 74 | 75 | func (x *HardBreak) printText(p *printer) { 76 | p.text("\n") 77 | } 78 | 79 | // A SoftBreak is an Inline representing a soft line break (newline character). 80 | type SoftBreak struct{} 81 | 82 | func (*SoftBreak) Inline() {} 83 | 84 | func (x *SoftBreak) printHTML(p *printer) { 85 | // TODO: If printer config says to, print
instead. 86 | p.html("\n") 87 | } 88 | 89 | func (x *SoftBreak) printMarkdown(p *printer) { 90 | p.nl() 91 | } 92 | 93 | func (x *SoftBreak) printText(p *printer) { 94 | p.text("\n") 95 | } 96 | 97 | // parseBreak is an [inlineParser] for a [SoftBreak] or [HardBreak]. 98 | // The caller has checked that s[start] is a newline. 99 | func parseBreak(p *parser, s string, start int) (x Inline, end int, ok bool) { 100 | // Back up to remove trailing spaces and tabs. 101 | i := start 102 | for i > 0 && (s[i-1] == ' ' || s[i-1] == '\t') { 103 | i-- 104 | } 105 | if i < start { 106 | // The caller will do p.emit(start), but we want to skip 107 | // the spaces and tabs between i and start, so do the 108 | // emit ourselves followed by skipping to start. 109 | p.emit(i) 110 | p.skip(start) 111 | } 112 | 113 | end = start + 1 114 | // TODO: Do tabs count? That would be a mess. 115 | if start >= 2 && s[start-1] == ' ' && s[start-2] == ' ' { 116 | return &HardBreak{}, end, true 117 | } 118 | return &SoftBreak{}, end, true 119 | } 120 | -------------------------------------------------------------------------------- /htmlesc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import "strings" 8 | 9 | // htmlEscaper is a strings.Replacer that escapes text for inclusion in HTML. 10 | // It escapes " & < > only. In particular it does not escape ' so any generated 11 | // HTML should use " for attribute quoting. 12 | var htmlEscaper = strings.NewReplacer( 13 | "\"", "&quot;", 14 | "&", "&amp;", 15 | "<", "&lt;", 16 | ">", "&gt;", 17 | ) 18 | 19 | // htmlLinkEscaper is a strings.Replacer that escapes URLs 20 | // for inclusion in an <a> tag.
21 | var htmlLinkEscaper = strings.NewReplacer( 22 | "\"", "%22", 23 | "&", "&amp;", 24 | "<", "%3C", 25 | ">", "%3E", 26 | "\\", "%5C", 27 | " ", "%20", 28 | "`", "%60", 29 | "[", "%5B", 30 | "]", "%5D", 31 | "^", "%5E", 32 | "{", "%7B", 33 | "}", "%7D", 34 | 35 | "\x00", "%00", "\x01", "%01", "\x02", "%02", "\x03", "%03", 36 | "\x04", "%04", "\x05", "%05", "\x06", "%06", "\x07", "%07", 37 | "\x08", "%08", "\x0B", "%0B", // not 09 (tab) or 0A (newline) 38 | "\x0C", "%0C", "\x0E", "%0E", "\x0F", "%0F", // not 0D (carriage return) 39 | 40 | "\x10", "%10", "\x11", "%11", "\x12", "%12", "\x13", "%13", 41 | "\x14", "%14", "\x15", "%15", "\x16", "%16", "\x17", "%17", 42 | "\x18", "%18", "\x19", "%19", "\x1A", "%1A", "\x1B", "%1B", 43 | "\x1C", "%1C", "\x1D", "%1D", "\x1E", "%1E", "\x1F", "%1F", 44 | 45 | "\x7F", "%7F", 46 | 47 | "\x80", "%80", "\x81", "%81", "\x82", "%82", "\x83", "%83", 48 | "\x84", "%84", "\x85", "%85", "\x86", "%86", "\x87", "%87", 49 | "\x88", "%88", "\x89", "%89", "\x8A", "%8A", "\x8B", "%8B", 50 | "\x8C", "%8C", "\x8D", "%8D", "\x8E", "%8E", "\x8F", "%8F", 51 | 52 | "\x90", "%90", "\x91", "%91", "\x92", "%92", "\x93", "%93", 53 | "\x94", "%94", "\x95", "%95", "\x96", "%96", "\x97", "%97", 54 | "\x98", "%98", "\x99", "%99", "\x9A", "%9A", "\x9B", "%9B", 55 | "\x9C", "%9C", "\x9D", "%9D", "\x9E", "%9E", "\x9F", "%9F", 56 | 57 | "\xA0", "%A0", "\xA1", "%A1", "\xA2", "%A2", "\xA3", "%A3", 58 | "\xA4", "%A4", "\xA5", "%A5", "\xA6", "%A6", "\xA7", "%A7", 59 | "\xA8", "%A8", "\xA9", "%A9", "\xAA", "%AA", "\xAB", "%AB", 60 | "\xAC", "%AC", "\xAD", "%AD", "\xAE", "%AE", "\xAF", "%AF", 61 | 62 | "\xB0", "%B0", "\xB1", "%B1", "\xB2", "%B2", "\xB3", "%B3", 63 | "\xB4", "%B4", "\xB5", "%B5", "\xB6", "%B6", "\xB7", "%B7", 64 | "\xB8", "%B8", "\xB9", "%B9", "\xBA", "%BA", "\xBB", "%BB", 65 | "\xBC", "%BC", "\xBD", "%BD", "\xBE", "%BE", "\xBF", "%BF", 66 | 67 | "\xC0", "%C0", "\xC1", "%C1", "\xC2", "%C2", "\xC3", "%C3", 68 | "\xC4", "%C4", "\xC5", "%C5", "\xC6", 
"%C6", "\xC7", "%C7", 69 | "\xC8", "%C8", "\xC9", "%C9", "\xCA", "%CA", "\xCB", "%CB", 70 | "\xCC", "%CC", "\xCD", "%CD", "\xCE", "%CE", "\xCF", "%CF", 71 | 72 | "\xD0", "%D0", "\xD1", "%D1", "\xD2", "%D2", "\xD3", "%D3", 73 | "\xD4", "%D4", "\xD5", "%D5", "\xD6", "%D6", "\xD7", "%D7", 74 | "\xD8", "%D8", "\xD9", "%D9", "\xDA", "%DA", "\xDB", "%DB", 75 | "\xDC", "%DC", "\xDD", "%DD", "\xDE", "%DE", "\xDF", "%DF", 76 | 77 | "\xE0", "%E0", "\xE1", "%E1", "\xE2", "%E2", "\xE3", "%E3", 78 | "\xE4", "%E4", "\xE5", "%E5", "\xE6", "%E6", "\xE7", "%E7", 79 | "\xE8", "%E8", "\xE9", "%E9", "\xEA", "%EA", "\xEB", "%EB", 80 | "\xEC", "%EC", "\xED", "%ED", "\xEE", "%EE", "\xEF", "%EF", 81 | 82 | "\xF0", "%F0", "\xF1", "%F1", "\xF2", "%F2", "\xF3", "%F3", 83 | "\xF4", "%F4", "\xF5", "%F5", "\xF6", "%F6", "\xF7", "%F7", 84 | "\xF8", "%F8", "\xF9", "%F9", "\xFA", "%FA", "\xFB", "%FB", 85 | "\xFC", "%FC", "\xFD", "%FD", "\xFE", "%FE", "\xFF", "%FF", 86 | ) 87 | -------------------------------------------------------------------------------- /testdata/table.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"Table": true} 3 | -- gfm198.md -- 4 | | foo | bar | 5 | | --- | --- | 6 | | baz | bim | 7 | -- gfm198.html -- 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
foobar
bazbim
22 | -- gfm199.md -- 23 | | abc | defghi | 24 | :-: | -----------: 25 | bar | baz 26 | -- gfm199.html -- 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 |
abcdefghi
barbaz
41 | -- gfm200.md -- 42 | | f\|oo | 43 | | ------ | 44 | | b `\|` az | 45 | | b **\|** im | 46 | -- gfm200.html -- 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
f|oo
b | az
b | im
62 | -- gfm201.md -- 63 | | abc | def | 64 | | --- | --- | 65 | | bar | baz | 66 | > bar 67 | -- gfm201.html -- 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
abcdef
barbaz
82 |
83 |

bar

84 |
85 | -- gfm202.md -- 86 | | abc | def | 87 | | --- | --- | 88 | | bar | baz | 89 | bar 90 | 91 | bar 92 | -- gfm202.html -- 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 |
abcdef
barbaz
bar
111 |

bar

112 | -- gfm203.md -- 113 | | abc | def | 114 | | --- | 115 | | bar | 116 | -- gfm203.html -- 117 |

| abc | def | 118 | | --- | 119 | | bar |

120 | -- gfm204.md -- 121 | | abc | def | 122 | | --- | --- | 123 | | bar | 124 | | bar | baz | boo | 125 | -- gfm204.html -- 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 |
abcdef
bar
barbaz
144 | -- gfm205.md -- 145 | | abc | def | 146 | | --- | --- | 147 | -- gfm205.html -- 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 |
abcdef
156 | -- 1.md -- 157 | hello world 158 | this is a test 159 | 160 | > a 161 | b 162 | > |- 163 | > d 164 | e 165 | > e 166 | c 167 | -- 1.html -- 168 |

hello world 169 | this is a test

170 |
171 |

a

172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 |
b
d
184 |
185 |

e

186 |
187 |

e 188 | c

189 |
190 | -- 2.md -- 191 | | 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8 | 192 | | ------ | 193 | -- 2.html -- 194 | 195 | 196 | 197 | 198 | 199 | 200 |
0|1|2\|3\|4\\|5\\|6\\\|7\\\|8
201 | -- 3.md -- 202 | | | Foo | Bar | 203 | | -------- | -------- | -------- | 204 | | a | value1 | value2 | 205 | | b | value3 | value4 | 206 | -- 3.html -- 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 |
FooBar
avalue1value2
bvalue3value4
228 | -- 4.md -- 229 | | 230 | |- 231 | |x 232 | | 233 | -- 4.html -- 234 |

| 235 | |- 236 | |x 237 | |

238 | -- 5.md -- 239 | || 240 | |- 241 | |x 242 | | 243 | -- 5.html -- 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 |
x
256 |

|

257 | -------------------------------------------------------------------------------- /lex.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strings" 9 | "unicode" 10 | ) 11 | 12 | // isPunct reports whether c is Markdown punctuation. 13 | func isPunct(c byte) bool { 14 | return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~' 15 | } 16 | 17 | // isLetter reports whether c is an ASCII letter. 18 | func isLetter(c byte) bool { 19 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 20 | } 21 | 22 | // isDigit reports whether c is an ASCII digit. 23 | func isDigit(c byte) bool { 24 | return '0' <= c && c <= '9' 25 | } 26 | 27 | // isLetterDigit reports whether c is an ASCII letter or digit. 28 | func isLetterDigit(c byte) bool { 29 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' 30 | } 31 | 32 | // isLDH reports whether c is an ASCII letter, digit, or hyphen. 33 | func isLDH(c byte) bool { 34 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '-' 35 | } 36 | 37 | // isHexDigit reports whether c is an ASCII hexadecimal digit. 38 | func isHexDigit(c byte) bool { 39 | return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9' 40 | } 41 | 42 | // isUnicodeSpace reports whether r is a Unicode space as defined by Markdown. 43 | // This is not the same as unicode.IsSpace. 44 | // For example, U+0085 does not satisfy isUnicodeSpace 45 | // but does satisfy unicode.IsSpace.
46 | func isUnicodeSpace(r rune) bool { 47 | if r < 0x80 { 48 | return r == ' ' || r == '\t' || r == '\f' || r == '\n' 49 | } 50 | return unicode.In(r, unicode.Zs) 51 | } 52 | 53 | // isUnicodePunct reports whether r is Unicode punctuation as defined by Markdown. 54 | // This is not the same as unicode.Punct; it also includes unicode.Symbol. 55 | func isUnicodePunct(r rune) bool { 56 | if r < 0x80 { 57 | return isPunct(byte(r)) 58 | } 59 | return unicode.In(r, unicode.Punct, unicode.Symbol) 60 | } 61 | 62 | // skipSpace returns i + the number of spaces, tabs, and newlines 63 | // at the start of s[i:]. That is, it skips i past any such characters, returning the new i. 64 | func skipSpace(s string, i int) int { 65 | // Note: Blank lines have already been removed. 66 | for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') { 67 | i++ 68 | } 69 | return i 70 | } 71 | 72 | // mdEscaper escapes symbols that are used in inline Markdown sequences. 73 | // TODO(rsc): There is a better way to do this. 74 | var mdEscaper = strings.NewReplacer( 75 | `(`, `\(`, 76 | `)`, `\)`, 77 | `[`, `\[`, 78 | `]`, `\]`, 79 | `*`, `\*`, 80 | `_`, `\_`, 81 | `<`, `\<`, 82 | `>`, `\>`, 83 | ) 84 | 85 | // mdLinkEscaper escapes symbols that have meaning inside a link target. 86 | var mdLinkEscaper = strings.NewReplacer( 87 | `(`, `\(`, 88 | `)`, `\)`, 89 | `<`, `\<`, 90 | `>`, `\>`, 91 | ) 92 | 93 | // mdUnescape returns the Markdown unescaping of s. 94 | func mdUnescape(s string) string { 95 | if !strings.Contains(s, `\`) && !strings.Contains(s, `&`) { 96 | return s 97 | } 98 | return mdUnescaper.Replace(s) 99 | } 100 | 101 | // mdUnescaper unescapes Markdown escape sequences and HTML entities. 102 | // TODO(rsc): Perhaps there is a better way to do this.
103 | var mdUnescaper = func() *strings.Replacer { 104 | var list = []string{ 105 | `\!`, `!`, 106 | `\"`, `"`, 107 | `\#`, `#`, 108 | `\$`, `$`, 109 | `\%`, `%`, 110 | `\&`, `&`, 111 | `\'`, `'`, 112 | `\(`, `(`, 113 | `\)`, `)`, 114 | `\*`, `*`, 115 | `\+`, `+`, 116 | `\,`, `,`, 117 | `\-`, `-`, 118 | `\.`, `.`, 119 | `\/`, `/`, 120 | `\:`, `:`, 121 | `\;`, `;`, 122 | `\<`, `<`, 123 | `\=`, `=`, 124 | `\>`, `>`, 125 | `\?`, `?`, 126 | `\@`, `@`, 127 | `\[`, `[`, 128 | `\\`, `\`, 129 | `\]`, `]`, 130 | `\^`, `^`, 131 | `\_`, `_`, 132 | "\\`", "`", 133 | `\{`, `{`, 134 | `\|`, `|`, 135 | `\}`, `}`, 136 | `\~`, `~`, 137 | } 138 | 139 | for name, repl := range htmlEntity { 140 | list = append(list, name, repl) 141 | } 142 | return strings.NewReplacer(list...) 143 | }() 144 | -------------------------------------------------------------------------------- /testdata/gfm_regress.txt: -------------------------------------------------------------------------------- 1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/regression.txt 2 | -- 1.md -- 3 | line1 4 | 5 | line2 6 | -- 1.html -- 7 |

line1

8 |

line2

9 | -- 2.md -- 10 | By taking it apart 11 | 12 | - alternative solutions 13 | ^J 14 | Repeatedly solving 15 | ^J 16 | - how techniques 17 | -- 2.html -- 18 |

By taking it apart

19 | 22 |

Repeatedly solving

23 | 26 | -- 3.md -- 27 |

lorem

28 | 29 |

lorem

30 | 31 |

lorem

32 | 33 |

lorem

34 | 35 |
lorem
36 | 37 |
lorem
38 | -- 3.html -- 39 |

lorem

40 |

lorem

41 |

lorem

42 |

lorem

43 |
lorem
44 |
lorem
45 | -- 4.md -- 46 | hi 47 | -- ^J 48 | -- 4.html -- 49 |

hi

50 | -- 5.md -- 51 | a***b* c* 52 | -- 5.html -- 53 |

a*b c

54 | -- 6.md -- 55 | [a] 56 | 57 | [a]: 58 | -- 6.html -- 59 |

[a]

60 |

[a]: <te

61 | -- 7.md -- 62 | [a](te\ st) 63 | -- 7.html -- 64 |

[a](te\ st)

65 | -- parser.json -- 66 | {"Strikethrough": true} 67 | -- 8.md -- 68 | ~~**_`this`_**~~ ^J 69 | ~~***`this`***~~ ^J 70 | ~~___`this`___~~ 71 | 72 | **_`this`_** ^J 73 | ***`this`*** ^J 74 | ___`this`___ 75 | 76 | ~~**_this_**~~ ^J 77 | ~~***this***~~ ^J 78 | ~~___this___~~ 79 | 80 | **_this_** ^J 81 | ***this*** ^J 82 | ___this___ 83 | -- 8.html -- 84 |

this
85 | this
86 | this

87 |

this
88 | this
89 | this

90 |

this
91 | this
92 | this

93 |

this
94 | this
95 | this

96 | -- parser.json -- 97 | {} 98 | -- 9.md -- 99 | City: 100 | 101 | 102 | 103 | -- 9.html -- 104 |

City: 105 | 106 | 107 |

108 | -- parser.json -- 109 | {"Strikethrough": true} 110 | -- 10.md -- 111 | ~Hi~ Hello, world! 112 | -- 10.html -- 113 |

Hi Hello, world!

114 | -- parser.json -- 115 | {} 116 | -- parser.json -- 117 | {"Strikethrough": true} 118 | -- 11.md -- 119 | This ~text~ ~~is~~ ~~~curious~~~. 120 | -- 11.html -- 121 |

This text is ~~~curious~~~.

122 | -- parser.json -- 123 | {} 124 | -- 12.md -- 125 | [x](http://members.aon.at/~nkehrer/ibm_5110/emu5110.html) 126 | -- 12.html -- 127 |

x

128 | -- 13.md -- 129 | City: 130 | 131 | 132 | 133 | -- 13.html -- 134 |

City: 135 | 136 | 137 |

138 | -- 14.md -- 139 | [a](\ b) 140 | 141 | [a](<[a](\ b)

147 |

[a](<<b)

148 |

[a](<b 149 | )

150 | -- 15.md -- 151 | [link](url ((title)) 152 | -- 15.html -- 153 |

[link](url ((title))

154 | -- 16.md -- 155 | 156 | 157 | 158 | 159 | 160 | -- 16.html -- 161 | 162 | 163 | 164 | -- 17.md -- 165 | [a]( 166 | -- 17.html -- 167 |

[a](<b) c>

168 | -- parser.json -- 169 | {"Table": true} 170 | -- 18.md -- 171 | | 172 | -| 173 | -- 18.html -- 174 |

| 175 | -|

176 | -- parser.json -- 177 | {} 178 | -- 19.md -- 179 | *text* [link](#section) 180 | -- 19.html -- 181 |

text link

// --------------------------------------------------------------------------------
// /line.go:
// --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// package markdown

// A line is a cursor over the text of a single input line.
type line struct {
	spaces   int    // pending virtual spaces left over from a partially consumed tab
	i        int    // index in text of the next unread byte
	tab      int    // origin used when computing how many columns a tab covers
	text     string // line text
	nl       byte   // newline character ending this line: \r or \n or \r+\n or zero for EOF
	nonblank int    // index of first non-space, non-tab char in text; len(text) if none
}

// makeLine returns a line positioned at the start of text,
// recording nl as the line terminator.
func makeLine(text string, nl byte) line {
	s := line{text: text, nl: nl}
	s.setNonblank()
	return s
}

// setNonblank recomputes s.nonblank by scanning forward from s.i
// past spaces and tabs.
func (s *line) setNonblank() {
	i := s.i
	for i < len(s.text) && (s.text[i] == ' ' || s.text[i] == '\t') {
		i++
	}
	s.nonblank = i
}

// peek returns the next byte of the line without consuming it:
// ' ' while virtual spaces are pending, 0 at end of text.
func (s *line) peek() byte {
	if s.spaces > 0 {
		return ' '
	}
	if s.i >= len(s.text) {
		return 0
	}
	return s.text[s.i]
}

// skipSpace discards any pending virtual spaces and advances to the
// first non-space, non-tab byte. It panics if the cached nonblank
// index has fallen behind the cursor.
func (s *line) skipSpace() {
	s.spaces = 0
	if s.nonblank < s.i {
		panic("nonblank")
	}
	s.i = s.nonblank
}

// trimSpace removes up to max columns of leading white space from s.
// A tab is expanded to the columns remaining before the next multiple
// of 4 (measured from s.tab); columns of a tab that are not consumed
// stay pending in s.spaces. If eolOK is true, the end of the text
// counts as white space.
//
// trimSpace reports whether at least min columns were available.
// On failure s is left unmodified.
func (s *line) trimSpace(min, max int, eolOK bool) bool {
	t := *s // work on a copy so that failure leaves s untouched

	for n := 0; n < max; n++ {
		if t.spaces > 0 {
			t.spaces--
			continue
		}
		if t.i >= len(t.text) && eolOK {
			continue
		}
		// TODO performance bottleneck here using trimSpace with list extensions?
		// but each only fails once?
		if t.i < len(t.text) {
			switch t.text[t.i] {
			case '\t':
				// This iteration consumes one column of the tab;
				// the rest remain pending.
				t.spaces = 4 - (t.i-t.tab)&3 - 1
				t.i++
				t.tab = t.i // TODO seems wrong
				continue
			case ' ':
				t.i++
				continue
			}
		}
		if n >= min {
			break
		}
		return false
	}
	if t.nonblank < t.i {
		t.setNonblank()
	}
	*s = t
	return true
}

// trim consumes the byte c if it is next on the line, reporting
// whether it did. While virtual spaces are pending, only ' ' matches.
func (s *line) trim(c byte) bool {
	if s.spaces > 0 {
		if c == ' ' {
			s.spaces--
			return true
		}
		return false
	}
	if s.i < len(s.text) && s.text[s.i] == c {
		s.i++
		if s.nonblank < s.i {
			s.setNonblank()
		}
		return true
	}
	return false
}

// skip advances the cursor by n bytes without inspecting them.
func (s *line) skip(n int) {
	s.i += n
	if s.nonblank < s.i {
		s.setNonblank()
	}
}

// string returns the unread remainder of the line, with one leading
// space for each pending virtual space. trimSpace never leaves more
// than 3 pending, so any other count is a bug.
func (s *line) string() string {
	switch s.spaces {
	case 0:
		return s.text[s.i:]
	case 1:
		return " " + s.text[s.i:]
	case 2:
		return "  " + s.text[s.i:]
	case 3:
		return "   " + s.text[s.i:]
	}
	// unreachable
	panic("bad spaces")
}

// trimLeftSpaceTab returns s with leading spaces and tabs removed.
func trimLeftSpaceTab(s string) string {
	i := 0
	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
		i++
	}
	return s[i:]
}

// trimRightSpaceTab returns s with trailing spaces and tabs removed.
func trimRightSpaceTab(s string) string {
	j := len(s)
	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') {
		j--
	}
	return s[:j]
}

// trimSpaceTab returns s with leading and trailing spaces and tabs removed.
func trimSpaceTab(s string) string {
	return trimRightSpaceTab(trimLeftSpaceTab(s))
}

// trimSpace returns s with leading and trailing spaces and tabs removed.
// It trims exactly the same bytes as trimSpaceTab.
func trimSpace(s string) string {
	return trimSpaceTab(s)
}

// trimSpaceTabNewline returns s with leading and trailing spaces,
// tabs, and \n bytes removed.
func trimSpaceTabNewline(s string) string {
	i := 0
	for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') {
		i++
	}
	s = s[i:]
	j := len(s)
	for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t' || s[j-1] == '\n') {
		j--
	}
	return s[:j]
}

// isBlank reports whether the line holds nothing but spaces and tabs
// from the cursor onward.
func (s *line) isBlank() bool {
	return s.nonblank == len(s.text)
}

// eof reports whether the cursor has reached the end of the text.
func (s *line) eof() bool {
	return s.i >= len(s.text)
}

// trimSpaceString returns the text starting at the first
// non-space, non-tab byte.
func (s *line) trimSpaceString() string {
	return s.text[s.nonblank:]
}

// trimString returns the text starting at the first non-space,
// non-tab byte, with trailing spaces and tabs removed as well.
// It panics if the cached nonblank index has fallen behind the cursor.
func (s *line) trimString() string {
	if s.nonblank < s.i {
		panic("bad blank")
	}
	return trimSpaceTab(s.text[s.nonblank:])
}

// --------------------------------------------------------------------------------
// /footnote.go:
// --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | package markdown 6 | 7 | import ( 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | type Footnote struct { 13 | Position 14 | Label string 15 | Blocks []Block 16 | } 17 | 18 | type FootnoteLink struct { 19 | Label string 20 | Footnote *Footnote 21 | } 22 | 23 | type printedNote struct { 24 | num string 25 | note *Footnote 26 | refs []string 27 | } 28 | 29 | func (*FootnoteLink) Inline() {} 30 | 31 | func (x *Footnote) printed(p *printer) *printedNote { 32 | if p.footnotes == nil { 33 | p.footnotes = make(map[*Footnote]*printedNote) 34 | } 35 | pr, ok := p.footnotes[x] 36 | if !ok { 37 | pr = &printedNote{ 38 | num: strconv.Itoa(len(p.footnotes) + 1), 39 | note: x, 40 | } 41 | p.footnotes[x] = pr 42 | p.footnotelist = append(p.footnotelist, pr) 43 | } 44 | ref := pr.num 45 | if len(pr.refs) > 0 { 46 | ref += "-" + strconv.Itoa(len(pr.refs)+1) 47 | } 48 | pr.refs = append(pr.refs, ref) 49 | return pr 50 | } 51 | 52 | func (x *FootnoteLink) printHTML(p *printer) { 53 | note := x.Footnote 54 | if note == nil { 55 | return 56 | } 57 | pr := note.printed(p) 58 | ref := pr.refs[len(pr.refs)-1] 59 | p.html(``, pr.num, ``) 60 | } 61 | 62 | func (x *FootnoteLink) printMarkdown(p *printer) { 63 | note := x.Footnote 64 | if note == nil { 65 | return 66 | } 67 | note.printed(p) // add to list for printFootnoteMarkdown 68 | p.text(`[^`, x.Label, `]`) 69 | } 70 | 71 | func (x *FootnoteLink) printText(p *printer) { 72 | p.text(`[^`, x.Label, `]`) 73 | } 74 | 75 | func printFootnoteHTML(p *printer) { 76 | if len(p.footnotelist) == 0 { 77 | return 78 | } 79 | 80 | p.html(`
Footnotes
`, "\n") 81 | p.html("
    \n") 82 | for num, note := range p.footnotelist { 83 | num++ 84 | str := strconv.Itoa(num) 85 | p.html(`
  1. `, "\n") 86 | for _, b := range note.note.Blocks { 87 | b.printHTML(p) 88 | } 89 | if !p.eraseCloseP() { 90 | p.html("

    \n") 91 | } 92 | for _, ref := range note.refs { 93 | p.html("\n", ``) 94 | } 95 | p.html("

    \n") 96 | p.html("
  2. \n") 97 | } 98 | p.html("
\n") 99 | } 100 | 101 | func (x *Footnote) printMarkdown(p *printer) { 102 | p.md(`[^`, x.Label, `]: `) 103 | defer p.pop(p.push(" ")) 104 | printMarkdownBlocks(x.Blocks, p) 105 | } 106 | 107 | func printFootnoteMarkdown(p *printer) { 108 | if len(p.footnotelist) == 0 { 109 | return 110 | } 111 | 112 | p.maybeNL() 113 | for _, note := range p.footnotelist { 114 | p.nl() 115 | note.note.printMarkdown(p) 116 | } 117 | } 118 | 119 | func parseFootnoteRef(p *parser, s string, start int) (x Inline, end int, ok bool) { 120 | if !p.Footnote || start+1 >= len(s) || s[start+1] != '^' { 121 | return 122 | } 123 | end = strings.Index(s[start:], "]") 124 | if end < 0 { 125 | return 126 | } 127 | end += start + 1 128 | label := s[start+2 : end-1] 129 | note, ok := p.footnotes[normalizeLabel(label)] 130 | if !ok { 131 | return 132 | } 133 | return &FootnoteLink{label, note}, end, true 134 | } 135 | 136 | func startFootnote(p *parser, s line) (line, bool) { 137 | t := s 138 | t.trimSpace(0, 3, false) 139 | if !t.trim('[') || !t.trim('^') { 140 | return s, false 141 | } 142 | label := t.string() 143 | i := strings.Index(label, "]") 144 | if i < 0 || i+1 >= len(label) && label[i+1] != ':' { 145 | return s, false 146 | } 147 | label = label[:i] 148 | for j := 0; j < i; j++ { 149 | c := label[j] 150 | if c == ' ' || c == '\r' || c == '\n' || c == 0x00 || c == '\t' { 151 | return s, false 152 | } 153 | } 154 | t.skip(i + 2) 155 | 156 | if _, ok := p.footnotes[normalizeLabel(label)]; ok { 157 | // Already have a footnote with this label. 158 | // cmark-gfm ignores all future references, 159 | // dropping them from the document, 160 | // but it seems more helpful to not treat it 161 | // as a footnote. 
162 | p.corner = true 163 | return s, false 164 | } 165 | 166 | fb := &footnoteBuilder{label} 167 | p.addBlock(fb) 168 | return t, true 169 | } 170 | 171 | type footnoteBuilder struct { 172 | label string 173 | } 174 | 175 | func (b *footnoteBuilder) extend(p *parser, s line) (line, bool) { 176 | if !s.trimSpace(4, 4, true) { 177 | return s, false 178 | } 179 | return s, true 180 | } 181 | 182 | func (b *footnoteBuilder) build(p *parser) Block { 183 | if p.footnotes == nil { 184 | p.footnotes = make(map[string]*Footnote) 185 | } 186 | p.footnotes[normalizeLabel(b.label)] = &Footnote{p.pos(), b.label, p.blocks()} 187 | return &Empty{} 188 | } 189 | -------------------------------------------------------------------------------- /print.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import "bytes" 8 | 9 | const ( 10 | writeMarkdown = iota 11 | writeHTML 12 | writeText 13 | ) 14 | 15 | type printer struct { 16 | writeMode int 17 | buf bytes.Buffer 18 | prefix []byte 19 | prefixOld []byte 20 | prefixOlder []byte 21 | trimLimit int 22 | listOut 23 | footnotes map[*Footnote]*printedNote 24 | footnotelist []*printedNote 25 | } 26 | 27 | type listOut struct { 28 | bullet rune 29 | num int 30 | loose int 31 | tight int 32 | } 33 | 34 | func (w *printer) WriteStrings(list ...string) { 35 | for _, s := range list { 36 | w.WriteString(s) 37 | } 38 | } 39 | 40 | func cutLastNL(text []byte) (prefix, last []byte) { 41 | i := bytes.LastIndexByte(text, '\n') 42 | if i < 0 { 43 | return nil, text 44 | } 45 | return text[:i], text[i+1:] 46 | } 47 | 48 | func (b *printer) noTrim() { 49 | b.trimLimit = len(b.buf.Bytes()) 50 | } 51 | 52 | func (b *printer) nl() { 53 | text := b.buf.Bytes() 54 | for len(text) > b.trimLimit && text[len(text)-1] == ' ' 
{ 55 | text = text[:len(text)-1] 56 | } 57 | b.buf.Truncate(len(text)) 58 | 59 | b.buf.WriteByte('\n') 60 | b.buf.Write(b.prefix) 61 | b.prefixOlder, b.prefixOld = b.prefixOld, b.prefix 62 | } 63 | 64 | func (b *printer) maybeNL() bool { 65 | // Starting a new block that may need a blank line before it 66 | // to avoid being mixed into a previous block 67 | // as paragraph continuation text. 68 | // 69 | // If the prefix on the current line (all of cur) 70 | // is the same as the current continuation prefix 71 | // (not first line of a list item) 72 | // and the previous line started with the same prefix, 73 | // then we need a blank line to avoid looking like 74 | // paragraph continuation text. 75 | before, cur := cutLastNL(b.buf.Bytes()) 76 | before, prev := cutLastNL(before) 77 | if b.buf.Len() > 0 && bytes.Equal(cur, b.prefix) && bytes.HasPrefix(prev, b.prefix) { 78 | b.nl() 79 | return true 80 | } 81 | return true 82 | } 83 | 84 | func ToHTML(b Block) string { 85 | var p printer 86 | p.writeMode = writeHTML 87 | b.printHTML(&p) 88 | printFootnoteHTML(&p) 89 | return p.buf.String() 90 | } 91 | 92 | func Format(b Block) string { 93 | var p printer 94 | b.printMarkdown(&p) 95 | printFootnoteMarkdown(&p) 96 | // TODO footnotes? 97 | return p.buf.String() 98 | } 99 | 100 | var closeP = []byte("

\n") 101 | 102 | func (b *printer) eraseCloseP() bool { 103 | if bytes.HasSuffix(b.buf.Bytes(), closeP) { 104 | b.buf.Truncate(b.buf.Len() - len(closeP)) 105 | return true 106 | } 107 | return false 108 | } 109 | 110 | func (b *printer) maybeQuoteNL(quote byte) bool { 111 | // Starting a new quote block. 112 | // Make sure it doesn't look like it is part of a preceding quote block. 113 | before, cur := cutLastNL(b.buf.Bytes()) 114 | before, prev := cutLastNL(before) 115 | if len(prev) >= len(cur)+1 && bytes.HasPrefix(prev, cur) && prev[len(cur)] == quote { 116 | b.nl() 117 | return true 118 | } 119 | return false 120 | } 121 | 122 | func (b *printer) WriteByte(c byte) error { 123 | if c == '\n' { 124 | panic("Write \\n") 125 | } 126 | return b.buf.WriteByte(c) 127 | } 128 | 129 | func (p *printer) Write(text []byte) (int, error) { 130 | if p.writeMode == writeMarkdown { 131 | for i := range text { 132 | if text[i] == '\n' { 133 | panic("Write \\n") 134 | } 135 | } 136 | } 137 | return p.buf.Write(text) 138 | } 139 | 140 | func (p *printer) html(list ...string) { 141 | if p.writeMode != writeHTML { 142 | panic("raw HTML in non-HTML output") 143 | } 144 | for _, s := range list { 145 | p.buf.WriteString(s) 146 | } 147 | } 148 | 149 | func (p *printer) text(list ...string) { 150 | if p.writeMode == writeHTML { 151 | for _, s := range list { 152 | htmlEscaper.WriteString(&p.buf, s) 153 | } 154 | return 155 | } 156 | for _, s := range list { 157 | p.buf.WriteString(s) 158 | } 159 | 160 | } 161 | 162 | func (p *printer) md(list ...string) { 163 | if p.writeMode != writeMarkdown { 164 | panic("markdown in non-markdown output") 165 | } 166 | for _, s := range list { 167 | p.buf.WriteString(s) 168 | } 169 | } 170 | 171 | func (b *printer) WriteString(s string) (int, error) { 172 | if b.writeMode == writeMarkdown { 173 | for i := 0; i < len(s); i++ { 174 | if s[i] == '\n' { 175 | panic("Write \\n") 176 | } 177 | } 178 | } 179 | return b.buf.WriteString(s) 180 | } 181 | 182 | 
func (b *printer) push(s string) int { 183 | n := len(b.prefix) 184 | b.prefix = append(b.prefix, s...) 185 | return n 186 | } 187 | 188 | func (b *printer) pop(n int) { 189 | b.prefix = b.prefix[:n] 190 | } 191 | -------------------------------------------------------------------------------- /big_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | "testing" 11 | ) 12 | 13 | var rep = strings.Repeat 14 | 15 | func repf(f func(int) string, n int) string { 16 | out := make([]string, n) 17 | for i := 0; i < n; i++ { 18 | out[i] = f(i) 19 | } 20 | return strings.Join(out, "") 21 | } 22 | 23 | // Many cases here derived from cmark-gfm/test/pathological_tests.py 24 | 25 | var bigTests = []struct { 26 | name string 27 | in string 28 | out string 29 | }{ 30 | { 31 | "nested strong emph", 32 | rep("*a **a ", 65000) + "b" + rep(" a** a*", 65000), 33 | "

" + rep("a a ", 65000) + "b" + rep(" a a", 65000) + "

\n", 34 | }, 35 | { 36 | "many emph closers with no openers", 37 | rep("a_ ", 65000), 38 | "", 39 | }, 40 | { 41 | "many emph openers with no closers", 42 | rep("_a ", 65000), 43 | "", 44 | }, 45 | { 46 | "many link closers with no openers", 47 | rep("a]", 65000), 48 | "", 49 | }, 50 | { 51 | "many link openers with no closers", 52 | rep("[a", 65000), 53 | "", 54 | }, 55 | { 56 | "mismatched openers and closers", 57 | rep("*a_ ", 50000), 58 | "", 59 | }, 60 | { 61 | "openers and closers multiple of 3", 62 | "a**b" + rep("c* ", 50000), 63 | "", 64 | }, 65 | { 66 | "link openers and emph closers", 67 | rep("[ a_", 50000), 68 | "", 69 | }, 70 | { 71 | "pattern [ (]( repeated", 72 | rep("[ (](", 80000), 73 | "", 74 | }, 75 | { 76 | "pattern ![[]() repeated", 77 | rep("![[]()", 160000), 78 | "

" + rep(`![`, 160000) + "

\n", 79 | }, 80 | { 81 | "hard link/emph case", 82 | "**x [a*b**c*](d)", 83 | `

**x ab**c

` + "\n", 84 | }, 85 | { 86 | "nested brackets", 87 | rep("[", 50000) + "a" + rep("]", 50000), 88 | "", 89 | }, 90 | { 91 | "nested block quotes", 92 | rep("> ", 50000) + "a", 93 | rep("
\n", 50000) + "

a

\n" + rep("
\n", 50000), 94 | }, 95 | { 96 | "deeply nested lists", 97 | repf(func(x int) string { return rep(" ", x) + "* a\n" }, 4000), 98 | "
    \n" + rep("
  • a\n
      \n", 4000-1) + "
    • a
    • \n" + rep("
    \n
  • \n", 4000-1) + "
\n", 99 | }, 100 | { 101 | "backticks", 102 | repf(func(x int) string { return "e" + rep("`", x) }, 5000), 103 | "", 104 | }, 105 | { 106 | "backticks2", 107 | repf(func(x int) string { return "e" + rep("`", 5000-x) }, 5000), 108 | "", 109 | }, 110 | { 111 | "unclosed links A", 112 | rep("[a](" + rep("[a](<b", 30000) + "

\n", 114 | }, 115 | { 116 | "unclosed links B", 117 | rep("[a](b", 30000), 118 | "", 119 | }, 120 | { 121 | "unclosed links C", 122 | rep("[a](b\\#", 30000), 123 | "

" + rep("[a](b#", 30000) + "

\n", 124 | }, 125 | { 126 | "unclosed 336 | The `go` subcommands now accept 337 | `-C` `` to change directory to \ 338 | before performing the command, which may be useful for scripts that need to 339 | execute commands in multiple different modules. 340 | -- want -- 341 | 342 | 343 | The `go` subcommands now accept 344 | `-C` `` to change directory to \ 345 | before performing the command, which may be useful for scripts that need to 346 | execute commands in multiple different modules. 347 | -------------------------------------------------------------------------------- /heading.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "fmt" 9 | "strings" 10 | ) 11 | 12 | // A Heading is a [Block] representing an [ATX heading] or 13 | // [Setext heading], usually displayed with the

through

tags. 14 | // 15 | // [ATX heading]: https://spec.commonmark.org/0.31.2/#atx-headings 16 | // [Setext heading]: https://spec.commonmark.org/0.31.2/#setext-headings 17 | type Heading struct { 18 | Position 19 | 20 | // Level is the heading level: 1 through 6. 21 | // Other values are clamped to the valid range. 22 | Level int 23 | 24 | // Text is the text of the heading. 25 | Text *Text 26 | 27 | // ID is the HTML id attribute. 28 | // The parser populates this field if [Parser.HeadingID] is true 29 | // and the heading ends with text like "{#id}". 30 | ID string 31 | } 32 | 33 | func (*Heading) Block() {} 34 | 35 | // level returns the effective level, clamping Level to the range [1, 6]. 36 | func (h *Heading) level() int { 37 | return max(1, min(6, h.Level)) 38 | } 39 | 40 | func (b *Heading) printHTML(p *printer) { 41 | fmt.Fprintf(p, "') 46 | b.Text.printHTML(p) 47 | fmt.Fprintf(p, "\n", b.level()) 48 | } 49 | 50 | func (b *Heading) printMarkdown(p *printer) { 51 | p.maybeNL() 52 | 53 | // TODO: handle setext headings properly. 54 | for i := b.level(); i > 0; i-- { 55 | p.WriteByte('#') 56 | } 57 | p.WriteByte(' ') 58 | b.Text.printMarkdown(p) 59 | if b.ID != "" { 60 | fmt.Fprintf(p, " {#%s}", b.ID) 61 | } 62 | } 63 | 64 | // startATXHeading is a [starter] for an ATX [Heading], like "## Heading". 65 | // 66 | // See https://spec.commonmark.org/0.31.2/#atx-headings. 67 | func startATXHeading(p *parser, s line) (line, bool) { 68 | n, ok := trimATX(&s) 69 | if !ok { 70 | return s, false 71 | } 72 | text := trimRightSpaceTab(s.string()) 73 | 74 | // Remove any number of trailing '#'s if preceded by a space or tab. 75 | if inner := strings.TrimRight(text, "#"); inner != trimRightSpaceTab(inner) || inner == "" { 76 | text = inner 77 | } 78 | 79 | // Extract id if extension is enabled. 80 | var id string 81 | if p.HeadingID { 82 | // Extension: Parse and remove ID attribute. 
83 | // It must come before trailing '#'s to more closely follow the spec: 84 | // The optional closing sequence of #s must be preceded by spaces or tabs 85 | // and may be followed by spaces or tabs only. 86 | // But Goldmark allows it to come after. 87 | text, id = trimHeadingID(p, text) 88 | } 89 | 90 | pos := Position{p.lineno, p.lineno} 91 | p.doneBlock(&Heading{pos, n, p.newText(pos, text), id}) // TODO rename doneBlock? 92 | return line{}, true 93 | } 94 | 95 | // trimHeadingID trims an {#id} suffix from s if one is present, 96 | // returning the prefix before the {#id} and the id. 97 | // If there is no {#id} suffix, trimHeadingID returns s, "". 98 | // The {#id} suffix can be followed by spaces or tabs, which are 99 | // ignored and discarded. 100 | func trimHeadingID(p *parser, s string) (text, id string) { 101 | text = s // failure result 102 | i := strings.LastIndexByte(s, '{') 103 | if i < 0 { 104 | return 105 | } 106 | j := i + strings.IndexByte(s[i:], '}') 107 | if j < i || trimRightSpaceTab(s[j+1:]) != "" { 108 | return 109 | } 110 | if j == i+1 || j == i+2 && s[i+1] == '#' { 111 | p.corner = true // goldmark accepts {} and {#} 112 | return 113 | } 114 | if s[i+1] != '#' { 115 | return 116 | } 117 | text, id = s[:i], strings.TrimSpace(s[i+2:j]) // TODO maybe trimSpace? 118 | 119 | // Goldmark is strict about the id syntax. 120 | for i := range len(id) { 121 | if c := id[i]; c >= 0x80 || !isLetterDigit(byte(c)) { 122 | p.corner = true 123 | } 124 | } 125 | 126 | return 127 | } 128 | 129 | // startSetextHeading is a [starter] for a Setext [Heading], which is an 130 | // underlined paragraph of text. The paragraph is assumed to have 131 | // been parsed already; startSetextHeading looks for the underline. 132 | // 133 | // See https://spec.commonmark.org/0.31.2/#setext-headings. 134 | func startSetextHeading(p *parser, s line) (line, bool) { 135 | // Topmost block must be a paragraph. 
136 | if p.nextB() != p.para() { 137 | return s, false 138 | } 139 | 140 | // Need Setext underline. 141 | t := s 142 | level, ok := trimSetext(&t) 143 | if !ok { 144 | return s, false 145 | } 146 | 147 | // The Setext heading forces an end-of-paragraph, 148 | // but this still may not be a Setext heading if the paragraph 149 | // closer decides this wasn't a paragraph after all. 150 | // Might turn out to be a link reference, for example. 151 | // Close active paragraph to find out. 152 | p.closeBlock() 153 | para, ok := p.last().(*Paragraph) 154 | if !ok { 155 | // Paragraph text didn't end in a paragraph after all. 156 | // Leave underline text for processing by something else. 157 | return s, false 158 | } 159 | 160 | p.deleteLast() 161 | p.doneBlock(&Heading{Position{para.StartLine, p.lineno}, level, para.Text, ""}) 162 | return line{}, true 163 | } 164 | 165 | // trimATX trims an ATX heading prefix 166 | // (optional spaces and then 1-6 #s followed by a space) from s, 167 | // reporting the heading level and whether it was successful. 168 | // If trimATX is unsuccessful, it leaves s unmodified. 169 | func trimATX(s *line) (level int, ok bool) { 170 | t := *s 171 | t.trimSpace(0, 3, false) 172 | if !t.trim('#') { 173 | return 174 | } 175 | n := 1 176 | for n < 6 && t.trim('#') { 177 | n++ 178 | } 179 | if !t.trimSpace(1, 1, true) { 180 | return 181 | } 182 | *s = t 183 | return n, true 184 | } 185 | 186 | // trimSetext trims a Setext heading underline 187 | // (optional spaces and then only -'s or ='s 188 | // followed by optional spaces and EOL) from s, 189 | // reporting the heading level and whether it was successful. 190 | // If trimSetext is unsuccessful, it leaves s unmodified. 
191 | func trimSetext(s *line) (level int, ok bool) { 192 | t := *s 193 | t.trimSpace(0, 3, false) 194 | c := t.peek() 195 | if c != '-' && c != '=' { 196 | return 197 | } 198 | for t.trim(c) { 199 | } 200 | t.skipSpace() 201 | if !t.eof() { 202 | return 203 | } 204 | level = 1 205 | if c == '-' { 206 | level = 2 207 | } 208 | *s = line{} 209 | return level, true 210 | } 211 | -------------------------------------------------------------------------------- /code.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strings" 9 | ) 10 | 11 | // A CodeBlock is a [Block] representing an [indented code block] 12 | // or [fenced code block], 13 | // usually displayed in
 tags.
 14 | //
 15 | // When printing a CodeBlock as Markdown, the Fence field is used as
 16 | // a starting hint but is made longer as needed if the suggested fence text
 17 | // appears in Text.
 18 | //
 19 | // [indented code block]: https://spec.commonmark.org/0.31.2/#indented-code-blocks
 20 | // [fenced code block]: https://spec.commonmark.org/0.31.2/#fenced-code-blocks
 21 | type CodeBlock struct {
 22 | 	Position
 23 | 	Fence string   // fence to use
 24 | 	Info  string   // info following open fence
 25 | 	Text  []string // lines of code block
 26 | }
 27 | 
 28 | func (*CodeBlock) Block() {}
 29 | 
 30 | func (b *CodeBlock) printHTML(p *printer) {
 31 | 	p.html("
")
 50 | 	for _, s := range b.Text {
 51 | 		p.text(s, "\n")
 52 | 	}
 53 | 	p.html("
\n") 54 | } 55 | 56 | func (b *CodeBlock) printMarkdown(p *printer) { 57 | if b.Fence == "" { 58 | p.maybeNL() 59 | for i, line := range b.Text { 60 | if i > 0 { 61 | p.nl() 62 | } 63 | p.md(" ") 64 | p.md(line) 65 | p.noTrim() 66 | } 67 | } else { 68 | // TODO compute correct fence 69 | if p.tight == 0 { 70 | p.maybeNL() 71 | } 72 | p.md(b.Fence) 73 | p.md(b.Info) 74 | for _, line := range b.Text { 75 | p.nl() 76 | p.md(line) 77 | p.noTrim() 78 | } 79 | p.nl() 80 | p.md(b.Fence) 81 | } 82 | } 83 | 84 | // startIndentedCodeBlock is a [starter] for an indented [CodeBlock]. 85 | // See https://spec.commonmark.org/0.31.2/#indented-code-blocks. 86 | func startIndentedCodeBlock(p *parser, s line) (line, bool) { 87 | // Line must start with 4 spaces and then not be blank. 88 | peek := s 89 | if p.para() != nil || !peek.trimSpace(4, 4, false) || peek.isBlank() { 90 | return s, false 91 | } 92 | 93 | b := &indentBuilder{} 94 | p.addBlock(b) 95 | if peek.nl != '\n' { 96 | p.corner = true // goldmark does not normalize to \n 97 | } 98 | b.text = append(b.text, peek.string()) 99 | return line{}, true 100 | } 101 | 102 | // startFencedCodeBlock is a [starter] for a fenced [CodeBlock]. 103 | // See https://spec.commonmark.org/0.31.2/#fenced-code-blocks. 104 | func startFencedCodeBlock(p *parser, s line) (line, bool) { 105 | // Line must start with fence. 106 | indent, fence, info, ok := trimFence(&s) 107 | if !ok { 108 | return s, false 109 | } 110 | 111 | // Note presence of corner cases, for testing. 112 | if fence[0] == '~' && info != "" { 113 | // goldmark does not handle info after ~~~ 114 | p.corner = true 115 | } else if info != "" && !isLetter(info[0]) { 116 | // goldmark does not allow numbered info. 117 | // goldmark does not treat a tab as introducing a new word. 
118 | p.corner = true 119 | } 120 | for _, c := range info { 121 | if isUnicodeSpace(c) { 122 | if c != ' ' { 123 | // goldmark only breaks on space 124 | p.corner = true 125 | } 126 | break 127 | } 128 | } 129 | 130 | p.addBlock(&fenceBuilder{indent, fence, info, nil}) 131 | return line{}, true 132 | } 133 | 134 | // trimFence attempts to trim leading indentation (up to 3 spaces), 135 | // a code fence, and an info string from s. 136 | // If successful, it returns those values and ok=true, leaving s empty. 137 | // If unsuccessful, it leaves s unmodified and returns ok=false. 138 | func trimFence(s *line) (indent int, fence, info string, ok bool) { 139 | t := *s 140 | indent = 0 141 | for indent < 3 && t.trimSpace(1, 1, false) { 142 | indent++ 143 | } 144 | c := t.peek() 145 | if c != '`' && c != '~' { 146 | return 147 | } 148 | 149 | f := t.string() 150 | n := 0 151 | for t.trim(c) { 152 | n++ 153 | } 154 | if n < 3 { 155 | return 156 | } 157 | 158 | txt := mdUnescaper.Replace(t.trimString()) 159 | if c == '`' && strings.Contains(txt, "`") { 160 | return 161 | } 162 | info = trimSpaceTab(txt) 163 | fence = f[:n] 164 | ok = true 165 | *s = line{} 166 | return 167 | } 168 | 169 | // An indentBuilder is a [blockBuilder] for an indented (unfenced) [CodeBlock]. 170 | type indentBuilder struct { 171 | indent string 172 | text []string 173 | } 174 | 175 | func (c *indentBuilder) extend(p *parser, s line) (line, bool) { 176 | // Extension lines must start with 4 spaces or be blank. 177 | if !s.trimSpace(4, 4, true) { 178 | return s, false 179 | } 180 | c.text = append(c.text, s.string()) 181 | if s.nl != '\n' { 182 | p.corner = true // goldmark does not normalize to \n 183 | } 184 | return line{}, true 185 | } 186 | 187 | func (b *indentBuilder) build(p *parser) Block { 188 | // Remove trailing blank lines, which are often used 189 | // just to separate the indented code block from what follows. 
190 | for len(b.text) > 0 && b.text[len(b.text)-1] == "" { 191 | b.text = b.text[:len(b.text)-1] 192 | } 193 | return &CodeBlock{p.pos(), "", "", b.text} 194 | } 195 | 196 | // A fenceBuilder is a [blockBuilder] for a fenced [CodeBlock]. 197 | type fenceBuilder struct { 198 | indent int 199 | fence string 200 | info string 201 | text []string 202 | } 203 | 204 | func (c *fenceBuilder) extend(p *parser, s line) (line, bool) { 205 | // Check for closing fence, which must be at least as long as opening fence, with no info. 206 | // The closing fence can be indented less than the opening one. 207 | peek := s 208 | if _, fence, info, ok := trimFence(&peek); ok && strings.HasPrefix(fence, c.fence) && info == "" { 209 | return line{}, false 210 | } 211 | 212 | // Otherwise trim the indentation from the fence line, if present. 213 | if !s.trimSpace(c.indent, c.indent, false) { 214 | p.corner = true // goldmark mishandles fenced blank lines with not enough spaces 215 | s.trimSpace(0, c.indent, false) 216 | } 217 | 218 | c.text = append(c.text, s.string()) 219 | p.corner = p.corner || s.nl != '\n' // goldmark does not normalize to \n 220 | return line{}, true 221 | } 222 | 223 | func (c *fenceBuilder) build(p *parser) Block { 224 | return &CodeBlock{p.pos(), c.fence, c.info, c.text} 225 | } 226 | -------------------------------------------------------------------------------- /testdata/gfm_ext.txt: -------------------------------------------------------------------------------- 1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/extensions.txt 2 | -- parser.json -- 3 | {"Strikethrough": true, "Table": true} 4 | -- 1.md -- 5 | | abc | def | 6 | | --- | --- | 7 | | ghi | jkl | 8 | | mno | pqr | 9 | -- 1.html -- 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
abcdef
ghijkl
mnopqr
28 | -- 2.md -- 29 | Hello! 30 | 31 | | _abc_ | セン | 32 | | ----- | ---- | 33 | | 1. Block elements inside cells don't work. | | 34 | | But _**inline elements do**_. | x | 35 | 36 | Hi! 37 | -- 2.html -- 38 |

Hello!

39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
abcセン
1. Block elements inside cells don't work.
But inline elements do.x
57 |

Hi!

58 | -- 3.md -- 59 | | Not enough table | to be considered table | 60 | 61 | | Not enough table | to be considered table | 62 | | Not enough table | to be considered table | 63 | 64 | | Just enough table | to be considered table | 65 | | ----------------- | ---------------------- | 66 | 67 | | ---- | --- | 68 | 69 | |x| 70 | |-| 71 | 72 | | xyz | 73 | | --- | 74 | -- 3.html -- 75 |

| Not enough table | to be considered table |

76 |

| Not enough table | to be considered table | 77 | | Not enough table | to be considered table |

78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 |
Just enough tableto be considered table
86 |

| ---- | --- |

87 | 88 | 89 | 90 | 91 | 92 | 93 |
x
94 | 95 | 96 | 97 | 98 | 99 | 100 |
xyz
101 | -- 4.md -- 102 | abc | def 103 | --- | --- 104 | xyz | ghi 105 | -- 4.html -- 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 |
abcdef
xyzghi
120 | -- 5.md -- 121 | Hello! 122 | 123 | | _abc_ | セン | 124 | | ----- | ---- | 125 | | this row has a space at the end | | ^J 126 | | But _**inline elements do**_. | x | 127 | 128 | Hi! 129 | -- 5.html -- 130 |

Hello!

131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 |
abcセン
this row has a space at the end
But inline elements do.x
149 |

Hi!

150 | -- 6.md -- 151 | aaa | bbb | ccc | ddd | eee 152 | :-- | --- | :-: | --- | --: 153 | fff | ggg | hhh | iii | jjj 154 | -- 6.html -- 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 |
aaabbbcccdddeee
fffggghhhiiijjj
175 | -- 7.md -- 176 | | a | b | c | 177 | | --- | --- | 178 | | this | isn't | okay | 179 | -- 7.html -- 180 |

| a | b | c | 181 | | --- | --- | 182 | | this | isn't | okay |

183 | -- 8.md -- 184 | | a | b | c | 185 | | --- | --- | --- 186 | | x 187 | | a | b 188 | | 1 | 2 | 3 | 4 | 5 | 189 | -- 8.html -- 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 |
abc
x
ab
123
216 | -- 9.md -- 217 | | a | b | 218 | | --- | --- | 219 | | Escaped pipes are \|okay\|. | Like \| this. | 220 | | Within `\|code\| is okay` too. | 221 | | _**`c\|`**_ \| complex 222 | | don't **\_reparse\_** 223 | -- 9.html -- 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 |
ab
Escaped pipes are |okay|.Like | this.
Within |code| is okay too.
c| | complex
don't _reparse_
250 | -- 10.md -- 251 | | a | 252 | --- | 253 | -- 10.html -- 254 | 255 | 256 | 257 | 258 | 259 | 260 |
a
261 | -- 11.md -- 262 | | a | b | 263 | | --- | --- | 264 | | \\ | `\\` | 265 | | \\\\ | `\\\\` | 266 | | \_ | `\_` | 267 | | \| | `\|` | 268 | | \a | `\a` | 269 | 270 | \\ `\\` 271 | 272 | \\\\ `\\\\` 273 | 274 | \_ `\_` 275 | 276 | \| `\|` 277 | 278 | \a `\a` 279 | -- 11.html -- 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 |
ab
\\\
\\\\\\
_\_
||
\a\a
310 |

\ \\

311 |

\\ \\\\

312 |

_ \_

313 |

| \|

314 |

\a \a

315 | -- 12.md -- 316 | | a | 317 | | --- | 318 | | hello | 319 | | ok
sure | 320 | -- 12.html -- 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 |
a
hello
ok
sure
336 | -- 13.md -- 337 | Here's a link to [Freedom Planet 2][]. 338 | 339 | | Here's a link to [Freedom Planet 2][] in a table header. | 340 | | --- | 341 | | Here's a link to [Freedom Planet 2][] in a table row. | 342 | 343 | [Freedom Planet 2]: http://www.freedomplanet2.com/ 344 | -- 13.html -- 345 |

Here's a link to Freedom Planet 2.

346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 |
Here's a link to Freedom Planet 2 in a table header.
Here's a link to Freedom Planet 2 in a table row.
358 | -- 14.md -- 359 | | a | b | c | 360 | | --- | --- | --- | 361 | | d || e | 362 | -- 14.html -- 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 |
abc
de
379 | -- 15.md -- 380 | | a | b | 381 | | --- | --- | 382 | |***(a)***| 383 | -- 15.html -- 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 |
ab
(a)
398 | -- 16.md -- 399 | 123 400 | 456 401 | | a | b | 402 | | ---| --- | 403 | d | e 404 | -- 16.html -- 405 |

123 406 | 456

407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 |
ab
de
421 | -- 17.md -- 422 | A proper ~strikethrough~. 423 | -- 17.html -- 424 |

A proper strikethrough.

425 | -- 18.md -- 426 | These are ~not strikethroughs. 427 | 428 | No, they are not~ 429 | 430 | This ~is ~ legit~ isn't ~ legit. 431 | 432 | This is not ~~~~~one~~~~~ huge strikethrough. 433 | 434 | ~one~ ~~two~~ ~~~three~~~ 435 | 436 | No ~mismatch~~ 437 | -- 18.html -- 438 |

These are ~not strikethroughs.

439 |

No, they are not~

440 |

This is ~ legit isn't ~ legit.

441 |

This is not ~~~~~one~~~~~ huge strikethrough.

442 |

one two ~~~three~~~

443 |

No ~mismatch~~

444 | -------------------------------------------------------------------------------- /parse.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strings" 9 | ) 10 | 11 | type blockBuilder interface { 12 | extend(p *parser, s line) (line, bool) 13 | build(*parser) Block 14 | } 15 | 16 | type openBlock struct { 17 | builder blockBuilder 18 | inner []Block 19 | pos Position 20 | } 21 | 22 | func (p *parser) last() Block { 23 | ob := &p.stack[len(p.stack)-1] 24 | return ob.inner[len(ob.inner)-1] 25 | } 26 | 27 | func (p *parser) deleteLast() { 28 | ob := &p.stack[len(p.stack)-1] 29 | ob.inner = ob.inner[:len(ob.inner)-1] 30 | } 31 | 32 | type rootBuilder struct{} 33 | 34 | func (b *rootBuilder) build(p *parser) Block { 35 | return &Document{p.pos(), p.blocks(), p.links} 36 | } 37 | 38 | // A Parser is a Markdown parser. 39 | // The exported fields in the struct can be filled in before calling 40 | // [Parser.Parse] in order to customize the details of the parsing process. 41 | // A Parser is safe for concurrent use by multiple goroutines. 42 | type Parser struct { 43 | // HeadingID determines whether the parser accepts 44 | // the {#hdr} syntax for an HTML id="hdr" attribute on headings. 45 | // For example, if HeadingIDs is true then the Markdown 46 | // ## Overview {#overview} 47 | // will render as the HTML 48 | //

Overview

49 | HeadingID bool 50 | 51 | // Strikethrough determines whether the parser accepts 52 | // ~abc~ and ~~abc~~ as strikethrough syntax, producing 53 | // abc in HTML. 54 | Strikethrough bool 55 | 56 | // TaskList determines whether the parser accepts 57 | // “task list items” as defined in GitHub Flavored Markdown. 58 | // When a list item begins with the plain text [ ] or [x] 59 | // that turns into an unchecked or checked check box. 60 | TaskList bool 61 | 62 | // TODO 63 | AutoLinkText bool 64 | AutoLinkAssumeHTTP bool 65 | 66 | // TODO 67 | Table bool 68 | 69 | // TODO 70 | Emoji bool 71 | 72 | // TODO 73 | SmartDot bool 74 | SmartDash bool 75 | SmartQuote bool 76 | 77 | // TODO 78 | Footnote bool 79 | } 80 | 81 | type parser struct { 82 | *Parser 83 | 84 | corner bool // noticed corner case to ignore in cross-implementation testing 85 | 86 | root *Document 87 | links map[string]*Link 88 | lineno int 89 | stack []openBlock 90 | lineDepth int 91 | lineInfo 92 | 93 | // texts to apply inline processing to 94 | texts []textRaw 95 | 96 | footnotes map[string]*Footnote 97 | 98 | // inline parsing 99 | s string 100 | emitted int // s[:emitted] has been emitted into list 101 | list []Inline 102 | 103 | backticks backtickParser 104 | 105 | fixups []func() 106 | } 107 | 108 | func (p *parser) addFixup(f func()) { 109 | p.fixups = append(p.fixups, f) 110 | } 111 | 112 | type lineInfo struct { 113 | noDeclEnd bool // no > on line 114 | noCommentEnd bool // no --> on line 115 | noProcInstEnd bool // no ?> on line 116 | noCDATAEnd bool // ]]> on line 117 | } 118 | 119 | type textRaw struct { 120 | *Text 121 | raw string 122 | } 123 | 124 | func (p *parser) newText(pos Position, text string) *Text { 125 | b := &Text{Position: pos} 126 | p.texts = append(p.texts, textRaw{b, text}) 127 | return b 128 | } 129 | 130 | func (p *parser) blocks() []Block { 131 | b := &p.stack[len(p.stack)-1] 132 | return b.inner 133 | } 134 | 135 | func (p *parser) pos() Position { 136 | b := 
&p.stack[len(p.stack)-1] 137 | return b.pos 138 | } 139 | 140 | func (p *Parser) Parse(text string) *Document { 141 | d, _ := p.parse(text) 142 | return d 143 | } 144 | 145 | func (p *Parser) parse(text string) (d *Document, corner bool) { 146 | var ps parser 147 | ps.Parser = p 148 | if strings.Contains(text, "\x00") { 149 | text = strings.ReplaceAll(text, "\x00", "\uFFFD") 150 | ps.corner = true // goldmark does not replace NUL 151 | } 152 | 153 | ps.lineDepth = -1 154 | ps.addBlock(&rootBuilder{}) 155 | for text != "" { 156 | end := 0 157 | for end < len(text) && text[end] != '\n' && text[end] != '\r' { 158 | end++ 159 | } 160 | ln := text[:end] 161 | text = text[end:] 162 | nl := byte(0) 163 | switch { 164 | case len(text) >= 2 && text[0] == '\r' && text[1] == '\n': 165 | nl = '\r' + '\n' 166 | text = text[2:] 167 | case len(text) >= 1: 168 | nl = text[0] 169 | text = text[1:] 170 | } 171 | ps.lineno++ 172 | ps.addLine(makeLine(ln, nl)) 173 | } 174 | ps.trimStack(0) 175 | 176 | for _, t := range ps.texts { 177 | t.Inline = ps.inline(t.raw) 178 | } 179 | 180 | for _, f := range ps.fixups { 181 | f() 182 | } 183 | 184 | // TODO move into its own function 185 | var fixBlock func(Block) 186 | 187 | fixBlocks := func(blocks []Block) []Block { 188 | keep := blocks[:0] 189 | for _, b := range blocks { 190 | fixBlock(b) 191 | if _, ok := b.(*Empty); ok { 192 | continue 193 | } 194 | keep = append(keep, b) 195 | } 196 | return keep 197 | } 198 | 199 | fixBlock = func(x Block) { 200 | switch x := x.(type) { 201 | case *Document: 202 | x.Blocks = fixBlocks(x.Blocks) 203 | case *Quote: 204 | x.Blocks = fixBlocks(x.Blocks) 205 | case *List: 206 | for _, item := range x.Items { 207 | fixBlock(item) 208 | } 209 | case *Item: 210 | x.Blocks = fixBlocks(x.Blocks) 211 | } 212 | } 213 | 214 | fixBlock(ps.root) 215 | 216 | return ps.root, ps.corner 217 | } 218 | 219 | func (p *parser) curB() blockBuilder { 220 | if p.lineDepth < len(p.stack) { 221 | return 
p.stack[p.lineDepth].builder 222 | } 223 | return nil 224 | } 225 | 226 | func (p *parser) nextB() blockBuilder { 227 | if p.lineDepth+1 < len(p.stack) { 228 | return p.stack[p.lineDepth+1].builder 229 | } 230 | return nil 231 | } 232 | func (p *parser) trimStack(depth int) { 233 | if len(p.stack) < depth { 234 | // unreachable 235 | panic("trimStack") 236 | } 237 | for len(p.stack) > depth { 238 | p.closeBlock() 239 | } 240 | } 241 | 242 | func (p *parser) addBlock(c blockBuilder) { 243 | p.trimStack(p.lineDepth + 1) 244 | p.stack = append(p.stack, openBlock{}) 245 | ob := &p.stack[len(p.stack)-1] 246 | ob.builder = c 247 | ob.pos.StartLine = p.lineno 248 | ob.pos.EndLine = p.lineno 249 | } 250 | 251 | func (p *parser) doneBlock(b Block) { 252 | p.trimStack(p.lineDepth + 1) 253 | ob := &p.stack[len(p.stack)-1] 254 | ob.inner = append(ob.inner, b) 255 | } 256 | 257 | func (p *parser) para() *paraBuilder { 258 | if b, ok := p.stack[len(p.stack)-1].builder.(*paraBuilder); ok { 259 | return b 260 | } 261 | return nil 262 | } 263 | 264 | func (p *parser) closeBlock() Block { 265 | b := &p.stack[len(p.stack)-1] 266 | if b.builder == nil { 267 | println("closeBlock", len(p.stack)-1) 268 | } 269 | blk := b.builder.build(p) 270 | p.stack = p.stack[:len(p.stack)-1] 271 | if len(p.stack) > 0 { 272 | b := &p.stack[len(p.stack)-1] 273 | b.inner = append(b.inner, blk) 274 | // _ = b 275 | } else { 276 | p.root = blk.(*Document) 277 | } 278 | return blk 279 | } 280 | 281 | func (p *parser) link(label string) *Link { 282 | return p.links[label] 283 | } 284 | 285 | func (p *parser) defineLink(label string, link *Link) { 286 | if p.links == nil { 287 | p.links = make(map[string]*Link) 288 | } 289 | p.links[label] = link 290 | } 291 | 292 | func (p *parser) addLine(s line) { 293 | // Process continued prefixes. 
294 | p.lineDepth = 0 295 | for ; p.lineDepth+1 < len(p.stack); p.lineDepth++ { 296 | old := s 297 | var ok bool 298 | s, ok = p.stack[p.lineDepth+1].builder.extend(p, s) 299 | // Note: s != old is efficient only because s.text is either the same string (same pointer, len) 300 | // as old.text or has a different length or is empty; either way so there is no actual data comparison. 301 | // Sometimes s.text = "" and there is still 302 | if (ok || s != old) && !old.isBlank() { 303 | p.stack[p.lineDepth+1].pos.EndLine = p.lineno 304 | } 305 | if !ok { 306 | break 307 | } 308 | } 309 | 310 | if s.isBlank() { 311 | p.trimStack(p.lineDepth + 1) 312 | return 313 | } 314 | 315 | // Process new prefixes, if any. 316 | Prefixes: 317 | // Start new block inside p.stack[depth]. 318 | for _, fn := range starters { 319 | if l, ok := fn(p, s); ok { 320 | s = l 321 | if s.isBlank() { 322 | return 323 | } 324 | p.lineDepth++ 325 | goto Prefixes 326 | } 327 | } 328 | 329 | startParagraph(p, s) 330 | } 331 | 332 | func (c *rootBuilder) extend(p *parser, s line) (line, bool) { 333 | // unreachable 334 | panic("root extend") 335 | } 336 | 337 | type starter func(*parser, line) (line, bool) 338 | 339 | var starters = []starter{ 340 | startIndentedCodeBlock, 341 | startFencedCodeBlock, 342 | startBlockQuote, 343 | startATXHeading, 344 | startSetextHeading, 345 | startThematicBreak, 346 | startListItem, 347 | startHTMLBlock, 348 | startFootnote, 349 | } 350 | -------------------------------------------------------------------------------- /table.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strings" 9 | "unicode/utf8" 10 | ) 11 | 12 | // A Table is a [Block] representing a [table], a GitHub-flavored Markdown extension. 
13 | // 14 | // [table]: https://github.github.com/gfm/#tables-extension- 15 | type Table struct { 16 | Position 17 | Header []*Text // header row (slice of columns) 18 | Align []string // alignment for columns: "left", "center", "right"; "" for unset 19 | Rows [][]*Text // data rows (slices of columns, not necessarily all same width) 20 | } 21 | 22 | func (*Table) Block() {} 23 | 24 | func (t *Table) printHTML(p *printer) { 25 | p.html("\n") 26 | p.html("\n") 27 | p.html("\n") 28 | for i, hdr := range t.Header { 29 | p.html("") 34 | hdr.printHTML(p) 35 | p.html("\n") 36 | } 37 | p.html("\n") 38 | p.html("\n") 39 | if len(t.Rows) > 0 { 40 | p.html("\n") 41 | for _, row := range t.Rows { 42 | p.html("\n") 43 | for i, cell := range row { 44 | p.html("") 49 | cell.printHTML(p) 50 | p.html("\n") 51 | } 52 | p.html("\n") 53 | } 54 | p.html("\n") 55 | } 56 | p.html("
\n") 57 | } 58 | 59 | func (t *Table) printMarkdown(p *printer) { 60 | // TODO: double-check this 61 | // inline all Text values in Header and Rows to 62 | // get final, rendered widths 63 | var ( 64 | hdr = make([]string, len(t.Header)) 65 | rows = make([][]string, 0, len(t.Rows)) 66 | maxWidths = make([]int, len(t.Header)) 67 | 68 | xb = &printer{} 69 | xs string 70 | ) 71 | 72 | toString := func(txt *Text) string { 73 | xb.buf.Reset() 74 | txt.printMarkdown(xb) 75 | return strings.TrimSpace(xb.buf.String()) 76 | } 77 | 78 | for i, txt := range t.Header { 79 | xs = toString(txt) 80 | hdr[i] = xs 81 | maxWidths[i] = utf8.RuneCountInString(xs) 82 | } 83 | 84 | for _, row := range t.Rows { 85 | xrow := make([]string, len(hdr)) 86 | for j := range t.Header { 87 | xs = toString(row[j]) 88 | xrow[j] = xs 89 | if n := utf8.RuneCountInString(xs); n > maxWidths[j] { 90 | maxWidths[j] = n 91 | } 92 | } 93 | rows = append(rows, xrow) 94 | } 95 | 96 | p.maybeQuoteNL('|') 97 | for i, cell := range hdr { 98 | p.WriteString("| ") 99 | pad(p, cell, t.Align[i], maxWidths[i]) 100 | p.WriteString(" ") 101 | } 102 | p.WriteString("|") 103 | 104 | p.nl() 105 | for i, a := range t.Align { 106 | w := maxWidths[i] 107 | p.WriteString("| ") 108 | switch a { 109 | case "left": 110 | p.WriteString(":") 111 | repeat(p, '-', w-1) 112 | case "center": 113 | p.WriteString(":") 114 | repeat(p, '-', w-2) 115 | p.WriteString(":") 116 | case "right": 117 | repeat(p, '-', w-1) 118 | p.WriteString(":") 119 | default: 120 | repeat(p, '-', w) 121 | } 122 | p.WriteString(" ") 123 | } 124 | p.WriteString("|") 125 | 126 | for _, row := range rows { 127 | p.nl() 128 | for i := range t.Header { 129 | p.WriteString("| ") 130 | pad(p, row[i], t.Align[i], maxWidths[i]) 131 | p.WriteString(" ") 132 | } 133 | p.WriteString("|") 134 | } 135 | } 136 | 137 | // repeat prints c n times to p. 
138 | func repeat(p *printer, c byte, n int) { 139 | for i := 0; i < n; i++ { 140 | p.WriteByte(c) 141 | } 142 | } 143 | 144 | // pad prints text to p aligned according to align, 145 | // aiming for a width of w runes. 146 | // It can happen that multiple runes appear as a single “character”, 147 | // which will break the alignment, but this is the best we can do for now. 148 | func pad(p *printer, text, align string, w int) { 149 | n := w - utf8.RuneCountInString(text) 150 | switch align { 151 | default: 152 | p.WriteString(text) 153 | repeat(p, ' ', n) 154 | case "right": 155 | repeat(p, ' ', n) 156 | p.WriteString(text) 157 | case "center": 158 | repeat(p, ' ', n/2) 159 | p.WriteString(text) 160 | repeat(p, ' ', n-n/2) 161 | } 162 | } 163 | 164 | // A tableTrimmed is a table row with the outer pipes (if any) removed. 165 | // It is a separate type to avoid accidentally trimming the outer pipes multiple times, 166 | // which would instead discard outer empty cells. 167 | type tableTrimmed string 168 | 169 | // isTableSpace reports whether c is a space as far as tables are concerned. 170 | func isTableSpace(c byte) bool { 171 | return c == ' ' || c == '\t' || c == '\v' || c == '\f' 172 | } 173 | 174 | // tableTrimSpace returns s with table space prefixes and suffixes removed. 175 | func tableTrimSpace(s string) string { 176 | i := 0 177 | for i < len(s) && isTableSpace(s[i]) { 178 | i++ 179 | } 180 | j := len(s) 181 | for j > i && isTableSpace(s[j-1]) { 182 | j-- 183 | } 184 | return s[i:j] 185 | } 186 | 187 | // tableTrimOuter trims the outer | |, if any, from the row. 188 | func tableTrimOuter(row string) tableTrimmed { 189 | row = tableTrimSpace(row) 190 | if len(row) > 0 && row[0] == '|' { 191 | row = row[1:] 192 | } 193 | if len(row) > 0 && row[len(row)-1] == '|' { 194 | row = row[:len(row)-1] 195 | } 196 | return tableTrimmed(row) 197 | } 198 | 199 | // isTableStart reports whether the pair of lines hdr1, delim1 200 | // are a valid table start. 
201 | func isTableStart(hdr1, delim1 string) bool { 202 | // Scan potential delimiter string, counting columns. 203 | // This happens on every line of text, 204 | // so make it relatively quick - nothing expensive. 205 | col := 0 206 | delim := tableTrimOuter(delim1) 207 | i := 0 208 | for ; ; col++ { 209 | for i < len(delim) && isTableSpace(delim[i]) { 210 | i++ 211 | } 212 | if i >= len(delim) { 213 | break 214 | } 215 | if i < len(delim) && delim[i] == ':' { 216 | i++ 217 | } 218 | if i >= len(delim) || delim[i] != '-' { 219 | return false 220 | } 221 | i++ 222 | for i < len(delim) && delim[i] == '-' { 223 | i++ 224 | } 225 | if i < len(delim) && delim[i] == ':' { 226 | i++ 227 | } 228 | for i < len(delim) && isTableSpace(delim[i]) { 229 | i++ 230 | } 231 | if i < len(delim) && delim[i] == '|' { 232 | i++ 233 | } 234 | } 235 | 236 | if tableTrimSpace(hdr1) == "|" { 237 | // https://github.com/github/cmark-gfm/pull/127 and 238 | // https://github.com/github/cmark-gfm/pull/128 239 | // fixed a buffer overread by rejecting | by itself as a table line. 240 | // That seems to violate the “spec”, but we will play along. 241 | return false 242 | } 243 | 244 | return col == tableCount(tableTrimOuter(hdr1)) 245 | } 246 | 247 | // tableCount returns the number of columns in the row. 248 | func tableCount(row tableTrimmed) int { 249 | col := 1 250 | prev := byte(0) 251 | for i := 0; i < len(row); i++ { 252 | c := row[i] 253 | if c == '|' && prev != '\\' { 254 | col++ 255 | } 256 | prev = c 257 | } 258 | return col 259 | } 260 | 261 | // A tableBuilder is a [blockBuilder] for a [Table]. 262 | type tableBuilder struct { 263 | hdr tableTrimmed // header line 264 | delim tableTrimmed // delimiter line 265 | rows []tableTrimmed // data lines 266 | } 267 | 268 | // start starts the builder with the given header and delimiter lines. 
269 | func (b *tableBuilder) start(hdr, delim string) { 270 | b.hdr = tableTrimOuter(hdr) 271 | b.delim = tableTrimOuter(delim) 272 | } 273 | 274 | // addRow adds a new row to the table. 275 | func (b *tableBuilder) addRow(row string) { 276 | b.rows = append(b.rows, tableTrimOuter(row)) 277 | } 278 | 279 | // build returns the [Table] for this tableBuilder. 280 | func (b *tableBuilder) build(p *parser) Block { 281 | pos := p.pos() 282 | pos.StartLine-- // builder does not count header 283 | pos.EndLine = pos.StartLine + 1 + len(b.rows) 284 | t := &Table{ 285 | Position: pos, 286 | } 287 | width := tableCount(b.hdr) 288 | t.Header = b.parseRow(p, b.hdr, pos.StartLine, width) 289 | t.Align = b.parseAlign(b.delim, width) 290 | t.Rows = make([][]*Text, len(b.rows)) 291 | for i, row := range b.rows { 292 | t.Rows[i] = b.parseRow(p, row, pos.StartLine+2+i, width) 293 | } 294 | return t 295 | } 296 | 297 | // parseRow TODO explain 298 | func (b *tableBuilder) parseRow(p *parser, row tableTrimmed, line int, width int) []*Text { 299 | out := make([]*Text, 0, width) 300 | pos := Position{StartLine: line, EndLine: line} 301 | start := 0 302 | unesc := nop 303 | for i := 0; i < len(row); i++ { 304 | c := row[i] 305 | if c == '\\' && i+1 < len(row) && row[i+1] == '|' { 306 | unesc = tableUnescape 307 | i++ 308 | continue 309 | } 310 | if c == '|' { 311 | out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:i]), " \t\v\f")))) 312 | if len(out) == width { 313 | // Extra cells are discarded! 314 | return out 315 | } 316 | start = i + 1 317 | unesc = nop 318 | } 319 | } 320 | out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:]), " \t\v\f")))) 321 | for len(out) < width { 322 | // Missing cells are considered empty. 
323 | out = append(out, p.newText(pos, "")) 324 | } 325 | return out 326 | } 327 | 328 | func nop(text string) string { 329 | return text 330 | } 331 | 332 | // tableUnescape TODO 333 | func tableUnescape(text string) string { 334 | out := make([]byte, 0, len(text)) 335 | for i := 0; i < len(text); i++ { 336 | c := text[i] 337 | if c == '\\' && i+1 < len(text) && text[i+1] == '|' { 338 | i++ 339 | c = '|' 340 | } 341 | out = append(out, c) 342 | } 343 | return string(out) 344 | } 345 | 346 | // parseAlign TODO 347 | func (b *tableBuilder) parseAlign(delim tableTrimmed, n int) []string { 348 | align := make([]string, 0, tableCount(delim)) 349 | start := 0 350 | for i := 0; i < len(delim); i++ { 351 | if delim[i] == '|' { 352 | align = append(align, tableAlign(string(delim[start:i]))) 353 | start = i + 1 354 | } 355 | } 356 | align = append(align, tableAlign(string(delim[start:]))) 357 | return align 358 | } 359 | 360 | // tableAlign TODO 361 | func tableAlign(cell string) string { 362 | cell = tableTrimSpace(cell) 363 | l := cell[0] == ':' 364 | r := cell[len(cell)-1] == ':' 365 | switch { 366 | case l && r: 367 | return "center" 368 | case l: 369 | return "left" 370 | case r: 371 | return "right" 372 | } 373 | return "" 374 | } 375 | -------------------------------------------------------------------------------- /list.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "fmt" 9 | "strconv" 10 | ) 11 | 12 | // TODO should Item implement Block? 13 | // maybe make a itemBlock internal Block for use with the builders? 14 | 15 | // A List is a [Block] representing a [list], 16 | // either an unordered (bullet) list 17 | // or an ordered (numbered) list. 
18 | // 19 | // Lists can be [loose or tight], which controls the spacing between list items. 20 | // In Markdown, a list is loose when there is a blank line 21 | // between any two list items, or when any list item 22 | // directly contains two blocks that are separated by a blank line. 23 | // (Note that because paragraphs must be separated by blank lines, 24 | // any multi-paragraph item necessarily creates a loose list.) 25 | // When rendering HTML, loose list items are formatted in the usual way. 26 | // For tight lists, a list item consisting of a single paragraph omits 27 | // the

...

tags around the paragraph text. 28 | // 29 | // [list]: https://spec.commonmark.org/0.31.2/#lists 30 | // [loose or tight]: https://spec.commonmark.org/0.31.2/#loose 31 | type List struct { 32 | Position 33 | 34 | // Bullet is the bullet character used in the list: '-', '+', or '*'. 35 | // For an ordered list, Bullet is the character following the number: '.' or ')'. 36 | Bullet rune 37 | 38 | // Start is the number of the first item in an ordered list. 39 | Start int 40 | 41 | // Loose indicates whether the list is loose. 42 | // (See the [List] doc comment for details.) 43 | Loose bool 44 | 45 | // Items is the list's items. 46 | // TODO: Should this be []*Item or Blocks? 47 | Items []Block // always *Item 48 | } 49 | 50 | func (*List) Block() {} 51 | 52 | // Ordered reports whether the list is ordered (numbered). 53 | func (l *List) Ordered() bool { 54 | return l.Bullet == '.' || l.Bullet == ')' 55 | } 56 | 57 | // An Item is a [Block] representing a [list item]. 58 | // 59 | // [list item]: https://spec.commonmark.org/0.31.2/#list-items 60 | type Item struct { 61 | Position 62 | 63 | // Blocks is the item content. 64 | Blocks []Block 65 | } 66 | 67 | func (*Item) Block() {} 68 | 69 | func (b *List) printHTML(p *printer) { 70 | if b.Bullet == '.' || b.Bullet == ')' { 71 | p.html("\n") 76 | } else { 77 | p.html("
    \n") 78 | } 79 | for _, item := range b.Items { 80 | item.printHTML(p) 81 | } 82 | if b.Bullet == '.' || b.Bullet == ')' { 83 | p.html("\n") 84 | } else { 85 | p.html("
\n") 86 | } 87 | } 88 | 89 | func (b *Item) printHTML(p *printer) { 90 | p.html("
  • ") 91 | if len(b.Blocks) > 0 { 92 | if _, ok := b.Blocks[0].(*Text); !ok { 93 | p.WriteString("\n") 94 | } 95 | } 96 | for i, c := range b.Blocks { 97 | c.printHTML(p) 98 | if i+1 < len(b.Blocks) { 99 | if _, ok := c.(*Text); ok { 100 | p.WriteString("\n") 101 | } 102 | } 103 | } 104 | p.html("
  • \n") 105 | } 106 | 107 | func (b *List) printMarkdown(p *printer) { 108 | old := p.listOut 109 | defer func() { 110 | p.listOut = old 111 | }() 112 | p.bullet = b.Bullet 113 | p.num = b.Start 114 | if b.Loose { 115 | p.loose++ 116 | } else { 117 | p.tight++ 118 | } 119 | p.maybeNL() 120 | for i, item := range b.Items { 121 | if i > 0 { 122 | p.nl() 123 | if b.Loose { 124 | p.nl() 125 | } 126 | } 127 | item.printMarkdown(p) 128 | p.num++ 129 | } 130 | } 131 | 132 | func (b *Item) printMarkdown(p *printer) { 133 | var marker string 134 | if p.bullet == '.' || p.bullet == ')' { 135 | marker = fmt.Sprintf(" %d%c ", p.num, p.bullet) 136 | } else { 137 | marker = fmt.Sprintf(" %c ", p.bullet) 138 | } 139 | p.WriteString(marker) 140 | n := len(marker) 141 | if n > 4 { 142 | n = 4 143 | } 144 | defer p.pop(p.push(" "[:n])) 145 | printMarkdownBlocks(b.Blocks, p) 146 | } 147 | 148 | // A listBuilder is a [blockBuilder] for a [List]. 149 | type listBuilder struct { 150 | // List fields 151 | bullet rune 152 | start int 153 | 154 | // item is the builder for the current item. 155 | item *itemBuilder 156 | 157 | // 158 | todo func() line 159 | } 160 | 161 | // An itemBuilder is a [blockBuilder] for an [Item]. 162 | type itemBuilder struct { 163 | list *listBuilder // list containing item 164 | width int // TODO 165 | haveContent bool // TODO 166 | } 167 | 168 | // TODO explain 169 | // startListItem is a [starter] for a list item. 170 | // The first list item in a list also starts the list itself. 
171 | func startListItem(p *parser, s line) (_ line, _ bool) { 172 | if list, ok := p.curB().(*listBuilder); ok && list.todo != nil { 173 | s = list.todo() 174 | list.todo = nil 175 | return s, true 176 | } 177 | 178 | t := s 179 | n := 0 180 | for i := 0; i < 3; i++ { 181 | if !t.trimSpace(1, 1, false) { 182 | break 183 | } 184 | n++ 185 | } 186 | bullet := t.peek() 187 | var num int 188 | Switch: 189 | switch bullet { 190 | default: 191 | return 192 | case '-', '*', '+': 193 | t.trim(bullet) 194 | n++ 195 | case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': 196 | for j := t.i; ; j++ { 197 | if j >= len(t.text) { 198 | return 199 | } 200 | c := t.text[j] 201 | if c == '.' || c == ')' { 202 | // success 203 | bullet = c 204 | j++ 205 | n += j - t.i 206 | t.i = j 207 | break Switch 208 | } 209 | if c < '0' || '9' < c { 210 | return 211 | } 212 | if j-t.i >= 9 { 213 | return 214 | } 215 | num = num*10 + int(c) - '0' 216 | } 217 | 218 | } 219 | if !t.trimSpace(1, 1, true) { 220 | return 221 | } 222 | n++ 223 | tt := t 224 | m := 0 225 | for i := 0; i < 3 && tt.trimSpace(1, 1, false); i++ { 226 | m++ 227 | } 228 | if !tt.trimSpace(1, 1, true) { 229 | n += m 230 | t = tt 231 | } 232 | 233 | // Pretty sure we have a list item now. 234 | 235 | var list *listBuilder 236 | if c, ok := p.nextB().(*listBuilder); ok { 237 | list = c 238 | } 239 | if list == nil || list.bullet != rune(bullet) { 240 | // “When the first list item in a list interrupts a paragraph—that is, 241 | // when it starts on a line that would otherwise count as 242 | // paragraph continuation text—then (a) the lines Ls must 243 | // not begin with a blank line, 244 | // and (b) if the list item is ordered, the start number must be 1.” 245 | if list == nil && p.para() != nil && (t.isBlank() || (bullet == '.' || bullet == ')') && num != 1) { 246 | // Goldmark and Dingus both seem to get this wrong 247 | // (or the words above don't mean what we think they do). 
248 | // when the paragraph that could be continued 249 | // is inside a block quote. 250 | // See testdata/extra.txt 117.md. 251 | p.corner = true 252 | return 253 | } 254 | list = &listBuilder{bullet: rune(bullet), start: num} 255 | p.addBlock(list) 256 | } 257 | b := &itemBuilder{list: list, width: n, haveContent: !t.isBlank()} 258 | list.todo = func() line { 259 | p.addBlock(b) 260 | list.item = b 261 | return t 262 | } 263 | 264 | // TODO explain s not t 265 | return s, true 266 | } 267 | 268 | func (c *listBuilder) extend(p *parser, s line) (line, bool) { 269 | // TODO explain 270 | item := c.item 271 | if item == nil && s.isBlank() { // TODO how can this happen 272 | return s, true 273 | } 274 | 275 | // If we can trim the indentation required by the current item, 276 | // do that and return true, allowing s to be passed to the 277 | // item builder. 278 | if item != nil && s.trimSpace(item.width, item.width, true) { 279 | return s, true 280 | } 281 | return s, false 282 | } 283 | 284 | func (c *itemBuilder) extend(p *parser, s line) (line, bool) { 285 | blank := s.isBlank() 286 | 287 | // If there is a blank line and no content so far, 288 | // the item is over. TODO explain 289 | if blank && !c.haveContent { 290 | return s, false 291 | } 292 | 293 | // TODO explain 294 | if blank { 295 | // Goldmark does this and apparently commonmark.js too. 296 | // Not sure why it is necessary. 297 | return line{}, true 298 | } 299 | 300 | // TODO explain 301 | if !blank { 302 | c.haveContent = true 303 | } 304 | return s, true 305 | } 306 | 307 | func (b *itemBuilder) build(p *parser) Block { 308 | b.list.item = nil 309 | return &Item{p.pos(), p.blocks()} 310 | } 311 | 312 | func (b *listBuilder) build(p *parser) Block { 313 | blocks := p.blocks() 314 | pos := p.pos() 315 | 316 | // list can have wrong pos b/c extend dance. 317 | // TODO explain 318 | pos.EndLine = blocks[len(blocks)-1].Pos().EndLine 319 | 320 | // Decide whether list is loose. 
321 | loose := false 322 | Loose: 323 | for i, c := range blocks { 324 | c := c.(*Item) 325 | if i+1 < len(blocks) { 326 | if blocks[i+1].Pos().StartLine-c.EndLine > 1 { 327 | loose = true 328 | break Loose 329 | } 330 | } 331 | for j, d := range c.Blocks { 332 | endLine := d.Pos().EndLine 333 | if j+1 < len(c.Blocks) { 334 | if c.Blocks[j+1].Pos().StartLine-endLine > 1 { 335 | loose = true 336 | break Loose 337 | } 338 | } 339 | } 340 | } 341 | 342 | if !loose { 343 | // TODO: rethink whether this is correct. 344 | // Perhaps the blocks should still be Paragraph 345 | // and we just skip over the

    during formatting? 346 | // Then Text might not need to be a Block. 347 | for _, c := range blocks { 348 | c := c.(*Item) 349 | for i, d := range c.Blocks { 350 | if p, ok := d.(*Paragraph); ok { 351 | c.Blocks[i] = p.Text 352 | } 353 | } 354 | } 355 | } 356 | 357 | x := &List{ 358 | pos, 359 | b.bullet, 360 | b.start, 361 | loose, 362 | p.blocks(), 363 | } 364 | listCorner(p, x) 365 | if p.TaskList { 366 | p.addFixup(func() { 367 | parseTaskList(p, x) 368 | }) 369 | } 370 | return x 371 | } 372 | 373 | // listCorner checks whether list contains any corner cases 374 | // that other implementations mishandle, and if so sets p.corner. 375 | func listCorner(p *parser, list *List) { 376 | for _, item := range list.Items { 377 | item := item.(*Item) 378 | if len(item.Blocks) == 0 { 379 | // Goldmark mishandles what follows; see testdata/extra.txt 111.md. 380 | p.corner = true 381 | return 382 | } 383 | switch item.Blocks[0].(type) { 384 | case *List, *ThematicBreak, *CodeBlock: 385 | // Goldmark mishandles a list with various block items inside it. 386 | p.corner = true 387 | return 388 | } 389 | } 390 | } 391 | 392 | // GitHub task list extension 393 | 394 | // A Task is an [Inline] for a [task list item marker] (a checkbox), 395 | // a GitHub-flavored Markdown extension. 396 | // 397 | // [task list item marker]: https://github.github.com/gfm/#task-list-items-extension- 398 | type Task struct { 399 | Checked bool 400 | } 401 | 402 | func (*Task) Inline() {} 403 | 404 | func (x *Task) printHTML(p *printer) { 405 | p.html(" `) 410 | } 411 | 412 | func (x *Task) printMarkdown(p *printer) { 413 | if x.Checked { 414 | p.text(`[x] `) 415 | } else { 416 | p.text(`[ ] `) 417 | } 418 | } 419 | 420 | func (x *Task) printText(p *printer) { 421 | // Unreachable: printText is only used to render the 422 | // alt text of an image, which can only contain inlines, 423 | // and while Task is an inline, it only appears inside 424 | // lists, and a list cannot appear in an alt text. 
425 | // Even so, maybe someone will make malformed syntax trees. 426 | x.printMarkdown(p) 427 | } 428 | 429 | // parseTaskList checks whether any items in list begin with task list markers. 430 | // If so, it replaces the markers with [Task]s. 431 | func parseTaskList(p *parser, list *List) { 432 | for _, item := range list.Items { 433 | item := item.(*Item) 434 | if len(item.Blocks) == 0 { 435 | continue 436 | } 437 | var text *Text 438 | switch b := item.Blocks[0].(type) { 439 | default: 440 | continue 441 | case *Paragraph: 442 | text = b.Text 443 | case *Text: 444 | text = b 445 | } 446 | if len(text.Inline) < 1 { 447 | // unreachable with standard parser 448 | continue 449 | } 450 | pl, ok := text.Inline[0].(*Plain) 451 | if !ok { 452 | continue 453 | } 454 | s := pl.Text 455 | if len(s) < 4 || s[0] != '[' || s[2] != ']' || (s[1] != ' ' && s[1] != 'x' && s[1] != 'X') { 456 | continue 457 | } 458 | if s[3] != ' ' && s[3] != '\t' { 459 | p.corner = true // goldmark does not require the space 460 | continue 461 | } 462 | text.Inline = append([]Inline{&Task{Checked: s[1] == 'x' || s[1] == 'X'}, 463 | &Plain{Text: s[len("[x] "):]}}, text.Inline[1:]...) 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /testdata/autoext.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true} 3 | -- gfm622.md -- 4 | www.commonmark.org 5 | -- gfm622.html -- 6 |

    www.commonmark.org

    7 | -- gfm623.md -- 8 | Visit www.commonmark.org/help for more information. 9 | -- gfm623.html -- 10 |

    Visit www.commonmark.org/help for more information.

    11 | -- gfm624.md -- 12 | Visit www.commonmark.org. 13 | 14 | Visit www.commonmark.org/a.b. 15 | -- gfm624.html -- 16 |

    Visit www.commonmark.org.

    17 |

    Visit www.commonmark.org/a.b.

    18 | -- gfm625.md -- 19 | www.google.com/search?q=Markup+(business) 20 | 21 | www.google.com/search?q=Markup+(business))) 22 | 23 | (www.google.com/search?q=Markup+(business)) 24 | 25 | (www.google.com/search?q=Markup+(business) 26 | -- gfm625.html -- 27 |

    www.google.com/search?q=Markup+(business)

    28 |

    www.google.com/search?q=Markup+(business)))

    29 |

    (www.google.com/search?q=Markup+(business))

    30 |

    (www.google.com/search?q=Markup+(business)

    31 | -- gfm626.md -- 32 | www.google.com/search?q=(business))+ok 33 | -- gfm626.html -- 34 |

    www.google.com/search?q=(business))+ok

    35 | -- gfm627.md -- 36 | www.google.com/search?q=commonmark&hl=en 37 | 38 | www.google.com/search?q=commonmark&hl; 39 | -- gfm627.html -- 40 |

    www.google.com/search?q=commonmark&hl=en

    41 |

    www.google.com/search?q=commonmark&hl;

    42 | -- gfm628.md -- 43 | www.commonmark.org/he<lp 44 | -- gfm628.html -- 45 | <p><a href="http://www.commonmark.org/he">www.commonmark.org/he</a>&lt;lp</p>

    46 | -- gfm629.md -- 47 | http://commonmark.org 48 | 49 | (Visit http://encrypted.google.com/search?q=Markup+(business)) 50 | -- gfm629.html -- 51 |

    http://commonmark.org

    52 |

    (Visit http://encrypted.google.com/search?q=Markup+(business))

    53 | -- gfm630.md -- 54 | foo@bar.baz 55 | -- gfm630.html -- 56 |

    foo@bar.baz

    57 | -- gfm631.md -- 58 | hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is. 59 | -- gfm631.html -- 60 |

    hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.

    61 | -- gfm632.md -- 62 | a.b-c_d@a.b 63 | 64 | a.b-c_d@a.b. 65 | 66 | a.b-c_d@a.b- 67 | 68 | a.b-c_d@a.b_ 69 | -- gfm632.html -- 70 |

    a.b-c_d@a.b

    71 |

    a.b-c_d@a.b.

    72 |

    a.b-c_d@a.b-

    73 |

    a.b-c_d@a.b_

    74 | -- gfm633.md -- 75 | mailto:foo@bar.baz 76 | 77 | mailto:a.b-c_d@a.b 78 | 79 | mailto:a.b-c_d@a.b. 80 | 81 | mailto:a.b-c_d@a.b/ 82 | 83 | mailto:a.b-c_d@a.b- 84 | 85 | mailto:a.b-c_d@a.b_ 86 | 87 | xmpp:foo@bar.baz 88 | 89 | xmpp:foo@bar.baz. 90 | -- gfm633.html -- 91 |

    mailto:foo@bar.baz

    92 |

    mailto:a.b-c_d@a.b

    93 |

    mailto:a.b-c_d@a.b.

    94 |

    mailto:a.b-c_d@a.b/

    95 |

    mailto:a.b-c_d@a.b-

    96 |

    mailto:a.b-c_d@a.b_

    97 |

    xmpp:foo@bar.baz

    98 |

    xmpp:foo@bar.baz.

    99 | -- gfm634.md -- 100 | xmpp:foo@bar.baz/txt 101 | 102 | xmpp:foo@bar.baz/txt@bin 103 | 104 | xmpp:foo@bar.baz/txt@bin.com 105 | -- gfm634.html -- 106 |

    xmpp:foo@bar.baz/txt

    107 |

    xmpp:foo@bar.baz/txt@bin

    108 |

    xmpp:foo@bar.baz/txt@bin.com

    109 | -- gfm635.md -- 110 | xmpp:foo@bar.baz/txt/bin 111 | -- gfm635.html -- 112 |

    xmpp:foo@bar.baz/txt/bin

    113 | -- 1.md -- 114 | xhttp://go.dev y z 115 | αhttp://go.dev y z 116 | -- 1.html -- 117 |

    xhttp://go.dev y z 118 | αhttp://go.dev y z

    119 | -- 1a.md -- 120 | xhttps://go.dev y z 121 | αhttps://go.dev y z 122 | -- 1a.html -- 123 |

    xhttps://go.dev y z 124 | αhttps://go.dev y z

    125 | -- 2.md -- 126 | cannot follow ascii letter 127 | xhttp://go.dev y z 128 | x0http://go.dev 129 | αhttp://go.dev 130 | -- 2.html -- 131 |

    cannot follow ascii letter 132 | xhttp://go.dev y z 133 | x0http://go.dev 134 | αhttp://go.dev

    135 | -- 3.md -- 136 | deviations - github would include the suffixes in the URLs 137 | www.go.dev@def.ghi is my email 138 | www.go.dev!wtf 139 | -- 3.html -- 140 |

    deviations - github would include the suffixes in the URLs 141 | www.go.dev@def.ghi is my email 142 | www.go.dev!wtf

    143 | -- 4.md -- 144 | trimming 145 | www.google.com/search?q=Markup+(business))) 146 | -- 4.html -- 147 |

    trimming 148 | www.google.com/search?q=Markup+(business)))

    149 | -- 5.md -- 150 | www.google.com/search?q=Markup+(business))). 151 | -- 5.html -- 152 |

    www.google.com/search?q=Markup+(business))).

    153 | -- 6.md -- 154 | www.google.com/search?q=Markup+(business). 155 | -- 6.html -- 156 |

    www.google.com/search?q=Markup+(business).

    157 | -- 7.md -- 158 | www.google.com/search?q=Markup+)()((business) 159 | -- 7.html -- 160 |

    www.google.com/search?q=Markup+)()((business)

    161 | -- 8.md -- 162 | www.google.com/search?q=commonmark&hl; 163 | -- 8.html -- 164 |

    www.google.com/search?q=commonmark&hl;

    165 | -- 9.md -- 166 | www.google.com/search?q=commonmark&hl;) 167 | -- 9.html -- 168 |

    www.google.com/search?q=commonmark&hl;)

    169 | -- 10.md -- 170 | www.google.com/search?q=(commonmark&hl;) 171 | -- 10.html -- 172 |

    www.google.com/search?q=(commonmark&hl;)

    173 | -- 11.md -- 174 | www.google.com/search?q=commonmark)&hl; 175 | -- 11.html -- 176 |

    www.google.com/search?q=commonmark)&hl;

    177 | -- 12.md -- 178 | www.google.com/search?q=commonmark).&hl; 179 | -- 12.html -- 180 |

    www.google.com/search?q=commonmark).&hl;

    181 | -- 13.md -- 182 | www.google.com/search?q=commonmark).&hl 183 | -- 13.html -- 184 |

    www.google.com/search?q=commonmark).&hl

    185 | -- 14.md -- 186 | www.google.com/search?q=commonmark).&hl 187 | -- 14.html -- 188 |

    www.google.com/search?q=commonmark).&hl

    189 | -- 15.md -- 190 | www.goo-gle.com/search 191 | -- 15.html -- 192 |

    www.goo-gle.com/search

    193 | -- 16.md -- 194 | www.goo_gle.com/search 195 | -- 16.html -- 196 |

    www.goo_gle.com/search

    197 | -- 17.md -- 198 | www.foo_bar.google.com/search 199 | -- 17.html -- 200 |

    www.foo_bar.google.com/search

    201 | -- 18.md -- 202 | www./search 203 | -- 18.html -- 204 |

    www./search

    205 | -- 19.md -- 206 | www.google.com.foo_bar/search 207 | -- 19.html -- 208 |

    www.google.com.foo_bar/search

    209 | -- 20.md -- 210 | www.search 211 | -- 20.html -- 212 |

    www.search

    213 | -- 21.md -- 214 | www. 215 | -- 21.html -- 216 |

    www.

    217 | -- 21a.md -- 218 | www.!search 219 | -- 21a.html -- 220 |

    www.!search

    221 | -- 22.md -- 222 | www.sea_rch 223 | -- 22.html -- 224 |

    www.sea_rch

    225 | -- 23.md -- 226 | http://!search 227 | -- 23.html -- 228 |

    http://!search

    229 | -- 24.md -- 230 | http://!search 231 | -- 24.html -- 232 |

    http://!search

    233 | -- 25.md -- 234 | http://search 235 | -- 25.html -- 236 |

    http://search

    237 | -- 26.md -- 238 | https://search 239 | -- 26.html -- 240 |

    https://search

    241 | -- 27.md -- 242 | http://sea_rch 243 | -- 27.html -- 244 |

    http://sea_rch

    245 | -- 28.md -- 246 | https://sea_rch 247 | -- 28.html -- 248 |

    https://sea_rch

    249 | -- 29.md -- 250 | http://sea_rch.x 251 | -- 29.html -- 252 |

    http://sea_rch.x

    253 | -- 30.md -- 254 | https://sea_rch.x 255 | -- 30.html -- 256 |

    https://sea_rch.x

    257 | -- 31.md -- 258 | http://sea_rch.x.y 259 | -- 31.html -- 260 |

    http://sea_rch.x.y

    261 | -- 32.md -- 262 | http://sea_rch.x.y.http://www.google.com 263 | -- 32.html -- 264 |

    http://sea_rch.x.y.http://www.google.com

    265 | -- 33.md -- 266 | http://sea_rch.http://www.google.com 267 | -- 33.html -- 268 |

    http://sea_rch.http://www.google.com

    269 | -- 34.md -- 270 | _abc_@ghi.def is my email 271 | -- 34.html -- 272 |

    abc@ghi.def is my email

    273 | -- 35.md -- 274 | _abc@ghi_.def is my email 275 | -- 35.html -- 276 |

    abc@ghi.def is my email

    277 | -- 36.md -- 278 | `hello`abc@def.ghi is my email 279 | -- 36.html -- 280 |

    helloabc@def.ghi is my email

    281 | -- 37.md -- 282 | `hello` abc@def.ghi is my email 283 | -- 37.html -- 284 |

    hello abc@def.ghi is my email

    285 | -- 38.md -- 286 | *hello*abc@def.ghi is my email 287 | -- 38.html -- 288 |

    helloabc@def.ghi is my email

    289 | -- 39.md -- 290 | [link](link)abc@def.ghi is my email 291 | -- 39.html -- 292 |

    linkabc@def.ghi is my email

    293 | -- 40.md -- 294 | \!abc@def.ghi is my email 295 | -- 40.html -- 296 |

    !abc@def.ghi is my email

    297 | -- 41.md -- 298 | $abc@def.ghi is my email 299 | -- 41.html -- 300 |

    $abc@def.ghi is my email

    301 | -- 42.md -- 302 | www.go.dev@def.ghi is my email 303 | -- 42.html -- 304 |

    www.go.dev@def.ghi is my email

    305 | -- 43.md -- 306 | abc@www.go.dev is my email 307 | -- 43.html -- 308 |

    abc@www.go.dev is my email

    309 | -- 44.md -- 310 | αabc@def.ghi 311 | -- 44.html -- 312 |

    αabc@def.ghi

    313 | -- 45.md -- 314 | https://web.site:8080/~matloob 315 | -- 45.html -- 316 |

    https://web.site:8080/~matloob

    317 | -- parser.json -- 318 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true, "Strikethrough": true} 319 | -- 46.md -- 320 | https://web.site:8080/~matloob 321 | -- 46.html -- 322 |

    https://web.site:8080/~matloob

    323 | -- parser.json -- 324 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true} 325 | -- 47.md -- 326 | https://web.site:8080/*matlo_ob 327 | -- 47.html -- 328 |

    https://web.site:8080/*matlo_ob

    329 | -- parser.json -- 330 | {"AutoLinkText": true, "Strikethrough": true} 331 | -- 48.md -- 332 | *user@dom.org* 333 | -- 48.html -- 334 |

    user@dom.org

    335 | -- 49.md -- 336 | **user@dom.org** 337 | -- 49.html -- 338 |

    user@dom.org

    339 | -- 50.md -- 340 | ~~user@dom.org~~ 341 | -- 50.html -- 342 |

    user@dom.org

    343 | -- 51.md -- 344 | www.google.com/search?q=cmark&-hl; 345 | -- 51.html -- 346 |

    www.google.com/search?q=cmark&-hl;

    347 | -- 52.md -- 348 | foo@.bar 349 | -- 52.html -- 350 |

    foo@.bar

    351 | -- 53.md -- 352 | foo@..bar 353 | -- 53.html -- 354 |

    foo@..bar

    355 | -- 54.md -- 356 | mailto:none 357 | mailto:none# 358 | -- 54.html -- 359 |

    mailto:none 360 | mailto:none#

    361 | -- 55.md -- 362 | xmpp:none 363 | xmpp:none# 364 | xmpp:foo@..bar 365 | -- 55.html -- 366 |

    xmpp:none 367 | xmpp:none# 368 | xmpp:foo@..bar

    369 | -------------------------------------------------------------------------------- /md_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "bytes" 9 | "encoding/json" 10 | "flag" 11 | "fmt" 12 | "go/token" 13 | "io" 14 | "net/url" 15 | "os" 16 | "path/filepath" 17 | "reflect" 18 | "strings" 19 | "testing" 20 | 21 | "github.com/yuin/goldmark" 22 | gext "github.com/yuin/goldmark/extension" 23 | gparser "github.com/yuin/goldmark/parser" 24 | ghtml "github.com/yuin/goldmark/renderer/html" 25 | "golang.org/x/tools/txtar" 26 | ) 27 | 28 | var goldmarkFlag = flag.Bool("goldmark", false, "run goldmark tests") 29 | 30 | var roundTripFailures = map[string]bool{ 31 | "TestToHTML/extra/13": true, // indentation of tag 32 | "TestToHTML/extra/75": true, // weird list 33 | "TestToHTML/extra/76": true, // weird list 34 | "TestToHTML/extra/115": true, // weird list 35 | 36 | "TestToHTML/gfm_ext/9": true, // table 37 | "TestToHTML/gfm_ext/11": true, // table 38 | 39 | "TestToHTML/spec0.29/19": true, // thematic break 40 | "TestToHTML/spec0.29/40": true, // indentation of heading 41 | "TestToHTML/spec0.29/51": true, // newline in heading 42 | "TestToHTML/spec0.29/52": true, // newline in heading 43 | "TestToHTML/spec0.29/57": true, // setext heading 44 | "TestToHTML/spec0.29/63": true, // setext heading 45 | "TestToHTML/spec0.29/65": true, // newline in heading 46 | "TestToHTML/spec0.29/171": true, // link ref def 47 | "TestToHTML/spec0.29/208": true, // weird list 48 | "TestToHTML/spec0.29/227": true, // weird list 49 | "TestToHTML/spec0.29/241": true, // weird list 50 | "TestToHTML/spec0.29/282": true, // weird list 51 | "TestToHTML/spec0.29/283": true, // weird list 52 | "TestToHTML/spec0.29/312": true, // escape plain 53 
| "TestToHTML/spec0.29/323": true, // escape plain 54 | "TestToHTML/spec0.29/324": true, // escape plain 55 | "TestToHTML/spec0.29/325": true, // escape plain 56 | "TestToHTML/spec0.29/326": true, // escape plain 57 | "TestToHTML/spec0.29/327": true, // escape plain 58 | "TestToHTML/spec0.29/331": true, // backtick spaces 59 | "TestToHTML/spec0.29/349": true, // backticks 60 | "TestToHTML/spec0.29/502": true, // escape quotes 61 | 62 | "TestToHTML/spec0.30/26": true, // escape plain 63 | "TestToHTML/spec0.30/37": true, // escape plain 64 | "TestToHTML/spec0.30/38": true, // escape plain 65 | "TestToHTML/spec0.30/39": true, // escape plain 66 | "TestToHTML/spec0.30/40": true, // escape plain 67 | "TestToHTML/spec0.30/41": true, // escape plain 68 | "TestToHTML/spec0.30/49": true, // thematic break 69 | "TestToHTML/spec0.30/70": true, // indentation of heading 70 | "TestToHTML/spec0.30/81": true, // newline in heading 71 | "TestToHTML/spec0.30/82": true, // newline in heading 72 | "TestToHTML/spec0.30/87": true, // setext heading 73 | "TestToHTML/spec0.30/93": true, // setext heading 74 | "TestToHTML/spec0.30/95": true, // newline in heading 75 | "TestToHTML/spec0.30/202": true, // link ref def 76 | "TestToHTML/spec0.30/238": true, // weird list 77 | "TestToHTML/spec0.30/257": true, // weird list 78 | "TestToHTML/spec0.30/271": true, // weird list 79 | "TestToHTML/spec0.30/312": true, // weird list 80 | "TestToHTML/spec0.30/313": true, // weird list 81 | "TestToHTML/spec0.30/331": true, // backtick spaces 82 | "TestToHTML/spec0.30/349": true, // backticks 83 | "TestToHTML/spec0.30/505": true, // escape quotes 84 | 85 | "TestToHTML/spec0.31.2/26": true, // escape plain 86 | "TestToHTML/spec0.31.2/37": true, // escape plain 87 | "TestToHTML/spec0.31.2/38": true, // escape plain 88 | "TestToHTML/spec0.31.2/39": true, // escape plain 89 | "TestToHTML/spec0.31.2/40": true, // escape plain 90 | "TestToHTML/spec0.31.2/41": true, // escape plain 91 | 
"TestToHTML/spec0.31.2/49": true, // thematic break 92 | "TestToHTML/spec0.31.2/70": true, // indentation of heading 93 | "TestToHTML/spec0.31.2/81": true, // newline in heading 94 | "TestToHTML/spec0.31.2/82": true, // newline in heading 95 | "TestToHTML/spec0.31.2/87": true, // setext heading 96 | "TestToHTML/spec0.31.2/93": true, // setext heading 97 | "TestToHTML/spec0.31.2/95": true, // newline in heading 98 | "TestToHTML/spec0.31.2/202": true, // link ref def 99 | "TestToHTML/spec0.31.2/238": true, // weird list 100 | "TestToHTML/spec0.31.2/257": true, // weird list 101 | "TestToHTML/spec0.31.2/271": true, // weird list 102 | "TestToHTML/spec0.31.2/312": true, // weird list 103 | "TestToHTML/spec0.31.2/313": true, // weird list 104 | "TestToHTML/spec0.31.2/331": true, // backtick spaces 105 | "TestToHTML/spec0.31.2/349": true, // backticks 106 | "TestToHTML/spec0.31.2/506": true, // escape quotes 107 | 108 | "TestToHTML/table/gfm200": true, // table 109 | "TestToHTML/table/2": true, // table 110 | } 111 | 112 | func TestToHTML(t *testing.T) { 113 | files, err := filepath.Glob("testdata/*.txt") 114 | if err != nil { 115 | t.Fatal(err) 116 | } 117 | for _, file := range files { 118 | if strings.HasSuffix(file, "_fmt.txt") { 119 | continue 120 | } 121 | t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) { 122 | a, err := txtar.ParseFile(file) 123 | if err != nil { 124 | t.Fatal(err) 125 | } 126 | 127 | var p Parser 128 | var ncase, npass int 129 | for i := 0; i+2 <= len(a.Files); { 130 | if a.Files[i].Name == "parser.json" { 131 | p = parseParser(t, a.Files[i].Data) 132 | i++ 133 | continue 134 | } 135 | ncase++ 136 | md := a.Files[i] 137 | html := a.Files[i+1] 138 | i += 2 139 | name := strings.TrimSuffix(md.Name, ".md") 140 | if name != strings.TrimSuffix(html.Name, ".html") { 141 | t.Fatalf("mismatched file pair: %s and %s", md.Name, html.Name) 142 | } 143 | 144 | t.Run(name, func(t *testing.T) { 145 | doc := 
p.Parse(decode(string(md.Data))) 146 | h := encode(ToHTML(doc)) 147 | if h != string(html.Data) { 148 | q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20") 149 | t.Fatalf("input %q\nparse:\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, dump(doc), h, html.Data, q, q) 150 | } 151 | 152 | // Make sure unexported types like emphPlain don't leak into result. 153 | if x, ok := findUnexported(reflect.ValueOf(doc)); ok { 154 | t.Fatalf("input %q\nparse:\n%s\nfound parsed value of unexported type %s", md.Data, dump(doc), x.Type()) 155 | } 156 | 157 | // Make sure Format preserves the HTML. 158 | md1 := Format(doc) 159 | doc1 := p.Parse(md1) 160 | h1 := encode(ToHTML(doc1)) 161 | if h1 != string(html.Data) && !roundTripFailures[t.Name()] { 162 | q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20") 163 | t.Fatalf("input %q\nreformat %q\n%s\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, md1, dump(doc), dump(doc1), h1, html.Data, q, q) 164 | } 165 | if h1 == string(html.Data) && roundTripFailures[t.Name()] { 166 | t.Fatalf("no longer failing") 167 | } 168 | 169 | npass++ 170 | }) 171 | 172 | if !*goldmarkFlag { 173 | continue 174 | } 175 | t.Run("goldmark/"+name, func(t *testing.T) { 176 | in := decode(string(md.Data)) 177 | _, corner := p.parse(in) 178 | if corner { 179 | t.Skip("known corner case") 180 | } 181 | gm := goldmarkParser(&p) 182 | var buf bytes.Buffer 183 | if err := gm.Convert([]byte(in), &buf); err != nil { 184 | t.Fatal(err) 185 | } 186 | if buf.Len() > 0 && buf.Bytes()[buf.Len()-1] != '\n' { 187 | buf.WriteByte('\n') 188 | } 189 | want := decode(string(html.Data)) 190 | want = strings.ReplaceAll(want, " />", ">") 191 | out := buf.String() 192 | out = strings.ReplaceAll(out, " />", ">") 193 | q := 
strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20") 194 | if out != want { 195 | t.Fatalf("\n - input: ``%q``\n - output: ``%q``\n - golden: ``%q``\n - [dingus](https://spec.commonmark.org/dingus/?text=%s)\n - [github](https://github.com/rsc/tmp/issues/new?body=%s)", in, out, want, q, q) 196 | } 197 | npass++ 198 | 199 | }) 200 | } 201 | t.Logf("%d/%d pass", npass, ncase) 202 | }) 203 | } 204 | } 205 | 206 | func goldmarkParser(p *Parser) goldmark.Markdown { 207 | opts := []goldmark.Option{ 208 | goldmark.WithRendererOptions(ghtml.WithUnsafe()), 209 | } 210 | if p.HeadingID { 211 | opts = append(opts, goldmark.WithParserOptions(gparser.WithHeadingAttribute())) 212 | } 213 | if p.Strikethrough { 214 | opts = append(opts, goldmark.WithExtensions(gext.Strikethrough)) 215 | } 216 | if p.TaskList { 217 | opts = append(opts, goldmark.WithExtensions(gext.TaskList)) 218 | } 219 | if p.AutoLinkText { 220 | opts = append(opts, goldmark.WithExtensions(gext.Linkify)) 221 | } 222 | if p.Table { 223 | opts = append(opts, goldmark.WithExtensions(gext.Table)) 224 | } 225 | return goldmark.New(opts...) 
226 | } 227 | 228 | func decode(s string) string { 229 | s = strings.ReplaceAll(s, "^J\n", "\n") 230 | s = strings.ReplaceAll(s, "^M", "\r") 231 | s = strings.ReplaceAll(s, "^D\n", "") 232 | s = strings.ReplaceAll(s, "^@", "\x00") 233 | return s 234 | } 235 | 236 | func encode(s string) string { 237 | s = strings.ReplaceAll(s, "\r\n", "^M\n") 238 | s = strings.ReplaceAll(s, "\r", "^M^D\n") 239 | s = strings.ReplaceAll(s, " \n", " ^J\n") 240 | s = strings.ReplaceAll(s, "\t\n", "\t^J\n") 241 | s = strings.ReplaceAll(s, "\x00", "^@") 242 | if s != "" && !strings.HasSuffix(s, "\n") { 243 | s += "^D\n" 244 | } 245 | return s 246 | } 247 | 248 | func parseParser(t *testing.T, data []byte) Parser { 249 | d := json.NewDecoder(bytes.NewReader(data)) 250 | d.DisallowUnknownFields() 251 | var p Parser 252 | err := d.Decode(&p) 253 | if err != nil { 254 | t.Fatalf("reading parser.json: %v", err) 255 | } 256 | err = d.Decode(new(json.RawMessage)) 257 | if err != io.EOF { 258 | t.Fatalf("junk on end of parser.json") 259 | } 260 | return p 261 | } 262 | 263 | func TestFormat(t *testing.T) { 264 | files, err := filepath.Glob(filepath.Join("testdata", "*_fmt.txt")) 265 | if err != nil { 266 | t.Fatal(err) 267 | } 268 | for _, file := range files { 269 | t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) { 270 | a, err := txtar.ParseFile(file) 271 | if err != nil { 272 | t.Fatal(err) 273 | } 274 | var p Parser 275 | for i := 0; i < len(a.Files); { 276 | if a.Files[i].Name == "parser.json" { 277 | p = parseParser(t, a.Files[i].Data) 278 | i++ 279 | continue 280 | } 281 | // Each test case is a single markdown document that should render either as itself, 282 | // or if followed by a file named "want", then by that file. 
283 | name := a.Files[i].Name 284 | in := a.Files[i].Data 285 | wantb := in 286 | i++ 287 | if i < len(a.Files) && a.Files[i].Name == "want" { 288 | wantb = a.Files[i].Data 289 | i++ 290 | } 291 | t.Run(name, func(t *testing.T) { 292 | doc := p.Parse(decode(string(in))) 293 | want := decode(string(wantb)) 294 | docWant := p.Parse(want) 295 | if ToHTML(doc) != ToHTML(docWant) { 296 | t.Errorf("bad testdata: input and want are different markdown documents:\ninput:\n%s\n\nwant:\n%s", dump(doc), dump(docWant)) 297 | } 298 | h := Format(doc) 299 | h = encode(h) 300 | if h != want { 301 | t.Errorf("input %q\nparse: \n%s\nhave %q\nwant %q", in, dump(doc), h, want) 302 | } 303 | }) 304 | } 305 | }) 306 | } 307 | 308 | // Files ending in ".md" should render as themselves. 309 | files, err = filepath.Glob(filepath.Join("testdata", "*.md")) 310 | if err != nil { 311 | t.Fatal(err) 312 | } 313 | for _, file := range files { 314 | t.Run(strings.TrimSuffix(filepath.Base(file), ".md"), func(t *testing.T) { 315 | data, err := os.ReadFile(file) 316 | if err != nil { 317 | t.Fatal(err) 318 | } 319 | w := string(data) 320 | var p Parser 321 | doc := p.Parse(w) 322 | h := Format(doc) 323 | if h != w { 324 | t.Errorf("have:\n%s\nwant:\n%s", h, w) 325 | outfile := file + ".have" 326 | t.Logf("writing have to %s", outfile) 327 | if err := os.WriteFile(outfile, []byte(h), 0666); err != nil { 328 | t.Fatal(err) 329 | } 330 | } 331 | }) 332 | } 333 | } 334 | 335 | func TestInline(t *testing.T) { 336 | // Test that these don't crash, 337 | // and also "cover" the bodies. 
338 | new(HardBreak).Inline() 339 | new(SoftBreak).Inline() 340 | new(HTMLTag).Inline() 341 | new(Plain).Inline() 342 | new(Code).Inline() 343 | new(Strong).Inline() 344 | new(Del).Inline() 345 | new(Emph).Inline() 346 | new(Emoji).Inline() 347 | new(AutoLink).Inline() 348 | new(Link).Inline() 349 | new(Image).Inline() 350 | new(Task).Inline() 351 | } 352 | 353 | func findUnexported(v reflect.Value) (reflect.Value, bool) { 354 | if t := v.Type(); t.PkgPath() != "" && !token.IsExported(t.Name()) { 355 | return v, true 356 | } 357 | switch v.Kind() { 358 | case reflect.Interface, reflect.Pointer: 359 | if !v.IsNil() { 360 | if u, ok := findUnexported(v.Elem()); ok { 361 | return u, true 362 | } 363 | } 364 | case reflect.Struct: 365 | for i := 0; i < v.Type().NumField(); i++ { 366 | if !v.Type().Field(i).IsExported() { 367 | return v, true 368 | } 369 | if u, ok := findUnexported(v.Field(i)); ok { 370 | return u, true 371 | } 372 | } 373 | case reflect.Slice, reflect.Array: 374 | for i := 0; i < v.Len(); i++ { 375 | if u, ok := findUnexported(v.Index(i)); ok { 376 | return u, true 377 | } 378 | } 379 | } 380 | return v, false 381 | } 382 | 383 | var ( 384 | blockType = reflect.TypeOf(new(Block)).Elem() 385 | blocksType = reflect.TypeOf(new([]Block)).Elem() 386 | inlinesType = reflect.TypeOf(new(Inlines)).Elem() 387 | ) 388 | 389 | func printb(buf *bytes.Buffer, b Block, prefix string) { 390 | fmt.Fprintf(buf, "(%T", b) 391 | v := reflect.ValueOf(b) 392 | v = reflect.Indirect(v) 393 | if v.Kind() != reflect.Struct { 394 | fmt.Fprintf(buf, " %v", b) 395 | } 396 | t := v.Type() 397 | for i := 0; i < t.NumField(); i++ { 398 | tf := t.Field(i) 399 | if !tf.IsExported() { 400 | continue 401 | } 402 | if tf.Type == inlinesType { 403 | printis(buf, v.Field(i).Interface().(Inlines)) 404 | } else if tf.Type.Kind() == reflect.Slice && tf.Type.Elem().Kind() == reflect.String { 405 | fmt.Fprintf(buf, " %s:%q", tf.Name, v.Field(i)) 406 | } else if tf.Type != blocksType && 
!tf.Type.Implements(blockType) && tf.Type.Kind() != reflect.Slice { 407 | fmt.Fprintf(buf, " %s:%v", tf.Name, v.Field(i)) 408 | } 409 | } 410 | 411 | prefix += "\t" 412 | for i := 0; i < t.NumField(); i++ { 413 | tf := t.Field(i) 414 | if !tf.IsExported() { 415 | continue 416 | } 417 | if tf.Type.Implements(blockType) { 418 | fmt.Fprintf(buf, "\n%s", prefix) 419 | printb(buf, v.Field(i).Interface().(Block), prefix) 420 | } else if tf.Type == blocksType { 421 | vf := v.Field(i) 422 | for i := 0; i < vf.Len(); i++ { 423 | fmt.Fprintf(buf, "\n%s", prefix) 424 | printb(buf, vf.Index(i).Interface().(Block), prefix) 425 | } 426 | } else if tf.Type.Kind() == reflect.Slice && tf.Type != inlinesType && tf.Type.Elem().Kind() != reflect.String { 427 | fmt.Fprintf(buf, "\n%s%s:", prefix, t.Field(i).Name) 428 | printslice(buf, v.Field(i), prefix) 429 | } 430 | } 431 | fmt.Fprintf(buf, ")") 432 | } 433 | 434 | func printslice(buf *bytes.Buffer, v reflect.Value, prefix string) { 435 | if v.Type().Elem().Kind() == reflect.Slice { 436 | for i := 0; i < v.Len(); i++ { 437 | fmt.Fprintf(buf, "\n%s#%d:", prefix, i) 438 | printslice(buf, v.Index(i), prefix+"\t") 439 | } 440 | return 441 | } 442 | for i := 0; i < v.Len(); i++ { 443 | fmt.Fprintf(buf, " ") 444 | printb(buf, v.Index(i).Interface().(Block), prefix+"\t") 445 | } 446 | } 447 | 448 | func printi(buf *bytes.Buffer, in Inline) { 449 | fmt.Fprintf(buf, "%T(", in) 450 | v := reflect.ValueOf(in).Elem() 451 | label := v.FieldByName("Label") 452 | if label.IsValid() { 453 | fmt.Fprintf(buf, "%q", label) 454 | } 455 | text := v.FieldByName("Text") 456 | if text.IsValid() { 457 | fmt.Fprintf(buf, "%q", text) 458 | } 459 | inner := v.FieldByName("Inner") 460 | if inner.IsValid() { 461 | printis(buf, inner.Interface().(Inlines)) 462 | } 463 | buf.WriteString(")") 464 | } 465 | 466 | func printis(buf *bytes.Buffer, ins []Inline) { 467 | for _, in := range ins { 468 | buf.WriteByte(' ') 469 | printi(buf, in) 470 | } 471 | } 472 | 473 | 
func dump(b Block) string { 474 | var buf bytes.Buffer 475 | printb(&buf, b, "") 476 | return buf.String() 477 | } 478 | -------------------------------------------------------------------------------- /html.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strconv" 9 | "strings" 10 | "unicode" 11 | ) 12 | 13 | // An HTMLBlock is a [Block] representing an [HTML block]. 14 | // 15 | // [HTML block]: https://spec.commonmark.org/0.31.2/#html-blocks 16 | type HTMLBlock struct { 17 | Position 18 | // TODO should these be 'Text string'? 19 | Text []string // lines, without trailing newlines 20 | } 21 | 22 | func (*HTMLBlock) Block() {} 23 | 24 | func (b *HTMLBlock) printHTML(p *printer) { 25 | for _, s := range b.Text { 26 | p.html(s) 27 | p.html("\n") 28 | } 29 | } 30 | 31 | func (b *HTMLBlock) printMarkdown(p *printer) { 32 | p.maybeNL() 33 | for i, line := range b.Text { 34 | if i > 0 { 35 | p.nl() 36 | } 37 | p.WriteString(line) 38 | p.noTrim() 39 | } 40 | } 41 | 42 | // An htmlBuilder is a [blockBuilder] for an [HTMLBlock]. 43 | // If endBlank is true, the block ends immediately before the first blank line. 44 | // If endFunc is non-nil, the block ends immediately after the first line 45 | // for which endFunc returns true. 
46 | type htmlBuilder struct { 47 | endBlank bool 48 | endFunc func(string) bool 49 | text []string //accumulated text 50 | } 51 | 52 | func (c *htmlBuilder) extend(p *parser, s line) (line, bool) { 53 | if c.endBlank && s.isBlank() { 54 | return s, false 55 | } 56 | t := s.string() 57 | c.text = append(c.text, t) 58 | if c.endFunc != nil && c.endFunc(t) { 59 | return line{}, false 60 | } 61 | return line{}, true 62 | } 63 | 64 | func (c *htmlBuilder) build(p *parser) Block { 65 | return &HTMLBlock{ 66 | p.pos(), 67 | c.text, 68 | } 69 | } 70 | 71 | // An HTMLTag is an [Inline] representing a [raw HTML tag]. 72 | // 73 | // [raw HTML tag]: https://spec.commonmark.org/0.31.2/#raw-html 74 | type HTMLTag struct { 75 | Text string // TODO rename to HTML? 76 | } 77 | 78 | func (*HTMLTag) Inline() {} 79 | 80 | func (x *HTMLTag) printHTML(p *printer) { 81 | p.html(x.Text) 82 | } 83 | 84 | func (x *HTMLTag) printMarkdown(p *printer) { 85 | // TODO are there newlines? probably not 86 | for i, line := range strings.Split(x.Text, "\n") { 87 | if i > 0 { 88 | p.nl() 89 | } 90 | p.WriteString(line) 91 | p.noTrim() 92 | } 93 | } 94 | 95 | func (x *HTMLTag) printText(p *printer) {} 96 | 97 | // startHTMLBlock is a [starter] for an [HTMLBlock]. 98 | // 99 | // See https://spec.commonmark.org/0.31.2/#html-blocks. 100 | func startHTMLBlock(p *parser, s line) (line, bool) { 101 | // Early out: block must start with a <. 102 | tt := s 103 | tt.trimSpace(0, 3, false) // TODO figure out trimSpace final argument 104 | if tt.peek() != '<' { 105 | return s, false 106 | } 107 | t := tt.string() 108 | 109 | // Check all 7 block types. 
110 | if startHTMLBlock1(p, s, t) || 111 | startHTMLBlock2345(p, s, t) || 112 | startHTMLBlock6(p, s, t) || 113 | startHTMLBlock7(p, s, t) { 114 | return line{}, true 115 | } 116 | 117 | return s, false 118 | } 119 | 120 | const forceLower = 0x20 // ASCII letter | forceLower == ASCII lower-case 121 | 122 | // startHTMLBlock1 handles HTML block type 1: 123 | // line starting with or . 125 | // 126 | // s is the entire line, for saving if starting a block. 127 | // t is the line as a string, with leading spaces removed; it starts with <. 128 | func startHTMLBlock1(p *parser, s line, t string) bool { 129 | if len(t) < 2 { 130 | return false 131 | } 132 | if c := t[1] | forceLower; c != 'p' && c != 's' && c != 't' { // early out; check first letter 133 | return false 134 | } 135 | i := 2 136 | for i < len(t) && (t[i] != ' ' && t[i] != '\t' && t[i] != '>') { 137 | i++ 138 | } 139 | if !isBlock1Tag(t[1:i]) { 140 | return false 141 | } 142 | b := &htmlBuilder{endFunc: endBlock1} 143 | p.addBlock(b) 144 | b.text = append(b.text, s.string()) 145 | if endBlock1(t) { 146 | p.closeBlock() 147 | } 148 | return true 149 | } 150 | 151 | // endBlock1 reports whether the string contains 152 | //
    , , , or , 153 | // using ASCII case-insensitive matching. 154 | func endBlock1(s string) bool { 155 | start := -1 156 | for i := 0; i < len(s); i++ { 157 | if s[i] == '<' && i+1 < len(s) && s[i+1] == '/' { 158 | start = i + 2 159 | } 160 | if s[i] == '>' && start >= 0 { 161 | if isBlock1Tag(s[start:i]) { 162 | return true 163 | } 164 | start = -1 165 | } 166 | } 167 | return false 168 | } 169 | 170 | // isBlock1Tag reports whether tag is a tag that can open or close 171 | // HTML block type 1. 172 | func isBlock1Tag(tag string) bool { 173 | return lowerEq(tag, "pre") || lowerEq(tag, "script") || lowerEq(tag, "style") || lowerEq(tag, "textarea") 174 | } 175 | 176 | // lowerEq reports whether strings.ToLower(s) == lower 177 | // assuming lower is entirely ASCII lower-case letters. 178 | func lowerEq(s, lower string) bool { 179 | if len(s) != len(lower) { 180 | return false 181 | } 182 | lower = lower[:len(s)] 183 | for i := 0; i < len(s); i++ { 184 | if s[i]|forceLower != lower[i] { 185 | return false 186 | } 187 | } 188 | return true 189 | } 190 | 191 | // startHTMLBlock2345 handles HTML blocks types 2, 3, 4, and 5, 192 | // the ones that start and end a specific string constant. 193 | // 194 | // s is the entire line, for saving if starting a block. 195 | // t is the line as a string, with leading spaces removed; it starts with <. 196 | func startHTMLBlock2345(p *parser, s line, t string) bool { 197 | var end string 198 | switch { 199 | default: 200 | return false 201 | 202 | // type 2: , or or because of simplistic parsing. 203 | case strings.HasPrefix(t, "" 205 | 206 | // type 3: , or because of simplistic parsing. 207 | case strings.HasPrefix(t, "" 209 | 210 | // type 4: 211 | case strings.HasPrefix(t, "" 213 | 214 | // type 5: 215 | // The spec says nothing about requiring a leading upper-case letter, 216 | // only that it should be an ASCII letter, but cmark-gfm, Goldmark, 217 | // and the Dingus all require upper-case, so we do too. 
218 | // Presumably this is because the actual goal is to recognize the few 219 | // XML definitions that can appear, and they are all upper-case. 220 | // The result is that is an HTMLBlock but is an HTMLTag. 221 | // That's inconsistent, but Markdown is full of them, so we prioritize 222 | // consistency with all the existing implementations. 223 | case strings.HasPrefix(t, "= 3 && 'A' <= t[2] && t[2] <= 'Z': 224 | end = ">" 225 | } 226 | 227 | b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }} 228 | p.addBlock(b) 229 | b.text = append(b.text, s.string()) 230 | if b.endFunc(t) { 231 | // If terminator appears on the starting line, we're done. 232 | p.closeBlock() 233 | } 234 | return true 235 | } 236 | 237 | // startHTMLBlock6 handles HTML block type 6, 238 | // which starts with the start of a recognized tag 239 | // and ends at a blank line. 240 | // 241 | // s is the entire line, for saving if starting a block. 242 | // t is the line as a string, with leading spaces removed; it starts with <. 243 | func startHTMLBlock6(p *parser, s line, t string) bool { 244 | // Skip over < or 1 && t[1] == '/' { 247 | start = 2 248 | } 249 | 250 | // Scan ASCII alphanumeric tag name; 251 | // must be followed by space, tab, >, />, or end of line. 252 | end := start 253 | for end < len(t) && end < 16 && isLetterDigit(t[end]) { 254 | end++ 255 | } 256 | if end < len(t) { 257 | switch t[end] { 258 | default: 259 | return false 260 | case ' ', '\t', '>': 261 | // ok 262 | case '/': 263 | if end+1 >= len(t) || t[end+1] != '>' { 264 | return false 265 | } 266 | } 267 | } 268 | 269 | // Check whether tag is a recognized name. 270 | tag := t[start:end] 271 | if tag == "" { 272 | return false 273 | } 274 | c := tag[0] | forceLower 275 | for _, name := range htmlTags { 276 | if name[0] == c && len(name) == len(tag) && lowerEq(tag, name) { 277 | if end < len(t) && t[end] == '\t' { 278 | // Goldmark recognizes space but not tab. 
279 | // testdata/extra.txt 143.md 280 | p.corner = true 281 | } 282 | b := &htmlBuilder{endBlank: true} 283 | p.addBlock(b) 284 | b.text = append(b.text, s.string()) 285 | return true 286 | } 287 | } 288 | return false 289 | } 290 | 291 | // startHTMLBlock7 handles HTML block type 7, 292 | // which starts with a complete tag on a line by itself 293 | // and ends at a blank line. 294 | // 295 | // s is the entire line, for saving if starting a block. 296 | // t is the line as a string, with leading spaces removed; it starts with <. 297 | func startHTMLBlock7(p *parser, s line, t string) bool { 298 | // Type 7 blocks cannot interrupt a paragraph, 299 | // so that rewrapping a paragraph with inline tags 300 | // cannot change them into starting an HTML block. 301 | if p.para() != nil { 302 | return false 303 | } 304 | 305 | if _, end, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, end) == len(t) { 306 | if end != len(t) { 307 | // Goldmark disallows trailing space 308 | p.corner = true 309 | } 310 | b := &htmlBuilder{endBlank: true} 311 | p.addBlock(b) 312 | b.text = append(b.text, s.string()) 313 | return true 314 | } 315 | if _, end, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, end) == len(t) { 316 | b := &htmlBuilder{endBlank: true} 317 | p.addBlock(b) 318 | b.text = append(b.text, s.string()) 319 | return true 320 | } 321 | return false 322 | } 323 | 324 | // parseHTMLTag is an [inlineParser] for an [HTMLTag]. 325 | // The caller has has checked that s[start] is '<'. 
326 | func parseHTMLTag(p *parser, s string, start int) (x Inline, end int, ok bool) { 327 | // “An HTML tag consists of an open tag, a closing tag, an HTML comment, 328 | // a processing instruction, a declaration, or a CDATA section.” 329 | if len(s)-start < 3 || s[start] != '<' { 330 | return 331 | } 332 | switch s[start+1] { 333 | default: 334 | return parseHTMLOpenTag(p, s, start) 335 | case '/': 336 | return parseHTMLClosingTag(p, s, start) 337 | case '!': 338 | switch s[start+2] { 339 | case '-': 340 | return parseHTMLComment(p, s, start) 341 | case '[': 342 | return parseHTMLCDATA(p, s, start) 343 | default: 344 | return parseHTMLDecl(p, s, start) 345 | } 346 | case '?': 347 | return parseHTMLProcInst(p, s, start) 348 | } 349 | } 350 | 351 | // parseHTMLOpenTag is an [inlineParser] for an HTML open tag. 352 | // The caller has has checked that s[start] is '<'. 353 | func parseHTMLOpenTag(p *parser, s string, i int) (x Inline, end int, ok bool) { 354 | // “An open tag consists of a < character, a tag name, zero or more attributes, 355 | // optional spaces, tabs, and up to one line ending, an optional / character, and a > character.” 356 | 357 | // < character 358 | if i >= len(s) || s[i] != '<' { 359 | // unreachable unless called wrong 360 | return 361 | } 362 | 363 | // tag name 364 | name, j, ok1 := parseTagName(s, i+1) 365 | if !ok1 { 366 | return 367 | } 368 | switch name { 369 | case "pre", "script", "style", "textarea": 370 | // Goldmark treats these as starting a new HTMLBlock 371 | // and ending the paragraph they appear in. 
372 | p.corner = true 373 | } 374 | 375 | // zero or more attributes 376 | for { 377 | if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' { 378 | return 379 | } 380 | _, k, ok := parseAttr(p, s, skipSpace(s, j)) 381 | if !ok { 382 | break 383 | } 384 | j = k 385 | } 386 | 387 | // optional spaces, tabs, and up to one line ending 388 | k := skipSpace(s, j) 389 | if k != j { 390 | // Goldmark mishandles spaces before >. 391 | p.corner = true 392 | } 393 | j = k 394 | 395 | // an optional / character 396 | if j < len(s) && s[j] == '/' { 397 | j++ 398 | } 399 | 400 | // and a > character. 401 | if j >= len(s) || s[j] != '>' { 402 | return 403 | } 404 | 405 | return &HTMLTag{s[i : j+1]}, j + 1, true 406 | } 407 | 408 | // parseHTMLClosingTag is an [inlineParser] for an HTML closing tag. 409 | // The caller has has checked that s[start:] begins with ".” 413 | if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' { 414 | return 415 | } 416 | if skipSpace(s, i+2) != i+2 { 417 | // Goldmark allows spaces here but the spec and the Dingus do not. 418 | p.corner = true 419 | } 420 | 421 | if _, j, ok := parseTagName(s, i+2); ok { 422 | j = skipSpace(s, j) 423 | if j < len(s) && s[j] == '>' { 424 | return &HTMLTag{s[i : j+1]}, j + 1, true 425 | } 426 | } 427 | return 428 | } 429 | 430 | // parseTagName parses a leading tag name from s[start:], 431 | // returning the tag and the end location. 432 | func parseTagName(s string, start int) (tag string, end int, ok bool) { 433 | // “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).” 434 | if start >= len(s) || !isLetter(s[start]) { 435 | return 436 | } 437 | end = start + 1 438 | for end < len(s) && isLDH(s[end]) { 439 | end++ 440 | } 441 | return s[start:end], end, true 442 | } 443 | 444 | // parseAttr parses a leading attr (or attr=value) from s[start:], 445 | // returning the entire attribute (including the =value) and the end location. 
446 | func parseAttr(p *parser, s string, start int) (attr string, end int, ok bool) { 447 | // “An attribute consists of spaces, tabs, and up to one line ending, 448 | // an attribute name, and an optional attribute value specification.” 449 | _, end, ok = parseAttrName(s, start) 450 | if !ok { 451 | return 452 | } 453 | if endVal, ok := parseAttrValueSpec(p, s, end); ok { 454 | end = endVal 455 | } 456 | return s[start:end], end, true 457 | } 458 | 459 | // parseAttrName parses a leading attribute name from s[start:], 460 | // returning the name and the end location. 461 | func parseAttrName(s string, start int) (name string, end int, ok bool) { 462 | // “An attribute name consists of an ASCII letter, _, or :, 463 | // followed by zero or more ASCII letters, digits, _, ., :, or -.” 464 | if start+1 >= len(s) || (!isLetter(s[start]) && s[start] != '_' && s[start] != ':') { 465 | return 466 | } 467 | end = start + 1 468 | for end < len(s) && (isLDH(s[end]) || s[end] == '_' || s[end] == '.' || s[end] == ':') { 469 | end++ 470 | } 471 | return s[start:end], end, true 472 | } 473 | 474 | // parseAttrValueSpec parses a leading attribute value specification 475 | // from s[start:], returning the end location. 476 | func parseAttrValueSpec(p *parser, s string, start int) (end int, ok bool) { 477 | // “An attribute value specification consists of 478 | // optional spaces, tabs, and up to one line ending, 479 | // a = character, 480 | // optional spaces, tabs, and up to one line ending, 481 | // and an attribute value.” 482 | end = skipSpace(s, start) 483 | if end >= len(s) || s[end] != '=' { 484 | return 485 | } 486 | end = skipSpace(s, end+1) 487 | 488 | // “An attribute value consists of 489 | // an unquoted attribute value, 490 | // a single-quoted attribute value, 491 | // or a double-quoted attribute value.” 492 | // TODO: No escaping??? 
493 | if end < len(s) && (s[end] == '\'' || s[end] == '"') { 494 | // “A single-quoted attribute value consists of ', 495 | // zero or more characters not including ', and a final '.” 496 | // “A double-quoted attribute value consists of ", 497 | // zero or more characters not including ", and a final ".” 498 | i := strings.IndexByte(s[end+1:], s[end]) 499 | if i < 0 { 500 | return 501 | } 502 | return end + 1 + i + 1, true 503 | } 504 | 505 | // “An unquoted attribute value is a nonempty string of characters 506 | // not including spaces, tabs, line endings, ", ', =, <, >, or `.” 507 | isAttrVal := func(c byte) bool { 508 | return c != ' ' && c != '\t' && c != '\n' && 509 | c != '"' && c != '\'' && 510 | c != '=' && c != '<' && c != '>' && c != '`' 511 | } 512 | i := end 513 | for i < len(s) && isAttrVal(s[i]) { 514 | i++ 515 | } 516 | if i == end { 517 | return 518 | } 519 | return i, true 520 | } 521 | 522 | // parseHTMLComment is an [inlineParser] for an HTML comment. 523 | // The caller has has checked that s[start:] begins with ", 526 | // where text does not start with > or ->, 527 | // does not end with -, and does not contain --.” 528 | if strings.HasPrefix(s[start:], "") { 529 | end = start + len("") 530 | return &HTMLTag{s[start:end]}, end, true 531 | } 532 | if strings.HasPrefix(s[start:], "") { 533 | end = start + len("") 534 | return &HTMLTag{s[start:end]}, end, true 535 | } 536 | if x, end, ok := parseHTMLMarker(p, s, start, ""); ok { 537 | return x, end, ok 538 | } 539 | return 540 | } 541 | 542 | // parseHTMLCDATA is an [inlineParser] for an HTML CDATA section. 543 | // The caller has has checked that s[start:] begins with ", and the string ]]>.” 547 | return parseHTMLMarker(p, s, i, "") 548 | } 549 | 550 | // parseHTMLDecl is an [inlineParser] for an HTML declaration section. 
551 | // The caller has has checked that s[start:] begins with ", and the character >.” 555 | if i+2 < len(s) && isLetter(s[i+2]) { 556 | if 'a' <= s[i+2] && s[i+2] <= 'z' { 557 | p.corner = true // goldmark requires uppercase 558 | } 559 | return parseHTMLMarker(p, s, i, "") 560 | } 561 | return 562 | } 563 | 564 | // parseHTMLDecl is an [inlineParser] for an HTML processing instruction. 565 | // The caller has has checked that s[start:] begins with ", and the string ?>.” 569 | return parseHTMLMarker(p, s, i, "") 570 | } 571 | 572 | // parseHTMLMarker is a generalized parser for the 573 | // various prefix/suffix-denote HTML markers. 574 | // If s[start:] starts with prefix and is followed eventually by suffix, 575 | // then parseHTMLMarker returns an HTMLTag for that section of s 576 | // along with start, end, ok to implement the result of an [inlineParser]. 577 | func parseHTMLMarker(p *parser, s string, start int, prefix, suffix string) (x Inline, end int, ok bool) { 578 | if strings.HasPrefix(s[start:], prefix) { 579 | // To avoid quadratic behavior looking at on line 612 | case '?': 613 | p.noProcInstEnd = true // no ?> on line 614 | } 615 | } 616 | return 617 | } 618 | 619 | // parseHTMLEntity is an [inlineParser] for an HTML entity reference, 620 | // such as ", {, or ካ. 
621 | func parseHTMLEntity(_ *parser, s string, start int) (x Inline, end int, ok bool) { 622 | i := start 623 | if i+1 < len(s) && s[i+1] == '#' { 624 | i += 2 625 | var r int 626 | if i < len(s) && (s[i] == 'x' || s[i] == 'X') { 627 | // hex 628 | i++ 629 | j := i 630 | for j < len(s) && isHexDigit(s[j]) { 631 | j++ 632 | } 633 | if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' { 634 | return 635 | } 636 | r64, _ := strconv.ParseInt(s[i:j], 16, 0) 637 | r = int(r64) 638 | end = j + 1 639 | } else { 640 | // decimal 641 | j := i 642 | for j < len(s) && isDigit(s[j]) { 643 | j++ 644 | } 645 | if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' { 646 | return 647 | } 648 | r, _ = strconv.Atoi(s[i:j]) 649 | end = j + 1 650 | } 651 | if r > unicode.MaxRune || r == 0 { 652 | // Invalid code points and U+0000 are replaced by U+FFFD. 653 | r = unicode.ReplacementChar 654 | } 655 | return &Plain{string(rune(r))}, end, true 656 | } 657 | 658 | // Max name in list is 32 bytes. Try for 64 for good measure. 659 | for j := i + 1; j < len(s) && j-i < 64; j++ { 660 | if s[j] == '&' { // Stop possible quadratic search on &&&&&&&. 661 | break 662 | } 663 | if s[j] == ';' { 664 | if r, ok := htmlEntity[s[i:j+1]]; ok { 665 | return &Plain{r}, j + 1, true 666 | } 667 | break 668 | } 669 | } 670 | 671 | return 672 | } 673 | --------------------------------------------------------------------------------