├── testdata ├── task_fmt.txt ├── fuzz │ └── Fuzz │ │ ├── 5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147 │ │ ├── e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148 │ │ ├── 900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046 │ │ ├── 6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce │ │ ├── 99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906 │ │ ├── b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f │ │ ├── 4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc │ │ ├── e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9 │ │ ├── 38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71 │ │ └── ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0 ├── heading_fmt.txt ├── code_fmt.txt ├── smart.txt ├── emoji.txt ├── linkref_fmt.txt ├── table_fmt.txt ├── headings.txt ├── del.txt ├── footnote.txt ├── spec2txtar.go ├── task.txt ├── gfm_smart.txt ├── cmark2txtar.go ├── table.txt ├── gfm_regress.txt ├── basic_fmt.txt ├── gfm_ext.txt └── autoext.txt ├── go.mod ├── go.sum ├── README.md ├── block.go ├── doc.go ├── htmltags.go ├── emoji2gist.go ├── LICENSE ├── quote.go ├── mdfmt └── main.go ├── table_test.go ├── entity2go.go ├── md2html └── main.go ├── emoji2go.go ├── fuzz_test.go ├── break.go ├── htmlesc.go ├── lex.go ├── line.go ├── footnote.go ├── print.go ├── big_test.go ├── para.go ├── heading.go ├── code.go ├── parse.go ├── table.go ├── list.go ├── md_test.go └── html.go /testdata/task_fmt.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"TaskList": true} 3 | -- gfm279.md -- 4 | - [ ] foo 5 | - [x] bar 6 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | 
string("*[_*]()\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("\\\\\nr\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("*[a*r*]()\n") 3 | -------------------------------------------------------------------------------- /testdata/heading_fmt.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"HeadingID": true} 3 | -- 1 -- 4 | # H {# id } 5 | -- want -- 6 | # H {#id} 7 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("!][![[]()]()]()\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("- e\n\n o\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("- a\n > b\n ` `\n c'= ```\n; d\n") 3 | -------------------------------------------------------------------------------- 
/testdata/fuzz/Fuzz/4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("1. a\n\n \x05\x05\x05\x05\x052n b\n\n 3. c\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("![([foo]![([oo][i.)](u.\u007ffoo][i1)](u1.)](uri3)\n") 3 | -------------------------------------------------------------------------------- /testdata/fuzz/Fuzz/ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0: -------------------------------------------------------------------------------- 1 | go test fuzz v1 2 | string("1. 
foo\n\n ```\n bar\n ``\n\n ` baz\n\n > bo\n\n `>m\n") 3 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module rsc.io/markdown 2 | 3 | go 1.22.0 4 | 5 | require ( 6 | github.com/yuin/goldmark v1.6.0 // for testing only 7 | golang.org/x/text v0.3.7 8 | golang.org/x/tools v0.1.5 9 | ) 10 | -------------------------------------------------------------------------------- /testdata/code_fmt.txt: -------------------------------------------------------------------------------- 1 | -- 1 -- 2 | `x` 3 | -- want -- 4 | `x` 5 | -- 2 -- 6 | ```x``` 7 | -- want -- 8 | `x` 9 | -- 3 -- 10 | ```` `x` ```` 11 | -- want -- 12 | `` `x` `` 13 | -- 4 -- 14 | `````a ``` b`` ````` 15 | -- want -- 16 | ````a ``` b`` ```` 17 | -------------------------------------------------------------------------------- /testdata/smart.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"SmartQuote": true} 3 | -- 1.md -- 4 | 'hello' 5 | -- 1.html -- 6 |
‘hello’
7 | -- 2.md -- 8 | my'hello' 9 | -- 2.html -- 10 |my’hello’
11 | -- 3.md -- 12 | [my]'hello' 13 | -- 3.html -- 14 |[my]’hello’
15 | -------------------------------------------------------------------------------- /testdata/emoji.txt: -------------------------------------------------------------------------------- 1 | -- parser.json -- 2 | {"Emoji": true} 3 | -- 1.md -- 4 | emojis 5 | :+1: 6 | :100: 7 | :1st_place_medal: 8 | :negative_squared_cross_mark: 9 | :wales: 10 | :south_georgia_south_sandwich_islands: 11 | :woman_facepalming: 12 | end 13 | -- 1.html -- 14 |emojis 15 | 👍 16 | 💯 17 | 🥇 18 | ❎ 19 | 🏴 20 | 🇬🇸 21 | 🤦♀️ 22 | end
23 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68= 2 | github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 3 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= 4 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 5 | golang.org/x/tools v0.1.5 h1:ouewzE6p+/VEB31YYnTbEJdi8pFqKp4P4n85vwo3DHA= 6 | golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Package markdown is a Commonmark-compliant Markdown parser and 2 | HTML generator. It does not have many bells and whistles, but it does 3 | expose the parsed syntax in an easy-to-use form. 4 | 5 | Work in progress. 6 | 7 | TODO: 8 | - documentation 9 | - make Format always print valid markdown, 10 | even when the tree was constructed manually and may 11 | not correspond to something Parse would return. 12 | - footnote support 13 | - possibly math support 14 | - would it be simpler to have a lexer generated from regexps? 15 | -------------------------------------------------------------------------------- /testdata/linkref_fmt.txt: -------------------------------------------------------------------------------- 1 | Tests for rendering a document's link references in markdown. 2 | -- simple -- 3 | A document. 4 | 5 | [foo]: u 6 | -- want -- 7 | A document. 8 | 9 | [foo]: u 10 | -- sorted -- 11 | A document. 12 | 13 | [foo]: u1 14 | [bar]: u2 15 | -- want -- 16 | A document. 17 | 18 | [bar]: u2 19 | [foo]: u1 20 | -- interleaved -- 21 | First. 22 | 23 | [foo]: u1 24 | Second. 25 | 26 | [bar]: u2 27 | -- want -- 28 | First. 
// Block is implemented by:
//
//	CodeBlock
//	Document
//	Empty
//	HTMLBlock
//	Heading
//	Item
//	List
//	Paragraph
//	Quote
//	Text
//	ThematicBreak
type Block interface {
	// Block is a marker method; the implementations visible here
	// (for example Document) give it an empty body.
	Block()
	// Pos returns the position recorded for the block.
	Pos() Position
	// printHTML and printMarkdown write the block to p in the
	// respective output format.
	printHTML(p *printer)
	printMarkdown(p *printer)
}

// A Position records the first and last line of a block.
// NOTE(review): whether lines are 0- or 1-based and whether EndLine is
// inclusive is not visible from this file; confirm against the parser.
type Position struct {
	StartLine int
	EndLine   int
}

// Pos returns p itself, so that any type embedding a Position
// gets the Pos method required by [Block].
func (p Position) Pos() Position {
	return p
}
-------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | type Document struct { 8 | Position 9 | Blocks []Block 10 | Links map[string]*Link 11 | } 12 | 13 | func (*Document) Block() {} 14 | 15 | func (b *Document) printHTML(p *printer) { 16 | for _, c := range b.Blocks { 17 | c.printHTML(p) 18 | } 19 | } 20 | 21 | func (b *Document) printMarkdown(p *printer) { 22 | printMarkdownBlocks(b.Blocks, p) 23 | 24 | // Terminate with a single newline. 25 | text := p.buf.Bytes() 26 | w := len(text) 27 | for w > 0 && text[w-1] == '\n' { 28 | w-- 29 | } 30 | p.buf.Truncate(w) 31 | if w > 0 { 32 | p.nl() 33 | } 34 | 35 | // Add link reference definitions. 36 | if len(b.Links) > 0 { 37 | if p.buf.Len() > 0 { 38 | p.nl() 39 | } 40 | printLinks(p, b.Links) 41 | } 42 | } 43 | 44 | func printMarkdownBlocks(bs []Block, p *printer) { 45 | for bn, b := range bs { 46 | if bn > 0 { 47 | p.nl() // end block 48 | if p.loose > 0 { 49 | p.nl() 50 | } 51 | } 52 | b.printMarkdown(p) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /testdata/headings.txt: -------------------------------------------------------------------------------- 1 | Goldmark fails on 11 because it doesn't like slashes or spaces in ids. 2 | -- parser.json -- 3 | {"HeadingID": true} 4 | -- 1.md -- 5 | # Heading 6 | -- 1.html -- 7 |Hi Hello, there world!
This ~~has a
20 |new paragraph~~.
21 | -- gfm493.md -- 22 | This will ~~~not~~~ strike. 23 | -- gfm493.html -- 24 |This will ~~~not~~~ strike.
25 | -- 1.md -- 26 | 5*6*78 27 | 5_6_78 28 | 5~6~78 29 | -- 1.html -- 30 |5678
31 | 5_6_78
32 | 5678
Hi Hello, there world!
38 | 5678
this
this
this
this
this
this
A footnote can also[^3] have multiple lines2.
14 |My reference. 18 | ↩
19 |To add line breaks within a footnote, prefix new lines with 2 spaces. 22 | This is a second line. 23 | ↩ 24 | ↩
25 |Footnote1.
33 |Hi. 37 | ↩
38 |Footnote1.
46 |Hi. 50 | ↩
// printMarkdown prints the quote as Markdown: it writes "> " for the current
// line and then prints the quoted blocks.
func (b *Quote) printMarkdown(p *printer) {
	p.maybeQuoteNL('>')
	p.WriteString("> ")
	// The pushed "> " is popped again when this method returns.
	// NOTE(review): presumably push installs the prefix the printer adds to
	// each following line of nested output; confirm in the printer code.
	defer p.pop(p.push("> "))
	printMarkdownBlocks(b.Blocks, p)
}

// A quoteBuilder is a [blockBuilder] for a block quote.
type quoteBuilder struct{}

// startBlockQuote is a [starter] for a [Quote].
// If s begins with a quote marker (see trimQuote), it adds a new
// quoteBuilder to the parser and returns the rest of the line and true;
// otherwise it returns s unchanged and false.
func startBlockQuote(p *parser, s line) (line, bool) {
	line, ok := trimQuote(s)
	if !ok {
		return s, false
	}
	p.addBlock(new(quoteBuilder))
	return line, true
}

// trimQuote reports whether s begins with a block quote marker and, if so,
// returns the line with the marker removed. It works on a copy of s, so the
// original line is what is returned when there is no marker.
// NOTE(review): the trimSpace arguments look like "up to 3 spaces before the
// '>' and at most 1 after it", which would match the CommonMark block quote
// rule; confirm against line.trimSpace.
func trimQuote(s line) (line, bool) {
	t := s
	t.trimSpace(0, 3, false)
	if !t.trim('>') {
		return s, false
	}
	t.trimSpace(0, 1, true)
	return t, true
}

// extend strips a quote marker from s in the same way as startBlockQuote,
// reporting false when s has no marker.
func (b *quoteBuilder) extend(p *parser, s line) (line, bool) {
	return trimQuote(s)
}

// build returns the finished [Quote], made from the parser's current
// position and blocks.
func (b *quoteBuilder) build(p *parser) Block {
	return &Quote{p.pos(), p.blocks()}
}
15 | package main 16 | 17 | import ( 18 | "flag" 19 | "fmt" 20 | "io" 21 | "log" 22 | "os" 23 | 24 | "rsc.io/markdown" 25 | ) 26 | 27 | var ( 28 | wflag = flag.Bool("w", false, "write reformatted Markdown back to input files") 29 | exit = 0 30 | ) 31 | 32 | func usage() { 33 | fmt.Fprintf(os.Stderr, "usage: mdfmt [-w] [file...]\n") 34 | flag.PrintDefaults() 35 | os.Exit(2) 36 | } 37 | 38 | func main() { 39 | log.SetPrefix("mdfmt: ") 40 | log.SetFlags(0) 41 | flag.Usage = usage 42 | flag.Parse() 43 | 44 | if flag.NArg() == 0 { 45 | data, err := io.ReadAll(os.Stdin) 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | convert(data, "") 50 | } else { 51 | for _, file := range flag.Args() { 52 | data, err := os.ReadFile(file) 53 | if err != nil { 54 | log.Print(err) 55 | exit = 1 56 | continue 57 | } 58 | convert(data, file) 59 | } 60 | } 61 | os.Exit(exit) 62 | } 63 | 64 | func convert(data []byte, file string) { 65 | var p markdown.Parser 66 | doc := p.Parse(string(data)) 67 | out := []byte(markdown.Format(doc)) 68 | if *wflag && file != "" { 69 | if err := os.WriteFile(file, out, 0666); err != nil { 70 | log.Print(err) 71 | exit = 1 72 | return 73 | } 74 | } else { 75 | os.Stdout.Write(out) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /testdata/spec2txtar.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | // go run spec2txtar.go https://spec.commonmark.org/0.30/spec.json > spec0.30.txt 6 | 7 | package main 8 | 9 | import ( 10 | "encoding/json" 11 | "flag" 12 | "fmt" 13 | "io" 14 | "log" 15 | "net/http" 16 | "os" 17 | "strings" 18 | 19 | "golang.org/x/tools/txtar" 20 | ) 21 | 22 | type specCase struct { 23 | Name string 24 | Markdown string 25 | HTML string 26 | Example int 27 | } 28 | 29 | func main() { 30 | log.SetFlags(0) 31 | log.SetPrefix("spec2txtar: ") 32 | flag.Usage = func() { 33 | fmt.Fprintf(os.Stderr, "usage: spec2txtar url\n") 34 | os.Exit(2) 35 | } 36 | flag.Parse() 37 | if flag.NArg() != 1 { 38 | flag.Usage() 39 | } 40 | url := flag.Arg(0) 41 | 42 | resp, err := http.Get(url) 43 | if err != nil { 44 | log.Fatal(err) 45 | } 46 | if resp.StatusCode != 200 { 47 | log.Fatal(resp.Status) 48 | } 49 | data, err := io.ReadAll(resp.Body) 50 | if err != nil { 51 | log.Fatal(err) 52 | } 53 | 54 | var spec []specCase 55 | err = json.Unmarshal(data, &spec) 56 | if err != nil { 57 | log.Fatal(err) 58 | } 59 | 60 | a := &txtar.Archive{ 61 | Comment: []byte("// go run spec2txtar.go " + url + "\n"), 62 | } 63 | for _, cas := range spec { 64 | name := fmt.Sprintf("%d", cas.Example) 65 | a.Files = append(a.Files, 66 | txtar.File{ 67 | Name: name + ".md", 68 | Data: []byte(encode(cas.Markdown)), 69 | }, 70 | txtar.File{ 71 | Name: name + ".html", 72 | Data: []byte(encode(cas.HTML)), 73 | }, 74 | ) 75 | } 76 | 77 | os.Stdout.Write(txtar.Format(a)) 78 | } 79 | 80 | func encode(s string) string { 81 | s = strings.ReplaceAll(s, " \n", " ^J\n") 82 | s = strings.ReplaceAll(s, "\t\n", "\t^J\n") 83 | if s != "" && !strings.HasSuffix(s, "\n") { 84 | s += "^D\n" 85 | } 86 | return s 87 | } 88 | -------------------------------------------------------------------------------- /table_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The Go Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | var tableCountTests = []struct { 12 | row string 13 | n int 14 | }{ 15 | {"|", 1}, 16 | {"|x|", 1}, 17 | {"||", 1}, 18 | {"| |", 1}, 19 | {"| | |", 2}, 20 | {"| | Foo | Bar |", 3}, 21 | {"| | Foo | Bar |", 3}, 22 | {"", 1}, 23 | {"|a|b", 2}, 24 | {"|a| ", 1}, 25 | {" |b", 1}, 26 | {"a|b", 2}, 27 | {`x\|y`, 1}, 28 | {`x\\|y`, 1}, 29 | {`x\\\|y`, 1}, 30 | {`x\\\\|y`, 1}, 31 | {`x\\\\\|y`, 1}, 32 | {`| 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8 |`, 1}, 33 | } 34 | 35 | func TestTableCount(t *testing.T) { 36 | for _, tt := range tableCountTests { 37 | n := tableCount(tableTrimOuter(tt.row)) 38 | if n != tt.n { 39 | t.Errorf("tableCount(%#q) = %d, want %d", tt.row, n, tt.n) 40 | } 41 | } 42 | } 43 | 44 | func TestPad(t *testing.T) { 45 | testCases := []struct { 46 | raw, align string 47 | w int 48 | 49 | want string 50 | }{ 51 | {"foo", "center", 8, " foo "}, 52 | {"foo", "center", 6, " foo "}, 53 | {"foo", "center", 5, " foo "}, 54 | {"föó", "center", 5, " föó "}, 55 | {"foo", "center", 4, "foo "}, 56 | {"foo", "center", 3, "foo"}, 57 | 58 | {"foo", "left", 8, "foo "}, 59 | {"foo", "right", 8, " foo"}, 60 | {"foo", "", 8, "foo "}, 61 | 62 | {"foo", "left", 6, "foo "}, 63 | {"foo", "right", 6, " foo"}, 64 | {"foo", "", 6, "foo "}, 65 | 66 | {"foo", "left", 5, "foo "}, 67 | {"foo", "right", 5, " foo"}, 68 | {"foo", "", 5, "foo "}, 69 | 70 | {"foo", "left", 4, "foo "}, 71 | {"foo", "right", 4, " foo"}, 72 | {"foo", "", 4, "foo "}, 73 | 74 | {"foo", "left", 3, "foo"}, 75 | {"foo", "right", 3, "foo"}, 76 | {"foo", "", 3, "foo"}, 77 | } 78 | 79 | for _, tc := range testCases { 80 | in := tc.raw 81 | a := tc.align 82 | w := tc.w 83 | want := tc.want 84 | var p printer 85 | pad(&p, in, a, w) 86 | h := p.buf.String() 87 | if h != want { 88 | t.Errorf("\npad(%s, %s, %d)\n have %q\n want 
%q", in, a, w, h, want) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /entity2go.go: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //go:build ignore 6 | 7 | package main 8 | 9 | import ( 10 | "bytes" 11 | "encoding/json" 12 | "flag" 13 | "fmt" 14 | "go/format" 15 | "io" 16 | "log" 17 | "net/http" 18 | "os" 19 | "sort" 20 | "strings" 21 | ) 22 | 23 | var outfile = flag.String("o", "", "write output to `file`") 24 | 25 | func main() { 26 | log.SetFlags(0) 27 | log.SetPrefix("entity2go: ") 28 | flag.Parse() 29 | 30 | resp, err := http.Get("https://html.spec.whatwg.org/entities.json") 31 | if err != nil { 32 | log.Fatal(err) 33 | } 34 | if resp.StatusCode != 200 { 35 | log.Fatal(resp.Status) 36 | } 37 | data, err := io.ReadAll(resp.Body) 38 | if err != nil { 39 | log.Fatal(err) 40 | } 41 | 42 | list := make(map[string]struct { 43 | Codepoints []rune 44 | }) 45 | err = json.Unmarshal(data, &list) 46 | if err != nil { 47 | log.Fatal(err) 48 | } 49 | 50 | var names []string 51 | for name := range list { 52 | names = append(names, name) 53 | } 54 | sort.Strings(names) 55 | 56 | var buf bytes.Buffer 57 | buf.WriteString(hdr) 58 | fmt.Fprintf(&buf, "var htmlEntity = map[string]string{\n") 59 | for _, name := range names { 60 | if !strings.HasSuffix(name, ";") { 61 | continue 62 | } 63 | fmt.Fprintf(&buf, "\t%q: \"", name) 64 | for _, r := range list[name].Codepoints { 65 | if r <= 0xFFFF { 66 | fmt.Fprintf(&buf, "\\u%04x", r) 67 | } else { 68 | fmt.Fprintf(&buf, "\\U%08x", r) 69 | } 70 | } 71 | fmt.Fprintf(&buf, "\",\n") 72 | } 73 | fmt.Fprintf(&buf, "}\n") 74 | 75 | src, err := format.Source(buf.Bytes()) 76 | if err != nil { 77 | log.Fatalf("reformatting output: %v", err) 78 | } 79 | 80 | if *outfile != 
"" { 81 | if err := os.WriteFile(*outfile, src, 0666); err != nil { 82 | log.Fatal(err) 83 | } 84 | } else { 85 | os.Stdout.Write(buf.Bytes()) 86 | } 87 | } 88 | 89 | var hdr = `// Copyright 2023 The Go Authors. All rights reserved. 90 | // Use of this source code is governed by a BSD-style 91 | // license that can be found in the LICENSE file. 92 | 93 | //go:generate go run entity2go.go -o entity.go 94 | 95 | package markdown 96 | 97 | // htmlEntity maps known HTML entity sequences to their meanings. 98 | ` 99 | -------------------------------------------------------------------------------- /md2html/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Md2html converts Markdown to HTML. 6 | // 7 | // Usage: 8 | // 9 | // md2html [file...] 10 | // 11 | // Md2html reads the named files, or else standard input, as Markdown documents 12 | // and then prints the corresponding HTML to standard output. 13 | package main 14 | 15 | import ( 16 | "bytes" 17 | "flag" 18 | "io/ioutil" 19 | "log" 20 | "os" 21 | "unicode/utf8" 22 | 23 | "rsc.io/markdown" 24 | ) 25 | 26 | func main() { 27 | flag.Parse() 28 | args := flag.Args() 29 | if len(args) == 0 { 30 | do(os.Stdin) 31 | } else { 32 | for _, arg := range args { 33 | f, err := os.Open(arg) 34 | if err != nil { 35 | log.Fatal(err) 36 | } 37 | do(f) 38 | f.Close() 39 | } 40 | } 41 | } 42 | 43 | func do(f *os.File) { 44 | data, err := ioutil.ReadAll(f) 45 | if err != nil { 46 | log.Fatal(err) 47 | } 48 | os.Stdout.WriteString(toHTML(data)) 49 | } 50 | 51 | // toHTML converts Markdown to HTML. 
// replaceTabs returns a copy of text in which every tab has been expanded
// to spaces reaching the next 4-column tab stop.
//
// Markdown requires tabs used for indentation to be interpreted with
// 4-space tab stops (see https://spec.commonmark.org/0.30/#tabs), while
// browsers often render tabs 8 columns wide. Expanding up front keeps Go code
// compact and consistent across browsers and remains Markdown-compatible.
//
// Columns are counted one per decoded rune and reset at each newline, so
// multi-codepoint Unicode sequences are not handled correctly.
func replaceTabs(text []byte) []byte {
	const tabStop = 4

	var out bytes.Buffer
	column := 0
	for rest := text; len(rest) > 0; {
		r, n := utf8.DecodeRune(rest)
		rest = rest[n:]

		if r == '\n' {
			out.WriteByte('\n')
			column = 0
			continue
		}
		if r == '\t' {
			// A tab always advances at least one column.
			pad := tabStop - column%tabStop
			for i := 0; i < pad; i++ {
				out.WriteByte(' ')
			}
			column += pad
			continue
		}
		out.WriteRune(r)
		column++
	}
	return out.Bytes()
}
80 |82 |quote
81 |
foo
95 |bar
99 |hello
102 |“Hello,” said the spider. 9 | “‘Shelob’ is my name.”
10 | -- 2.md -- 11 | 'A', 'B', and 'C' are letters. 12 | -- 2.html -- 13 |‘A’, ‘B’, and ‘C’ are letters.
14 | -- 3.md -- 15 | 'Oak,' 'elm,' and 'beech' are names of trees. 16 | So is 'pine.' 17 | -- 3.html -- 18 |‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. 19 | So is ‘pine.’
20 | -- 4.md -- 21 | 'He said, "I want to go."' 22 | -- 4.html -- 23 |‘He said, “I want to go.”’
24 | -- 5.md -- 25 | Were you alive in the 70's? 26 | -- 5.html -- 27 |Were you alive in the 70’s?
28 | -- 6.md -- 29 | Here is some quoted '`code`' and a "[quoted link](url)". 30 | -- 6.html -- 31 |Here is some quoted ‘code’ and a “quoted link”.
’tis the season to be ‘jolly’
36 | -- 8.md -- 37 | 'We'll use Jane's boat and John's truck,' Jenna said. 38 | -- 8.html -- 39 |‘We’ll use Jane’s boat and John’s truck,’ Jenna said.
40 | -- 9.md -- 41 | "A paragraph with no closing quote. 42 | 43 | "Second paragraph by same speaker, in fiction." 44 | -- 9.html -- 45 |“A paragraph with no closing quote.
46 |“Second paragraph by same speaker, in fiction.”
47 | -- 10.md -- 48 | [a]'s b' 49 | -- 10.html -- 50 |[a]’s b’
51 | -- 11.md -- 52 | \"This is not smart.\" 53 | This isn\'t either. 54 | 5\'8\" 55 | -- 11.html -- 56 |"This is not smart." 57 | This isn't either. 58 | 5'8"
59 | -- 12.md -- 60 | Some dashes: em---em 61 | en--en 62 | em --- em 63 | en -- en 64 | 2--3 65 | -- 12.html -- 66 |Some dashes: em—em 67 | en–en 68 | em — em 69 | en – en 70 | 2–3
71 | -- 13.md -- 72 | one- 73 | two-- 74 | three--- 75 | four---- 76 | five----- 77 | six------ 78 | seven------- 79 | eight-------- 80 | nine--------- 81 | thirteen-------------. 82 | -- 13.html -- 83 |one- 84 | two– 85 | three— 86 | four–– 87 | five—– 88 | six—— 89 | seven—–– 90 | eight–––– 91 | nine——— 92 | thirteen———––.
93 | -- 14.md -- 94 | Escaped hyphens: \-- \-\-\-. 95 | -- 14.html -- 96 |Escaped hyphens: -- ---.
97 | -- 15.md -- 98 | Ellipses...and...and.... 99 | -- 15.html -- 100 |Ellipses…and…and….
101 | -- 16.md -- 102 | No ellipses\.\.\. 103 | -- 16.html -- 104 |No ellipses...
// parsers maps the info string of a non-default example code block to the
// JSON written as a parser.json file in front of that example. An info string
// other than plain "example" that is missing from this map causes the example
// to be skipped.
//
// NOTE(review): the hand-written testdata files in this repository
// (task.txt, task_fmt.txt) spell the task-list option "TaskList"; confirm
// that "TaskListItems" below is a key the test harness actually recognizes.
var parsers = map[string]string{
	"example autolink":      `{"AutoLinkText": true, "AutoLinkAssumeHTTP": true}`,
	"example disabled":      `{"TaskListItems": true}`,
	"example strikethrough": `{"Strikethrough": true}`,
	"example table":         `{"Table": true}`,
}
// encode makes whitespace that would otherwise be invisible in a txtar file
// explicit: a space or tab directly before a newline is followed by the
// marker "^J", and non-empty text lacking a final newline is terminated
// with "^D" and a newline.
func encode(s string) string {
	for _, ws := range []string{" ", "\t"} {
		s = strings.ReplaceAll(s, ws+"\n", ws+"^J\n")
	}
	if s == "" || strings.HasSuffix(s, "\n") {
		return s
	}
	return s + "^D\n"
}

// join concatenates the lines in s, ending each with a newline, and turns
// every "→" into a tab character. It returns "" for an empty slice.
func join(s []string) string {
	if len(s) == 0 {
		return ""
	}
	var b strings.Builder
	for _, line := range s {
		b.WriteString(line)
		b.WriteByte('\n')
	}
	return strings.ReplaceAll(b.String(), "→", "\t")
}
// get fetches url and returns the response body.
// Any failure, including a status other than 200, is fatal.
func get(url string) []byte {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	// The caller owns the response body and must close it; get is called
	// more than once, so do not leave the connection dangling.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Fatal(resp.Status)
	}
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	return data
}
|---|---|
| baz | 18 |bim | 19 |
| abc | 31 |defghi | 32 |
|---|---|
| bar | 37 |baz | 38 |
| f|oo | 51 |
|---|
b | az |
56 |
| b | im | 59 |
| abc | 72 |def | 73 |
|---|---|
| bar | 78 |baz | 79 |
83 |85 | -- gfm202.md -- 86 | | abc | def | 87 | | --- | --- | 88 | | bar | baz | 89 | bar 90 | 91 | bar 92 | -- gfm202.html -- 93 |bar
84 |
| abc | 97 |def | 98 |
|---|---|
| bar | 103 |baz | 104 |
| bar | 107 |108 | |
bar
112 | -- gfm203.md -- 113 | | abc | def | 114 | | --- | 115 | | bar | 116 | -- gfm203.html -- 117 || abc | def | 118 | | --- | 119 | | bar |
120 | -- gfm204.md -- 121 | | abc | def | 122 | | --- | --- | 123 | | bar | 124 | | bar | baz | boo | 125 | -- gfm204.html -- 126 || abc | 130 |def | 131 |
|---|---|
| bar | 136 |137 | |
| bar | 140 |baz | 141 |
| abc | 152 |def | 153 |
|---|
hello world 169 | this is a test
170 |171 |185 |a
172 |173 | 174 |
184 |175 | 177 | 178 | 179 |b 176 |180 | 182 | 183 |d 181 |
e
186 |187 |190 | -- 2.md -- 191 | | 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8 | 192 | | ------ | 193 | -- 2.html -- 194 |e 188 | c
189 |
| 0|1|2\|3\|4\\|5\\|6\\\|7\\\|8 | 198 |
|---|
| 211 | | Foo | 212 |Bar | 213 |
|---|---|---|
| a | 218 |value1 | 219 |value2 | 220 |
| b | 223 |value3 | 224 |value4 | 225 |
| 235 | |- 236 | |x 237 | |
238 | -- 5.md -- 239 | || 240 | |- 241 | |x 242 | | 243 | -- 5.html -- 244 || 248 | |
|---|
| x | 253 |
|
257 | -------------------------------------------------------------------------------- /lex.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | import ( 8 | "strings" 9 | "unicode" 10 | ) 11 | 12 | // isPunct reports whether c is Markdown punctuation. 13 | func isPunct(c byte) bool { 14 | return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~' 15 | } 16 | 17 | // isLetter reports whether c is an ASCII letter. 18 | func isLetter(c byte) bool { 19 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 20 | } 21 | 22 | // isDigit reports whether c is an ASCII digit. 23 | func isDigit(c byte) bool { 24 | return '0' <= c && c <= '9' 25 | } 26 | 27 | // isLetterDigit reports whether c is an ASCII letter or digit. 28 | func isLetterDigit(c byte) bool { 29 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' 30 | } 31 | 32 | // isLDH reports whether c is an ASCII letter, digit, or hyphen. 33 | func isLDH(c byte) bool { 34 | return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '-' 35 | } 36 | 37 | // isHexDigit reports whether c is an ASCII hexadecimal digit. 38 | func isHexDigit(c byte) bool { 39 | return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9' 40 | } 41 | 42 | // isUnocdeSpace reports whether r is a Unicode space as defined by Markdown. 43 | // This is not the same as unicode.IsSpace. 44 | // For example, U+0085 does not satisfy isUnicodeSpace 45 | // but does satisfy unicode.IsSpace. 
46 | func isUnicodeSpace(r rune) bool { 47 | if r < 0x80 { 48 | return r == ' ' || r == '\t' || r == '\f' || r == '\n' 49 | } 50 | return unicode.In(r, unicode.Zs) 51 | } 52 | 53 | // isUnocdeSpace reports whether r is Unicode punctuation as defined by Markdown. 54 | // This is not the same as unicode.Punct; it also includes unicode.Symbol. 55 | func isUnicodePunct(r rune) bool { 56 | if r < 0x80 { 57 | return isPunct(byte(r)) 58 | } 59 | return unicode.In(r, unicode.Punct, unicode.Symbol) 60 | } 61 | 62 | // skipSpace returns i + the number of spaces, tabs, carriage returns, and newlines 63 | // at the start of s[i:]. That is, it skips i past any such characters, returning the new i. 64 | func skipSpace(s string, i int) int { 65 | // Note: Blank lines have already been removed. 66 | for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') { 67 | i++ 68 | } 69 | return i 70 | } 71 | 72 | // mdEscaper escapes symbols that are used in inline Markdown sequences. 73 | // TODO(rsc): There is a better way to do this. 74 | var mdEscaper = strings.NewReplacer( 75 | `(`, `\(`, 76 | `)`, `\)`, 77 | `[`, `\[`, 78 | `]`, `\]`, 79 | `*`, `\*`, 80 | `_`, `\_`, 81 | `<`, `\<`, 82 | `>`, `\>`, 83 | ) 84 | 85 | // mdLinkEscaper escapes symbols that have meaning inside a link target. 86 | var mdLinkEscaper = strings.NewReplacer( 87 | `(`, `\(`, 88 | `)`, `\)`, 89 | `<`, `\<`, 90 | `>`, `\>`, 91 | ) 92 | 93 | // mdUnscape returns the Markdown unescaping of s. 94 | func mdUnescape(s string) string { 95 | if !strings.Contains(s, `\`) && !strings.Contains(s, `&`) { 96 | return s 97 | } 98 | return mdUnescaper.Replace(s) 99 | } 100 | 101 | // mdUnescaper unescapes Markdown escape sequences and HTML entities. 102 | // TODO(rsc): Perhaps there is a better way to do this. 
103 | var mdUnescaper = func() *strings.Replacer { 104 | var list = []string{ 105 | `\!`, `!`, 106 | `\"`, `"`, 107 | `\#`, `#`, 108 | `\$`, `$`, 109 | `\%`, `%`, 110 | `\&`, `&`, 111 | `\'`, `'`, 112 | `\(`, `(`, 113 | `\)`, `)`, 114 | `\*`, `*`, 115 | `\+`, `+`, 116 | `\,`, `,`, 117 | `\-`, `-`, 118 | `\.`, `.`, 119 | `\/`, `/`, 120 | `\:`, `:`, 121 | `\;`, `;`, 122 | `\<`, `<`, 123 | `\=`, `=`, 124 | `\>`, `>`, 125 | `\?`, `?`, 126 | `\@`, `@`, 127 | `\[`, `[`, 128 | `\\`, `\`, 129 | `\]`, `]`, 130 | `\^`, `^`, 131 | `\_`, `_`, 132 | "\\`", "`", 133 | `\{`, `{`, 134 | `\|`, `|`, 135 | `\}`, `}`, 136 | `\~`, `~`, 137 | } 138 | 139 | for name, repl := range htmlEntity { 140 | list = append(list, name, repl) 141 | } 142 | return strings.NewReplacer(list...) 143 | }() 144 | -------------------------------------------------------------------------------- /testdata/gfm_regress.txt: -------------------------------------------------------------------------------- 1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/regression.txt 2 | -- 1.md -- 3 | line1 4 | 5 | line2 6 | -- 1.html -- 7 |line1
8 |line2
9 | -- 2.md -- 10 | By taking it apart 11 | 12 | - alternative solutions 13 | ^J 14 | Repeatedly solving 15 | ^J 16 | - how techniques 17 | -- 2.html -- 18 |By taking it apart
19 |Repeatedly solving
23 |a*b c
54 | -- 6.md -- 55 | [a] 56 | 57 | [a]:[a]
60 |[a]: <te
[a](te\ st)
65 | -- parser.json -- 66 | {"Strikethrough": true} 67 | -- 8.md -- 68 | ~~**_`this`_**~~ ^J 69 | ~~***`this`***~~ ^J 70 | ~~___`this`___~~ 71 | 72 | **_`this`_** ^J 73 | ***`this`*** ^J 74 | ___`this`___ 75 | 76 | ~~**_this_**~~ ^J 77 | ~~***this***~~ ^J 78 | ~~___this___~~ 79 | 80 | **_this_** ^J 81 | ***this*** ^J 82 | ___this___ 83 | -- 8.html -- 84 |this
85 | this
86 | this
this
88 | this
89 | this
this
91 | this
92 | this
this
94 | this
95 | this
City: 105 | 106 | 107 |
108 | -- parser.json -- 109 | {"Strikethrough": true} 110 | -- 10.md -- 111 | ~Hi~ Hello, world! 112 | -- 10.html -- 113 |Hi Hello, world!
This text is ~~~curious~~~.
City: 135 | 136 | 137 |
138 | -- 14.md -- 139 | [a](\ b) 140 | 141 | [a](<[a](\ b) 147 |[a](<<b)
148 |[a](<b 149 | )
150 | -- 15.md -- 151 | [link](url ((title)) 152 | -- 15.html -- 153 |[link](url ((title))
154 | -- 16.md -- 155 | 156 | 157 | 158 | 159 | 160 | -- 16.html -- 161 | 162 | 163 | 164 | -- 17.md -- 165 | [a]( 166 | -- 17.html -- 167 |[a](<b) c>
168 | -- parser.json -- 169 | {"Table": true} 170 | -- 18.md -- 171 | | 172 | -| 173 | -- 18.html -- 174 || 175 | -|
176 | -- parser.json -- 177 | {} 178 | -- 19.md -- 179 | *text* [link](#section) 180 | -- 19.html -- 181 |text link
182 | -------------------------------------------------------------------------------- /line.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package markdown 6 | 7 | type line struct { 8 | spaces int 9 | i int 10 | tab int 11 | text string 12 | nl byte // newline character ending this line: \r or \n or \r+\n or zero for EOF 13 | nonblank int // index of first non-space, non-tab char in text; len(text) if none 14 | } 15 | 16 | func makeLine(text string, nl byte) line { 17 | s := line{text: text, nl: nl} 18 | s.setNonblank() 19 | return s 20 | } 21 | 22 | func (s *line) setNonblank() { 23 | i := s.i 24 | for i < len(s.text) && (s.text[i] == ' ' || s.text[i] == '\t') { 25 | i++ 26 | } 27 | s.nonblank = i 28 | } 29 | 30 | func (s *line) peek() byte { 31 | if s.spaces > 0 { 32 | return ' ' 33 | } 34 | if s.i >= len(s.text) { 35 | return 0 36 | } 37 | return s.text[s.i] 38 | } 39 | 40 | func (s *line) skipSpace() { 41 | s.spaces = 0 42 | if s.nonblank < s.i { 43 | panic("nonblank") 44 | } 45 | s.i = s.nonblank 46 | } 47 | 48 | func (s *line) trimSpace(min, max int, eolOK bool) bool { 49 | t := *s 50 | 51 | for n := 0; n < max; n++ { 52 | if t.spaces > 0 { 53 | t.spaces-- 54 | continue 55 | } 56 | if t.i >= len(t.text) && eolOK { 57 | continue 58 | } 59 | // TODO performance bottleneck here using trimSpace with list extensions? 60 | // but each only fails once? 
61 | if t.i < len(t.text) { 62 | switch t.text[t.i] { 63 | case '\t': 64 | t.spaces = 4 - (t.i-t.tab)&3 - 1 65 | t.i++ 66 | t.tab = t.i // TODO seems wrong 67 | continue 68 | case ' ': 69 | t.i++ 70 | continue 71 | } 72 | } 73 | if n >= min { 74 | break 75 | } 76 | return false 77 | } 78 | if t.nonblank < t.i { 79 | t.setNonblank() 80 | } 81 | *s = t 82 | return true 83 | } 84 | 85 | func (s *line) trim(c byte) bool { 86 | if s.spaces > 0 { 87 | if c == ' ' { 88 | s.spaces-- 89 | return true 90 | } 91 | return false 92 | } 93 | if s.i < len(s.text) && s.text[s.i] == c { 94 | s.i++ 95 | if s.nonblank < s.i { 96 | s.setNonblank() 97 | } 98 | return true 99 | } 100 | return false 101 | } 102 | 103 | func (s *line) skip(n int) { 104 | s.i += n 105 | if s.nonblank < s.i { 106 | s.setNonblank() 107 | } 108 | } 109 | 110 | func (s *line) string() string { 111 | switch s.spaces { 112 | case 0: 113 | return s.text[s.i:] 114 | case 1: 115 | return " " + s.text[s.i:] 116 | case 2: 117 | return " " + s.text[s.i:] 118 | case 3: 119 | return " " + s.text[s.i:] 120 | } 121 | // unreachable 122 | panic("bad spaces") 123 | } 124 | 125 | func trimLeftSpaceTab(s string) string { 126 | i := 0 127 | for i < len(s) && (s[i] == ' ' || s[i] == '\t') { 128 | i++ 129 | } 130 | return s[i:] 131 | } 132 | 133 | func trimRightSpaceTab(s string) string { 134 | j := len(s) 135 | for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') { 136 | j-- 137 | } 138 | return s[:j] 139 | } 140 | 141 | func trimSpaceTab(s string) string { 142 | i := 0 143 | for i < len(s) && (s[i] == ' ' || s[i] == '\t') { 144 | i++ 145 | } 146 | s = s[i:] 147 | j := len(s) 148 | for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') { 149 | j-- 150 | } 151 | return s[:j] 152 | } 153 | 154 | func trimSpace(s string) string { 155 | i := 0 156 | for i < len(s) && (s[i] == ' ' || s[i] == '\t') { 157 | i++ 158 | } 159 | s = s[i:] 160 | j := len(s) 161 | for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t') { 162 | j-- 163 | } 164 | return s[:j] 165 | 
} 166 | 167 | func trimSpaceTabNewline(s string) string { 168 | i := 0 169 | for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') { 170 | i++ 171 | } 172 | s = s[i:] 173 | j := len(s) 174 | for j > 0 && (s[j-1] == ' ' || s[j-1] == '\t' || s[j-1] == '\n') { 175 | j-- 176 | } 177 | return s[:j] 178 | } 179 | 180 | func (s *line) isBlank() bool { 181 | return s.nonblank == len(s.text) 182 | } 183 | 184 | func (s *line) eof() bool { 185 | return s.i >= len(s.text) 186 | } 187 | 188 | func (s *line) trimSpaceString() string { 189 | return s.text[s.nonblank:] 190 | } 191 | 192 | func (s *line) trimString() string { 193 | if s.nonblank < s.i { 194 | panic("bad blank") 195 | } 196 | return trimSpaceTab(s.text[s.nonblank:]) 197 | } 198 | -------------------------------------------------------------------------------- /footnote.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package markdown 6 | 7 | import ( 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | type Footnote struct { 13 | Position 14 | Label string 15 | Blocks []Block 16 | } 17 | 18 | type FootnoteLink struct { 19 | Label string 20 | Footnote *Footnote 21 | } 22 | 23 | type printedNote struct { 24 | num string 25 | note *Footnote 26 | refs []string 27 | } 28 | 29 | func (*FootnoteLink) Inline() {} 30 | 31 | func (x *Footnote) printed(p *printer) *printedNote { 32 | if p.footnotes == nil { 33 | p.footnotes = make(map[*Footnote]*printedNote) 34 | } 35 | pr, ok := p.footnotes[x] 36 | if !ok { 37 | pr = &printedNote{ 38 | num: strconv.Itoa(len(p.footnotes) + 1), 39 | note: x, 40 | } 41 | p.footnotes[x] = pr 42 | p.footnotelist = append(p.footnotelist, pr) 43 | } 44 | ref := pr.num 45 | if len(pr.refs) > 0 { 46 | ref += "-" + strconv.Itoa(len(pr.refs)+1) 47 | } 48 | pr.refs = append(pr.refs, ref) 49 | return pr 50 | } 51 | 52 | func (x *FootnoteLink) printHTML(p *printer) { 53 | note := x.Footnote 54 | if note == nil { 55 | return 56 | } 57 | pr := note.printed(p) 58 | ref := pr.refs[len(pr.refs)-1] 59 | p.html(``, pr.num, ``) 60 | } 61 | 62 | func (x *FootnoteLink) printMarkdown(p *printer) { 63 | note := x.Footnote 64 | if note == nil { 65 | return 66 | } 67 | note.printed(p) // add to list for printFootnoteMarkdown 68 | p.text(`[^`, x.Label, `]`) 69 | } 70 | 71 | func (x *FootnoteLink) printText(p *printer) { 72 | p.text(`[^`, x.Label, `]`) 73 | } 74 | 75 | func printFootnoteHTML(p *printer) { 76 | if len(p.footnotelist) == 0 { 77 | return 78 | } 79 | 80 | p.html(`\n") 91 | } 92 | for _, ref := range note.refs { 93 | p.html("\n", `↩`) 94 | } 95 | p.html("
\n") 96 | p.html("" + rep("a a ", 65000) + "b" + rep(" a a", 65000) + "
\n", 34 | }, 35 | { 36 | "many emph closers with no openers", 37 | rep("a_ ", 65000), 38 | "", 39 | }, 40 | { 41 | "many emph openers with no closers", 42 | rep("_a ", 65000), 43 | "", 44 | }, 45 | { 46 | "many link closers with no openers", 47 | rep("a]", 65000), 48 | "", 49 | }, 50 | { 51 | "many link openers with no closers", 52 | rep("[a", 65000), 53 | "", 54 | }, 55 | { 56 | "mismatched openers and closers", 57 | rep("*a_ ", 50000), 58 | "", 59 | }, 60 | { 61 | "openers and closers multiple of 3", 62 | "a**b" + rep("c* ", 50000), 63 | "", 64 | }, 65 | { 66 | "link openers and emph closers", 67 | rep("[ a_", 50000), 68 | "", 69 | }, 70 | { 71 | "pattern [ (]( repeated", 72 | rep("[ (](", 80000), 73 | "", 74 | }, 75 | { 76 | "pattern ![[]() repeated", 77 | rep("![[]()", 160000), 78 | "\n", 79 | }, 80 | { 81 | "hard link/emph case", 82 | "**x [a*b**c*](d)", 83 | `**x ab**c
` + "\n", 84 | }, 85 | { 86 | "nested brackets", 87 | rep("[", 50000) + "a" + rep("]", 50000), 88 | "", 89 | }, 90 | { 91 | "nested block quotes", 92 | rep("> ", 50000) + "a", 93 | rep("\n", 50000) + "\n", 50000), 94 | }, 95 | { 96 | "deeply nested lists", 97 | repf(func(x int) string { return rep(" ", x) + "* a\n" }, 4000), 98 | "a
\n" + rep("
" + rep("[a](b#", 30000) + "
\n", 124 | }, 125 | { 126 | "unclosed 336 | The `go` subcommands now accept 337 | `-C` ` tags.
14 | //
15 | // When printing a CodeBlock as Markdown, the Fence field is used as
16 | // a starting hint but is made longer as needed if the suggested fence text
17 | // appears in Text.
18 | //
19 | // [indented code block]: https://spec.commonmark.org/0.31.2/#indented-code-blocks
20 | // [fenced code block]: https://spec.commonmark.org/0.31.2/#fenced-code-blocks
type CodeBlock struct {
	Position          // embedded source position; the builders in this file fill it from the parser's current block position
	Fence    string   // fence to use; empty for an indented (unfenced) code block
	Info     string   // info following open fence; always empty for an indented code block
	Text     []string // lines of code block, one entry per line, without line terminators
}
27 |
// Block is a method with an empty body. The builders in this file return
// *CodeBlock values as Block, so it presumably exists only to satisfy the
// Block interface, which is declared elsewhere — confirm there.
func (*CodeBlock) Block() {}
29 |
30 | func (b *CodeBlock) printHTML(p *printer) {
31 | p.html("")
50 | for _, s := range b.Text {
51 | p.text(s, "\n")
52 | }
53 | p.html("
\n")
54 | }
55 |
56 | func (b *CodeBlock) printMarkdown(p *printer) {
57 | if b.Fence == "" {
58 | p.maybeNL()
59 | for i, line := range b.Text {
60 | if i > 0 {
61 | p.nl()
62 | }
63 | p.md(" ")
64 | p.md(line)
65 | p.noTrim()
66 | }
67 | } else {
68 | // TODO compute correct fence
69 | if p.tight == 0 {
70 | p.maybeNL()
71 | }
72 | p.md(b.Fence)
73 | p.md(b.Info)
74 | for _, line := range b.Text {
75 | p.nl()
76 | p.md(line)
77 | p.noTrim()
78 | }
79 | p.nl()
80 | p.md(b.Fence)
81 | }
82 | }
83 |
84 | // startIndentedCodeBlock is a [starter] for an indented [CodeBlock].
85 | // See https://spec.commonmark.org/0.31.2/#indented-code-blocks.
86 | func startIndentedCodeBlock(p *parser, s line) (line, bool) {
87 | // Line must start with 4 spaces and then not be blank.
88 | peek := s
89 | if p.para() != nil || !peek.trimSpace(4, 4, false) || peek.isBlank() {
90 | return s, false
91 | }
92 |
93 | b := &indentBuilder{}
94 | p.addBlock(b)
95 | if peek.nl != '\n' {
96 | p.corner = true // goldmark does not normalize to \n
97 | }
98 | b.text = append(b.text, peek.string())
99 | return line{}, true
100 | }
101 |
102 | // startFencedCodeBlock is a [starter] for a fenced [CodeBlock].
103 | // See https://spec.commonmark.org/0.31.2/#fenced-code-blocks.
104 | func startFencedCodeBlock(p *parser, s line) (line, bool) {
105 | // Line must start with fence.
106 | indent, fence, info, ok := trimFence(&s)
107 | if !ok {
108 | return s, false
109 | }
110 |
111 | // Note presence of corner cases, for testing.
112 | if fence[0] == '~' && info != "" {
113 | // goldmark does not handle info after ~~~
114 | p.corner = true
115 | } else if info != "" && !isLetter(info[0]) {
116 | // goldmark does not allow numbered info.
117 | // goldmark does not treat a tab as introducing a new word.
118 | p.corner = true
119 | }
120 | for _, c := range info {
121 | if isUnicodeSpace(c) {
122 | if c != ' ' {
123 | // goldmark only breaks on space
124 | p.corner = true
125 | }
126 | break
127 | }
128 | }
129 |
130 | p.addBlock(&fenceBuilder{indent, fence, info, nil})
131 | return line{}, true
132 | }
133 |
134 | // trimFence attempts to trim leading indentation (up to 3 spaces),
135 | // a code fence, and an info string from s.
136 | // If successful, it returns those values and ok=true, leaving s empty.
137 | // If unsuccessful, it leaves s unmodified and returns ok=false.
138 | func trimFence(s *line) (indent int, fence, info string, ok bool) {
139 | t := *s
140 | indent = 0
141 | for indent < 3 && t.trimSpace(1, 1, false) {
142 | indent++
143 | }
144 | c := t.peek()
145 | if c != '`' && c != '~' {
146 | return
147 | }
148 |
149 | f := t.string()
150 | n := 0
151 | for t.trim(c) {
152 | n++
153 | }
154 | if n < 3 {
155 | return
156 | }
157 |
158 | txt := mdUnescaper.Replace(t.trimString())
159 | if c == '`' && strings.Contains(txt, "`") {
160 | return
161 | }
162 | info = trimSpaceTab(txt)
163 | fence = f[:n]
164 | ok = true
165 | *s = line{}
166 | return
167 | }
168 |
// An indentBuilder is a [blockBuilder] for an indented (unfenced) [CodeBlock].
// It is created by startIndentedCodeBlock and collects one entry in text
// per line of the block.
type indentBuilder struct {
	indent string   // NOTE(review): not read or written by the code visible here; possibly unused — confirm
	text   []string // lines collected so far, each with its four columns of indentation already removed
}
174 |
175 | func (c *indentBuilder) extend(p *parser, s line) (line, bool) {
176 | // Extension lines must start with 4 spaces or be blank.
177 | if !s.trimSpace(4, 4, true) {
178 | return s, false
179 | }
180 | c.text = append(c.text, s.string())
181 | if s.nl != '\n' {
182 | p.corner = true // goldmark does not normalize to \n
183 | }
184 | return line{}, true
185 | }
186 |
187 | func (b *indentBuilder) build(p *parser) Block {
188 | // Remove trailing blank lines, which are often used
189 | // just to separate the indented code block from what follows.
190 | for len(b.text) > 0 && b.text[len(b.text)-1] == "" {
191 | b.text = b.text[:len(b.text)-1]
192 | }
193 | return &CodeBlock{p.pos(), "", "", b.text}
194 | }
195 |
// A fenceBuilder is a [blockBuilder] for a fenced [CodeBlock].
// It is created by startFencedCodeBlock from the pieces of the opening
// fence line returned by trimFence.
type fenceBuilder struct {
	indent int      // columns of indentation (0 to 3) before the opening fence; extend strips up to this much from each line
	fence  string   // opening fence text; a closing fence must have this as a prefix
	info   string   // info string from the opening fence line
	text   []string // lines collected so far
}
203 |
204 | func (c *fenceBuilder) extend(p *parser, s line) (line, bool) {
205 | // Check for closing fence, which must be at least as long as opening fence, with no info.
206 | // The closing fence can be indented less than the opening one.
207 | peek := s
208 | if _, fence, info, ok := trimFence(&peek); ok && strings.HasPrefix(fence, c.fence) && info == "" {
209 | return line{}, false
210 | }
211 |
212 | // Otherwise trim the indentation from the fence line, if present.
213 | if !s.trimSpace(c.indent, c.indent, false) {
214 | p.corner = true // goldmark mishandles fenced blank lines with not enough spaces
215 | s.trimSpace(0, c.indent, false)
216 | }
217 |
218 | c.text = append(c.text, s.string())
219 | p.corner = p.corner || s.nl != '\n' // goldmark does not normalize to \n
220 | return line{}, true
221 | }
222 |
223 | func (c *fenceBuilder) build(p *parser) Block {
224 | return &CodeBlock{p.pos(), c.fence, c.info, c.text}
225 | }
226 |
--------------------------------------------------------------------------------
/testdata/gfm_ext.txt:
--------------------------------------------------------------------------------
1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/extensions.txt
2 | -- parser.json --
3 | {"Strikethrough": true, "Table": true}
4 | -- 1.md --
5 | | abc | def |
6 | | --- | --- |
7 | | ghi | jkl |
8 | | mno | pqr |
9 | -- 1.html --
10 |
11 |
12 |
13 | abc
14 | def
15 |
16 |
17 |
18 |
19 | ghi
20 | jkl
21 |
22 |
23 | mno
24 | pqr
25 |
26 |
27 |
28 | -- 2.md --
29 | Hello!
30 |
31 | | _abc_ | セン |
32 | | ----- | ---- |
33 | | 1. Block elements inside cells don't work. | |
34 | | But _**inline elements do**_. | x |
35 |
36 | Hi!
37 | -- 2.html --
38 | Hello!
39 |
40 |
41 |
42 | abc
43 | セン
44 |
45 |
46 |
47 |
48 | 1. Block elements inside cells don't work.
49 |
50 |
51 |
52 | But inline elements do.
53 | x
54 |
55 |
56 |
57 | Hi!
58 | -- 3.md --
59 | | Not enough table | to be considered table |
60 |
61 | | Not enough table | to be considered table |
62 | | Not enough table | to be considered table |
63 |
64 | | Just enough table | to be considered table |
65 | | ----------------- | ---------------------- |
66 |
67 | | ---- | --- |
68 |
69 | |x|
70 | |-|
71 |
72 | | xyz |
73 | | --- |
74 | -- 3.html --
75 | | Not enough table | to be considered table |
76 | | Not enough table | to be considered table |
77 | | Not enough table | to be considered table |
78 |
79 |
80 |
81 | Just enough table
82 | to be considered table
83 |
84 |
85 |
86 | | ---- | --- |
87 |
88 |
89 |
90 | x
91 |
92 |
93 |
94 |
95 |
96 |
97 | xyz
98 |
99 |
100 |
101 | -- 4.md --
102 | abc | def
103 | --- | ---
104 | xyz | ghi
105 | -- 4.html --
106 |
107 |
108 |
109 | abc
110 | def
111 |
112 |
113 |
114 |
115 | xyz
116 | ghi
117 |
118 |
119 |
120 | -- 5.md --
121 | Hello!
122 |
123 | | _abc_ | セン |
124 | | ----- | ---- |
125 | | this row has a space at the end | | ^J
126 | | But _**inline elements do**_. | x |
127 |
128 | Hi!
129 | -- 5.html --
130 | Hello!
131 |
132 |
133 |
134 | abc
135 | セン
136 |
137 |
138 |
139 |
140 | this row has a space at the end
141 |
142 |
143 |
144 | But inline elements do.
145 | x
146 |
147 |
148 |
149 | Hi!
150 | -- 6.md --
151 | aaa | bbb | ccc | ddd | eee
152 | :-- | --- | :-: | --- | --:
153 | fff | ggg | hhh | iii | jjj
154 | -- 6.html --
155 |
156 |
157 |
158 | aaa
159 | bbb
160 | ccc
161 | ddd
162 | eee
163 |
164 |
165 |
166 |
167 | fff
168 | ggg
169 | hhh
170 | iii
171 | jjj
172 |
173 |
174 |
175 | -- 7.md --
176 | | a | b | c |
177 | | --- | --- |
178 | | this | isn't | okay |
179 | -- 7.html --
180 | | a | b | c |
181 | | --- | --- |
182 | | this | isn't | okay |
183 | -- 8.md --
184 | | a | b | c |
185 | | --- | --- | ---
186 | | x
187 | | a | b
188 | | 1 | 2 | 3 | 4 | 5 |
189 | -- 8.html --
190 |
191 |
192 |
193 | a
194 | b
195 | c
196 |
197 |
198 |
199 |
200 | x
201 |
202 |
203 |
204 |
205 | a
206 | b
207 |
208 |
209 |
210 | 1
211 | 2
212 | 3
213 |
214 |
215 |
216 | -- 9.md --
217 | | a | b |
218 | | --- | --- |
219 | | Escaped pipes are \|okay\|. | Like \| this. |
220 | | Within `\|code\| is okay` too. |
221 | | _**`c\|`**_ \| complex
222 | | don't **\_reparse\_**
223 | -- 9.html --
224 |
225 |
226 |
227 | a
228 | b
229 |
230 |
231 |
232 |
233 | Escaped pipes are |okay|.
234 | Like | this.
235 |
236 |
237 | Within |code| is okay too.
238 |
239 |
240 |
241 | c| | complex
242 |
243 |
244 |
245 | don't _reparse_
246 |
247 |
248 |
249 |
250 | -- 10.md --
251 | | a |
252 | --- |
253 | -- 10.html --
254 |
255 |
256 |
257 | a
258 |
259 |
260 |
261 | -- 11.md --
262 | | a | b |
263 | | --- | --- |
264 | | \\ | `\\` |
265 | | \\\\ | `\\\\` |
266 | | \_ | `\_` |
267 | | \| | `\|` |
268 | | \a | `\a` |
269 |
270 | \\ `\\`
271 |
272 | \\\\ `\\\\`
273 |
274 | \_ `\_`
275 |
276 | \| `\|`
277 |
278 | \a `\a`
279 | -- 11.html --
280 |
281 |
282 |
283 | a
284 | b
285 |
286 |
287 |
288 |
289 | \
290 | \\
291 |
292 |
293 | \\
294 | \\\\
295 |
296 |
297 | _
298 | \_
299 |
300 |
301 | |
302 | |
303 |
304 |
305 | \a
306 | \a
307 |
308 |
309 |
310 | \ \\
311 | \\ \\\\
312 | _ \_
313 | | \|
314 | \a \a
315 | -- 12.md --
316 | | a |
317 | | --- |
318 | | hello |
319 | | ok
sure |
320 | -- 12.html --
321 |
322 |
323 |
324 | a
325 |
326 |
327 |
328 |
329 | hello
330 |
331 |
332 | ok
sure
333 |
334 |
335 |
336 | -- 13.md --
337 | Here's a link to [Freedom Planet 2][].
338 |
339 | | Here's a link to [Freedom Planet 2][] in a table header. |
340 | | --- |
341 | | Here's a link to [Freedom Planet 2][] in a table row. |
342 |
343 | [Freedom Planet 2]: http://www.freedomplanet2.com/
344 | -- 13.html --
345 | Here's a link to Freedom Planet 2.
346 |
347 |
348 |
349 | Here's a link to Freedom Planet 2 in a table header.
350 |
351 |
352 |
353 |
354 | Here's a link to Freedom Planet 2 in a table row.
355 |
356 |
357 |
358 | -- 14.md --
359 | | a | b | c |
360 | | --- | --- | --- |
361 | | d || e |
362 | -- 14.html --
363 |
364 |
365 |
366 | a
367 | b
368 | c
369 |
370 |
371 |
372 |
373 | d
374 |
375 | e
376 |
377 |
378 |
379 | -- 15.md --
380 | | a | b |
381 | | --- | --- |
382 | |***(a)***|
383 | -- 15.html --
384 |
385 |
386 |
387 | a
388 | b
389 |
390 |
391 |
392 |
393 | (a)
394 |
395 |
396 |
397 |
398 | -- 16.md --
399 | 123
400 | 456
401 | | a | b |
402 | | ---| --- |
403 | d | e
404 | -- 16.html --
405 | 123
406 | 456
407 |
408 |
409 |
410 | a
411 | b
412 |
413 |
414 |
415 |
416 | d
417 | e
418 |
419 |
420 |
421 | -- 17.md --
422 | A proper ~strikethrough~.
423 | -- 17.html --
424 | A proper strikethrough.
425 | -- 18.md --
426 | These are ~not strikethroughs.
427 |
428 | No, they are not~
429 |
430 | This ~is ~ legit~ isn't ~ legit.
431 |
432 | This is not ~~~~~one~~~~~ huge strikethrough.
433 |
434 | ~one~ ~~two~~ ~~~three~~~
435 |
436 | No ~mismatch~~
437 | -- 18.html --
438 | These are ~not strikethroughs.
439 | No, they are not~
440 | This is ~ legit isn't ~ legit.
441 | This is not ~~~~~one~~~~~ huge strikethrough.
442 | one two ~~~three~~~
443 | No ~mismatch~~
444 |
--------------------------------------------------------------------------------
/parse.go:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package markdown
6 |
7 | import (
8 | "strings"
9 | )
10 |
// A blockBuilder accumulates the input lines of one open block
// and produces the finished Block when the block is closed.
type blockBuilder interface {
	// extend offers the line s to the block.
	// The boolean result reports whether the block remains open.
	// The returned line is presumably what is left of s for further
	// processing — confirm against the caller, which is outside this view.
	extend(p *parser, s line) (line, bool)

	// build returns the finished Block. closeBlock calls it when the
	// block is removed from the parser's stack.
	build(*parser) Block
}
15 |
// An openBlock is one entry in the parser's stack of blocks under construction.
type openBlock struct {
	builder blockBuilder // builder that receives lines for this block
	inner   []Block      // finished blocks nested in this one; doneBlock appends to it
	pos     Position     // position of the block; addBlock sets StartLine and EndLine to the current line
}
21 |
22 | func (p *parser) last() Block {
23 | ob := &p.stack[len(p.stack)-1]
24 | return ob.inner[len(ob.inner)-1]
25 | }
26 |
27 | func (p *parser) deleteLast() {
28 | ob := &p.stack[len(p.stack)-1]
29 | ob.inner = ob.inner[:len(ob.inner)-1]
30 | }
31 |
// A rootBuilder builds the Document. Parser.parse adds one as the first
// block on the stack, so it is the outermost open block.
type rootBuilder struct{}

// build returns the Document made from the root block's position,
// its inner blocks, and the parser's link table.
func (b *rootBuilder) build(p *parser) Block {
	return &Document{p.pos(), p.blocks(), p.links}
}
37 |
// A Parser is a Markdown parser.
// The exported fields in the struct can be filled in before calling
// [Parser.Parse] in order to customize the details of the parsing process.
// A Parser is safe for concurrent use by multiple goroutines.
type Parser struct {
	// HeadingID determines whether the parser accepts
	// the {#hdr} syntax for an HTML id="hdr" attribute on headings.
	// For example, if HeadingID is true then the Markdown
	//	## Overview {#overview}
	// will render as the HTML
	//	<h2 id="overview">Overview</h2>
	HeadingID bool

	// Strikethrough determines whether the parser accepts
	// ~abc~ and ~~abc~~ as strikethrough syntax, producing
	// <del>abc</del> in HTML.
	Strikethrough bool

	// TaskList determines whether the parser accepts
	// “task list items” as defined in GitHub Flavored Markdown.
	// When a list item begins with the plain text [ ] or [x]
	// that turns into an unchecked or checked check box.
	TaskList bool

	// TODO: document.
	// NOTE(review): judging only by the names, these control automatic
	// linking of plain-text URLs and whether a missing scheme is taken
	// to be http — confirm against the inline parser.
	AutoLinkText       bool
	AutoLinkAssumeHTTP bool

	// TODO: document.
	// NOTE(review): presumably enables GitHub Flavored Markdown tables;
	// the table test data is run with {"Table": true} — confirm.
	Table bool

	// TODO: document.
	// NOTE(review): presumably enables :name: emoji syntax — confirm.
	Emoji bool

	// TODO: document.
	// NOTE(review): presumably these enable typographic replacement of
	// dots, dashes and quotes respectively — confirm.
	SmartDot   bool
	SmartDash  bool
	SmartQuote bool

	// TODO: document.
	// NOTE(review): presumably enables [^label] footnote syntax — confirm.
	Footnote bool
}
80 |
// A parser holds the state of a single parse.
// Parser.parse creates a fresh one for every call.
type parser struct {
	*Parser // options in effect for this parse

	corner bool // noticed corner case to ignore in cross-implementation testing

	root      *Document        // document returned by parse; NOTE(review): assigned outside the code visible here
	links     map[string]*Link // link table handed to the Document by rootBuilder
	lineno    int              // number of the current input line; parse increments it before each addLine
	stack     []openBlock      // blocks under construction; the last element is the innermost
	lineDepth int              // stack index used by curB, nextB, addBlock and doneBlock; parse starts it at -1
	lineInfo

	// texts to apply inline processing to
	texts []textRaw

	footnotes map[string]*Footnote // presumably footnotes keyed by label — confirm

	// inline parsing
	s       string
	emitted int // s[:emitted] has been emitted into list
	list    []Inline

	backticks backtickParser

	fixups []func() // callbacks registered by addFixup; parse runs them after inline processing
}
107 |
// addFixup registers f to be called by Parser.parse after all inline
// text has been processed. Fixups run in the order they were added.
func (p *parser) addFixup(f func()) {
	p.fixups = append(p.fixups, f)
}
111 |
// lineInfo is embedded in parser. Going by the field comments, each flag
// records that a particular terminator does not occur on the current line.
// NOTE(review): presumably a cache for HTML scanning — confirm against its users.
type lineInfo struct {
	noDeclEnd     bool // no > on line
	noCommentEnd  bool // no --> on line
	noProcInstEnd bool // no ?> on line
	noCDATAEnd    bool // no ]]> on line
}
118 |
// A textRaw pairs a Text node with the raw Markdown it was created from.
// Parser.parse later runs inline processing on raw and stores the result
// in the Text's Inline field.
type textRaw struct {
	*Text
	raw string // unprocessed Markdown source for the Text
}
123 |
124 | func (p *parser) newText(pos Position, text string) *Text {
125 | b := &Text{Position: pos}
126 | p.texts = append(p.texts, textRaw{b, text})
127 | return b
128 | }
129 |
130 | func (p *parser) blocks() []Block {
131 | b := &p.stack[len(p.stack)-1]
132 | return b.inner
133 | }
134 |
135 | func (p *parser) pos() Position {
136 | b := &p.stack[len(p.stack)-1]
137 | return b.pos
138 | }
139 |
140 | func (p *Parser) Parse(text string) *Document {
141 | d, _ := p.parse(text)
142 | return d
143 | }
144 |
145 | func (p *Parser) parse(text string) (d *Document, corner bool) {
146 | var ps parser
147 | ps.Parser = p
148 | if strings.Contains(text, "\x00") {
149 | text = strings.ReplaceAll(text, "\x00", "\uFFFD")
150 | ps.corner = true // goldmark does not replace NUL
151 | }
152 |
153 | ps.lineDepth = -1
154 | ps.addBlock(&rootBuilder{})
155 | for text != "" {
156 | end := 0
157 | for end < len(text) && text[end] != '\n' && text[end] != '\r' {
158 | end++
159 | }
160 | ln := text[:end]
161 | text = text[end:]
162 | nl := byte(0)
163 | switch {
164 | case len(text) >= 2 && text[0] == '\r' && text[1] == '\n':
165 | nl = '\r' + '\n'
166 | text = text[2:]
167 | case len(text) >= 1:
168 | nl = text[0]
169 | text = text[1:]
170 | }
171 | ps.lineno++
172 | ps.addLine(makeLine(ln, nl))
173 | }
174 | ps.trimStack(0)
175 |
176 | for _, t := range ps.texts {
177 | t.Inline = ps.inline(t.raw)
178 | }
179 |
180 | for _, f := range ps.fixups {
181 | f()
182 | }
183 |
184 | // TODO move into its own function
185 | var fixBlock func(Block)
186 |
187 | fixBlocks := func(blocks []Block) []Block {
188 | keep := blocks[:0]
189 | for _, b := range blocks {
190 | fixBlock(b)
191 | if _, ok := b.(*Empty); ok {
192 | continue
193 | }
194 | keep = append(keep, b)
195 | }
196 | return keep
197 | }
198 |
199 | fixBlock = func(x Block) {
200 | switch x := x.(type) {
201 | case *Document:
202 | x.Blocks = fixBlocks(x.Blocks)
203 | case *Quote:
204 | x.Blocks = fixBlocks(x.Blocks)
205 | case *List:
206 | for _, item := range x.Items {
207 | fixBlock(item)
208 | }
209 | case *Item:
210 | x.Blocks = fixBlocks(x.Blocks)
211 | }
212 | }
213 |
214 | fixBlock(ps.root)
215 |
216 | return ps.root, ps.corner
217 | }
218 |
219 | func (p *parser) curB() blockBuilder {
220 | if p.lineDepth < len(p.stack) {
221 | return p.stack[p.lineDepth].builder
222 | }
223 | return nil
224 | }
225 |
226 | func (p *parser) nextB() blockBuilder {
227 | if p.lineDepth+1 < len(p.stack) {
228 | return p.stack[p.lineDepth+1].builder
229 | }
230 | return nil
231 | }
232 | func (p *parser) trimStack(depth int) {
233 | if len(p.stack) < depth {
234 | // unreachable
235 | panic("trimStack")
236 | }
237 | for len(p.stack) > depth {
238 | p.closeBlock()
239 | }
240 | }
241 |
242 | func (p *parser) addBlock(c blockBuilder) {
243 | p.trimStack(p.lineDepth + 1)
244 | p.stack = append(p.stack, openBlock{})
245 | ob := &p.stack[len(p.stack)-1]
246 | ob.builder = c
247 | ob.pos.StartLine = p.lineno
248 | ob.pos.EndLine = p.lineno
249 | }
250 |
251 | func (p *parser) doneBlock(b Block) {
252 | p.trimStack(p.lineDepth + 1)
253 | ob := &p.stack[len(p.stack)-1]
254 | ob.inner = append(ob.inner, b)
255 | }
256 |
257 | func (p *parser) para() *paraBuilder {
258 | if b, ok := p.stack[len(p.stack)-1].builder.(*paraBuilder); ok {
259 | return b
260 | }
261 | return nil
262 | }
263 |
264 | func (p *parser) closeBlock() Block {
265 | b := &p.stack[len(p.stack)-1]
266 | if b.builder == nil {
267 | println("closeBlock", len(p.stack)-1)
268 | }
269 | blk := b.builder.build(p)
270 | p.stack = p.stack[:len(p.stack)-1]
271 | if len(p.stack) > 0 {
272 | b := &p.stack[len(p.stack)-1]
273 | b.inner = append(b.inner, blk)
274 | // _ = b
275 | } else {
276 | p.root = blk.(*Document)
277 | }
278 | return blk
279 | }
280 |
281 | func (p *parser) link(label string) *Link {
282 | return p.links[label]
283 | }
284 |
285 | func (p *parser) defineLink(label string, link *Link) {
286 | if p.links == nil {
287 | p.links = make(map[string]*Link)
288 | }
289 | p.links[label] = link
290 | }
291 |
292 | func (p *parser) addLine(s line) {
293 | // Process continued prefixes.
294 | p.lineDepth = 0
295 | for ; p.lineDepth+1 < len(p.stack); p.lineDepth++ {
296 | old := s
297 | var ok bool
298 | s, ok = p.stack[p.lineDepth+1].builder.extend(p, s)
299 | // Note: s != old is efficient only because s.text is either the same string (same pointer, len)
300 | // as old.text or has a different length or is empty; either way so there is no actual data comparison.
301 | // Sometimes s.text = "" and there is still
302 | if (ok || s != old) && !old.isBlank() {
303 | p.stack[p.lineDepth+1].pos.EndLine = p.lineno
304 | }
305 | if !ok {
306 | break
307 | }
308 | }
309 |
310 | if s.isBlank() {
311 | p.trimStack(p.lineDepth + 1)
312 | return
313 | }
314 |
315 | // Process new prefixes, if any.
316 | Prefixes:
317 | // Start new block inside p.stack[depth].
318 | for _, fn := range starters {
319 | if l, ok := fn(p, s); ok {
320 | s = l
321 | if s.isBlank() {
322 | return
323 | }
324 | p.lineDepth++
325 | goto Prefixes
326 | }
327 | }
328 |
329 | startParagraph(p, s)
330 | }
331 |
332 | func (c *rootBuilder) extend(p *parser, s line) (line, bool) {
333 | // unreachable
334 | panic("root extend")
335 | }
336 |
// A starter tries to start a new block at the beginning of the given line.
// If it succeeds it returns the line that addLine should continue with
// and true; otherwise it returns false.
type starter func(*parser, line) (line, bool)
338 |
// starters lists the block starters that addLine tries, in this order,
// on the part of a line left over after continuing the open blocks.
var starters = []starter{
	startIndentedCodeBlock,
	startFencedCodeBlock,
	startBlockQuote,
	startATXHeading,
	startSetextHeading,
	startThematicBreak,
	startListItem,
	startHTMLBlock,
	startFootnote,
}
350 |
--------------------------------------------------------------------------------
/table.go:
--------------------------------------------------------------------------------
1 | // Copyright 2023 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package markdown
6 |
7 | import (
8 | "strings"
9 | "unicode/utf8"
10 | )
11 |
// A Table is a [Block] representing a [table], a GitHub-flavored Markdown extension.
//
// [table]: https://github.github.com/gfm/#tables-extension-
type Table struct {
	Position
	Header []*Text   // header row (slice of columns)
	Align  []string  // alignment for columns: "left", "center", "right"; "" for unset
	Rows   [][]*Text // data rows (slices of columns, not necessarily all same width)
}

// Block marks Table as a Block; it does nothing.
func (*Table) Block() {}
23 |
24 | func (t *Table) printHTML(p *printer) {
25 | p.html("\n")
26 | p.html("\n")
27 | p.html("\n")
28 | for i, hdr := range t.Header {
29 | p.html("")
34 | hdr.printHTML(p)
35 | p.html(" \n")
36 | }
37 | p.html(" \n")
38 | p.html("\n")
39 | if len(t.Rows) > 0 {
40 | p.html("\n")
41 | for _, row := range t.Rows {
42 | p.html("\n")
43 | for i, cell := range row {
44 | p.html("")
49 | cell.printHTML(p)
50 | p.html(" \n")
51 | }
52 | p.html(" \n")
53 | }
54 | p.html("\n")
55 | }
56 | p.html("
\n")
57 | }
58 |
59 | func (t *Table) printMarkdown(p *printer) {
60 | // TODO: double-check this
61 | // inline all Text values in Header and Rows to
62 | // get final, rendered widths
63 | var (
64 | hdr = make([]string, len(t.Header))
65 | rows = make([][]string, 0, len(t.Rows))
66 | maxWidths = make([]int, len(t.Header))
67 |
68 | xb = &printer{}
69 | xs string
70 | )
71 |
72 | toString := func(txt *Text) string {
73 | xb.buf.Reset()
74 | txt.printMarkdown(xb)
75 | return strings.TrimSpace(xb.buf.String())
76 | }
77 |
78 | for i, txt := range t.Header {
79 | xs = toString(txt)
80 | hdr[i] = xs
81 | maxWidths[i] = utf8.RuneCountInString(xs)
82 | }
83 |
84 | for _, row := range t.Rows {
85 | xrow := make([]string, len(hdr))
86 | for j := range t.Header {
87 | xs = toString(row[j])
88 | xrow[j] = xs
89 | if n := utf8.RuneCountInString(xs); n > maxWidths[j] {
90 | maxWidths[j] = n
91 | }
92 | }
93 | rows = append(rows, xrow)
94 | }
95 |
96 | p.maybeQuoteNL('|')
97 | for i, cell := range hdr {
98 | p.WriteString("| ")
99 | pad(p, cell, t.Align[i], maxWidths[i])
100 | p.WriteString(" ")
101 | }
102 | p.WriteString("|")
103 |
104 | p.nl()
105 | for i, a := range t.Align {
106 | w := maxWidths[i]
107 | p.WriteString("| ")
108 | switch a {
109 | case "left":
110 | p.WriteString(":")
111 | repeat(p, '-', w-1)
112 | case "center":
113 | p.WriteString(":")
114 | repeat(p, '-', w-2)
115 | p.WriteString(":")
116 | case "right":
117 | repeat(p, '-', w-1)
118 | p.WriteString(":")
119 | default:
120 | repeat(p, '-', w)
121 | }
122 | p.WriteString(" ")
123 | }
124 | p.WriteString("|")
125 |
126 | for _, row := range rows {
127 | p.nl()
128 | for i := range t.Header {
129 | p.WriteString("| ")
130 | pad(p, row[i], t.Align[i], maxWidths[i])
131 | p.WriteString(" ")
132 | }
133 | p.WriteString("|")
134 | }
135 | }
136 |
137 | // repeat prints c n times to p.
138 | func repeat(p *printer, c byte, n int) {
139 | for i := 0; i < n; i++ {
140 | p.WriteByte(c)
141 | }
142 | }
143 |
144 | // pad prints text to p aligned according to align,
145 | // aiming for a width of w runes.
146 | // It can happen that multiple runes appear as a single “character”,
147 | // which will break the alignment, but this is the best we can do for now.
148 | func pad(p *printer, text, align string, w int) {
149 | n := w - utf8.RuneCountInString(text)
150 | switch align {
151 | default:
152 | p.WriteString(text)
153 | repeat(p, ' ', n)
154 | case "right":
155 | repeat(p, ' ', n)
156 | p.WriteString(text)
157 | case "center":
158 | repeat(p, ' ', n/2)
159 | p.WriteString(text)
160 | repeat(p, ' ', n-n/2)
161 | }
162 | }
163 |
// A tableTrimmed is a table row with the outer pipes (if any) removed,
// as produced by tableTrimOuter.
// It is a separate type to avoid accidentally trimming the outer pipes multiple times,
// which would instead discard outer empty cells.
type tableTrimmed string
168 |
// isTableSpace reports whether c counts as a space in table syntax:
// space, tab, vertical tab, or form feed.
func isTableSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\v', '\f':
		return true
	}
	return false
}
173 |
174 | // tableTrimSpace returns s with table space prefixes and suffixes removed.
175 | func tableTrimSpace(s string) string {
176 | i := 0
177 | for i < len(s) && isTableSpace(s[i]) {
178 | i++
179 | }
180 | j := len(s)
181 | for j > i && isTableSpace(s[j-1]) {
182 | j--
183 | }
184 | return s[i:j]
185 | }
186 |
187 | // tableTrimOuter trims the outer | |, if any, from the row.
188 | func tableTrimOuter(row string) tableTrimmed {
189 | row = tableTrimSpace(row)
190 | if len(row) > 0 && row[0] == '|' {
191 | row = row[1:]
192 | }
193 | if len(row) > 0 && row[len(row)-1] == '|' {
194 | row = row[:len(row)-1]
195 | }
196 | return tableTrimmed(row)
197 | }
198 |
199 | // isTableStart reports whether the pair of lines hdr1, delim1
200 | // are a valid table start.
201 | func isTableStart(hdr1, delim1 string) bool {
202 | // Scan potential delimiter string, counting columns.
203 | // This happens on every line of text,
204 | // so make it relatively quick - nothing expensive.
205 | col := 0
206 | delim := tableTrimOuter(delim1)
207 | i := 0
208 | for ; ; col++ {
209 | for i < len(delim) && isTableSpace(delim[i]) {
210 | i++
211 | }
212 | if i >= len(delim) {
213 | break
214 | }
215 | if i < len(delim) && delim[i] == ':' {
216 | i++
217 | }
218 | if i >= len(delim) || delim[i] != '-' {
219 | return false
220 | }
221 | i++
222 | for i < len(delim) && delim[i] == '-' {
223 | i++
224 | }
225 | if i < len(delim) && delim[i] == ':' {
226 | i++
227 | }
228 | for i < len(delim) && isTableSpace(delim[i]) {
229 | i++
230 | }
231 | if i < len(delim) && delim[i] == '|' {
232 | i++
233 | }
234 | }
235 |
236 | if tableTrimSpace(hdr1) == "|" {
237 | // https://github.com/github/cmark-gfm/pull/127 and
238 | // https://github.com/github/cmark-gfm/pull/128
239 | // fixed a buffer overread by rejecting | by itself as a table line.
240 | // That seems to violate the “spec”, but we will play along.
241 | return false
242 | }
243 |
244 | return col == tableCount(tableTrimOuter(hdr1))
245 | }
246 |
247 | // tableCount returns the number of columns in the row.
248 | func tableCount(row tableTrimmed) int {
249 | col := 1
250 | prev := byte(0)
251 | for i := 0; i < len(row); i++ {
252 | c := row[i]
253 | if c == '|' && prev != '\\' {
254 | col++
255 | }
256 | prev = c
257 | }
258 | return col
259 | }
260 |
// A tableBuilder is a [blockBuilder] for a [Table].
// It collects the table's lines with their outer pipes already trimmed;
// the cells are split and parsed when the table is built.
type tableBuilder struct {
	hdr   tableTrimmed   // header line
	delim tableTrimmed   // delimiter line
	rows  []tableTrimmed // data lines
}
267 |
268 | // start starts the builder with the given header and delimiter lines.
269 | func (b *tableBuilder) start(hdr, delim string) {
270 | b.hdr = tableTrimOuter(hdr)
271 | b.delim = tableTrimOuter(delim)
272 | }
273 |
274 | // addRow adds a new row to the table.
275 | func (b *tableBuilder) addRow(row string) {
276 | b.rows = append(b.rows, tableTrimOuter(row))
277 | }
278 |
279 | // build returns the [Table] for this tableBuilder.
280 | func (b *tableBuilder) build(p *parser) Block {
281 | pos := p.pos()
282 | pos.StartLine-- // builder does not count header
283 | pos.EndLine = pos.StartLine + 1 + len(b.rows)
284 | t := &Table{
285 | Position: pos,
286 | }
287 | width := tableCount(b.hdr)
288 | t.Header = b.parseRow(p, b.hdr, pos.StartLine, width)
289 | t.Align = b.parseAlign(b.delim, width)
290 | t.Rows = make([][]*Text, len(b.rows))
291 | for i, row := range b.rows {
292 | t.Rows[i] = b.parseRow(p, row, pos.StartLine+2+i, width)
293 | }
294 | return t
295 | }
296 |
297 | // parseRow TODO explain
298 | func (b *tableBuilder) parseRow(p *parser, row tableTrimmed, line int, width int) []*Text {
299 | out := make([]*Text, 0, width)
300 | pos := Position{StartLine: line, EndLine: line}
301 | start := 0
302 | unesc := nop
303 | for i := 0; i < len(row); i++ {
304 | c := row[i]
305 | if c == '\\' && i+1 < len(row) && row[i+1] == '|' {
306 | unesc = tableUnescape
307 | i++
308 | continue
309 | }
310 | if c == '|' {
311 | out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:i]), " \t\v\f"))))
312 | if len(out) == width {
313 | // Extra cells are discarded!
314 | return out
315 | }
316 | start = i + 1
317 | unesc = nop
318 | }
319 | }
320 | out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:]), " \t\v\f"))))
321 | for len(out) < width {
322 | // Missing cells are considered empty.
323 | out = append(out, p.newText(pos, ""))
324 | }
325 | return out
326 | }
327 |
// nop returns text unchanged. It is the unescape function used for
// table cells that contain no escaped pipes.
func nop(text string) string {
	unchanged := text
	return unchanged
}
331 |
// tableUnescape returns text with every \| replaced by |.
// Backslashes not followed by | are left alone.
func tableUnescape(text string) string {
	buf := make([]byte, 0, len(text))
	for len(text) > 0 {
		if len(text) >= 2 && text[0] == '\\' && text[1] == '|' {
			buf = append(buf, '|')
			text = text[2:]
			continue
		}
		buf = append(buf, text[0])
		text = text[1:]
	}
	return string(buf)
}
345 |
346 | // parseAlign TODO
347 | func (b *tableBuilder) parseAlign(delim tableTrimmed, n int) []string {
348 | align := make([]string, 0, tableCount(delim))
349 | start := 0
350 | for i := 0; i < len(delim); i++ {
351 | if delim[i] == '|' {
352 | align = append(align, tableAlign(string(delim[start:i])))
353 | start = i + 1
354 | }
355 | }
356 | align = append(align, tableAlign(string(delim[start:])))
357 | return align
358 | }
359 |
360 | // tableAlign TODO
361 | func tableAlign(cell string) string {
362 | cell = tableTrimSpace(cell)
363 | l := cell[0] == ':'
364 | r := cell[len(cell)-1] == ':'
365 | switch {
366 | case l && r:
367 | return "center"
368 | case l:
369 | return "left"
370 | case r:
371 | return "right"
372 | }
373 | return ""
374 | }
375 |
--------------------------------------------------------------------------------
/list.go:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package markdown
6 |
7 | import (
8 | "fmt"
9 | "strconv"
10 | )
11 |
12 | // TODO should Item implement Block?
13 | // maybe make a itemBlock internal Block for use with the builders?
14 |
// A List is a [Block] representing a [list],
// either an unordered (bullet) list
// or an ordered (numbered) list.
//
// Lists can be [loose or tight], which controls the spacing between list items.
// In Markdown, a list is loose when there is a blank line
// between any two list items, or when any list item
// directly contains two blocks that are separated by a blank line.
// (Note that because paragraphs must be separated by blank lines,
// any multi-paragraph item necessarily creates a loose list.)
// When rendering HTML, loose list items are formatted in the usual way.
// For tight lists, a list item consisting of a single paragraph omits
// the <p>...</p> tags around the paragraph text.
//
// [list]: https://spec.commonmark.org/0.31.2/#lists
// [loose or tight]: https://spec.commonmark.org/0.31.2/#loose
type List struct {
	Position

	// Bullet is the bullet character used in the list: '-', '+', or '*'.
	// For an ordered list, Bullet is the character following the number: '.' or ')'.
	Bullet rune

	// Start is the number of the first item in an ordered list.
	Start int

	// Loose indicates whether the list is loose.
	// (See the [List] doc comment for details.)
	Loose bool

	// Items is the list's items.
	// TODO: Should this be []*Item or Blocks?
	Items []Block // always *Item
}

// Block marks List as a Block; it does nothing.
func (*List) Block() {}
51 |
52 | // Ordered reports whether the list is ordered (numbered).
53 | func (l *List) Ordered() bool {
54 | return l.Bullet == '.' || l.Bullet == ')'
55 | }
56 |
// An Item is a [Block] representing a [list item].
//
// [list item]: https://spec.commonmark.org/0.31.2/#list-items
type Item struct {
	Position

	// Blocks is the item content.
	Blocks []Block
}

// Block marks Item as a Block; it does nothing.
func (*Item) Block() {}
68 |
69 | func (b *List) printHTML(p *printer) {
70 | if b.Bullet == '.' || b.Bullet == ')' {
71 | p.html("\n")
76 | } else {
77 | p.html("\n")
78 | }
79 | for _, item := range b.Items {
80 | item.printHTML(p)
81 | }
82 | if b.Bullet == '.' || b.Bullet == ')' {
83 | p.html("
\n")
84 | } else {
85 | p.html("\n")
86 | }
87 | }
88 |
89 | func (b *Item) printHTML(p *printer) {
90 | p.html("")
91 | if len(b.Blocks) > 0 {
92 | if _, ok := b.Blocks[0].(*Text); !ok {
93 | p.WriteString("\n")
94 | }
95 | }
96 | for i, c := range b.Blocks {
97 | c.printHTML(p)
98 | if i+1 < len(b.Blocks) {
99 | if _, ok := c.(*Text); ok {
100 | p.WriteString("\n")
101 | }
102 | }
103 | }
104 | p.html(" \n")
105 | }
106 |
107 | func (b *List) printMarkdown(p *printer) {
108 | old := p.listOut
109 | defer func() {
110 | p.listOut = old
111 | }()
112 | p.bullet = b.Bullet
113 | p.num = b.Start
114 | if b.Loose {
115 | p.loose++
116 | } else {
117 | p.tight++
118 | }
119 | p.maybeNL()
120 | for i, item := range b.Items {
121 | if i > 0 {
122 | p.nl()
123 | if b.Loose {
124 | p.nl()
125 | }
126 | }
127 | item.printMarkdown(p)
128 | p.num++
129 | }
130 | }
131 |
132 | func (b *Item) printMarkdown(p *printer) {
133 | var marker string
134 | if p.bullet == '.' || p.bullet == ')' {
135 | marker = fmt.Sprintf(" %d%c ", p.num, p.bullet)
136 | } else {
137 | marker = fmt.Sprintf(" %c ", p.bullet)
138 | }
139 | p.WriteString(marker)
140 | n := len(marker)
141 | if n > 4 {
142 | n = 4
143 | }
144 | defer p.pop(p.push(" "[:n]))
145 | printMarkdownBlocks(b.Blocks, p)
146 | }
147 |
// A listBuilder is a [blockBuilder] for a [List].
type listBuilder struct {
	// List fields
	bullet rune
	start  int

	// item is the builder for the current item.
	// It is set when the item's block is added and cleared when the item is built.
	item *itemBuilder

	// todo, if non-nil, is the pending second half of startListItem:
	// it adds the new item's block and returns the line with the
	// list marker trimmed. startListItem sets it, and the next call to
	// startListItem with this list as the current block runs and clears it.
	todo func() line
}
160 |
// An itemBuilder is a [blockBuilder] for an [Item].
type itemBuilder struct {
	list        *listBuilder // list containing item
	width       int          // indentation consumed by the item's marker, as counted by startListItem; listBuilder.extend trims this much from continuation lines
	haveContent bool         // whether any non-blank content has been seen in the item
}
167 |
// startListItem is a [starter] for a list item.
// The first list item in a list also starts the list itself.
//
// Starting an item takes two calls. The first call recognizes the list
// marker, opens a new list if the item does not continue the one already
// open below the current depth, stores the rest of the work in list.todo,
// and returns the line unchanged. addLine then descends one level, which
// makes the list the current block, and runs the starters again. The second
// call sees the pending list.todo, runs it to open the item block, and
// returns the line with the marker trimmed.
func startListItem(p *parser, s line) (_ line, _ bool) {
	// Second call: finish the item begun by the previous call.
	if list, ok := p.curB().(*listBuilder); ok && list.todo != nil {
		s = list.todo()
		list.todo = nil
		return s, true
	}

	// Work on a copy t; s is returned untouched.
	// n counts the width of the marker including its indentation.
	t := s
	n := 0
	// Up to three leading spaces may precede the marker.
	for i := 0; i < 3; i++ {
		if !t.trimSpace(1, 1, false) {
			break
		}
		n++
	}
	bullet := t.peek()
	var num int
Switch:
	switch bullet {
	default:
		return
	case '-', '*', '+':
		t.trim(bullet)
		n++
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		// Ordered list marker: at most nine digits followed by '.' or ')'.
		for j := t.i; ; j++ {
			if j >= len(t.text) {
				return
			}
			c := t.text[j]
			if c == '.' || c == ')' {
				// success
				bullet = c
				j++
				n += j - t.i
				t.i = j
				break Switch
			}
			if c < '0' || '9' < c {
				return
			}
			if j-t.i >= 9 {
				return
			}
			num = num*10 + int(c) - '0'
		}

	}
	// The marker must be followed by a space.
	// NOTE(review): the final true argument presumably lets the end of
	// the line count as that space — confirm against line.trimSpace.
	if !t.trimSpace(1, 1, true) {
		return
	}
	n++
	// Up to three more spaces are taken as part of the marker's width,
	// but only if a further space cannot be trimmed after them.
	// NOTE(review): presumably because content indented that far is an
	// indented code block inside the item — confirm against the
	// CommonMark list item rules.
	tt := t
	m := 0
	for i := 0; i < 3 && tt.trimSpace(1, 1, false); i++ {
		m++
	}
	if !tt.trimSpace(1, 1, true) {
		n += m
		t = tt
	}

	// Pretty sure we have a list item now.

	var list *listBuilder
	if c, ok := p.nextB().(*listBuilder); ok {
		list = c
	}
	// A different bullet character starts a new list.
	if list == nil || list.bullet != rune(bullet) {
		// “When the first list item in a list interrupts a paragraph—that is,
		// when it starts on a line that would otherwise count as
		// paragraph continuation text—then (a) the lines Ls must
		// not begin with a blank line,
		// and (b) if the list item is ordered, the start number must be 1.”
		if list == nil && p.para() != nil && (t.isBlank() || (bullet == '.' || bullet == ')') && num != 1) {
			// Goldmark and Dingus both seem to get this wrong
			// (or the words above don't mean what we think they do).
			// when the paragraph that could be continued
			// is inside a block quote.
			// See testdata/extra.txt 117.md.
			p.corner = true
			return
		}
		list = &listBuilder{bullet: rune(bullet), start: num}
		p.addBlock(list)
	}
	b := &itemBuilder{list: list, width: n, haveContent: !t.isBlank()}
	list.todo = func() line {
		p.addBlock(b)
		list.item = b
		return t
	}

	// Return s, not t: the trimmed line t is delivered by list.todo
	// on the second call, once the list is the current block.
	return s, true
}
267 |
268 | func (c *listBuilder) extend(p *parser, s line) (line, bool) {
269 | // TODO explain
270 | item := c.item
271 | if item == nil && s.isBlank() { // TODO how can this happen
272 | return s, true
273 | }
274 |
275 | // If we can trim the indentation required by the current item,
276 | // do that and return true, allowing s to be passed to the
277 | // item builder.
278 | if item != nil && s.trimSpace(item.width, item.width, true) {
279 | return s, true
280 | }
281 | return s, false
282 | }
283 |
284 | func (c *itemBuilder) extend(p *parser, s line) (line, bool) {
285 | blank := s.isBlank()
286 |
287 | // If there is a blank line and no content so far,
288 | // the item is over. TODO explain
289 | if blank && !c.haveContent {
290 | return s, false
291 | }
292 |
293 | // TODO explain
294 | if blank {
295 | // Goldmark does this and apparently commonmark.js too.
296 | // Not sure why it is necessary.
297 | return line{}, true
298 | }
299 |
300 | // TODO explain
301 | if !blank {
302 | c.haveContent = true
303 | }
304 | return s, true
305 | }
306 |
307 | func (b *itemBuilder) build(p *parser) Block {
308 | b.list.item = nil
309 | return &Item{p.pos(), p.blocks()}
310 | }
311 |
312 | func (b *listBuilder) build(p *parser) Block {
313 | blocks := p.blocks()
314 | pos := p.pos()
315 |
316 | // list can have wrong pos b/c extend dance.
317 | // TODO explain
318 | pos.EndLine = blocks[len(blocks)-1].Pos().EndLine
319 |
320 | // Decide whether list is loose.
321 | loose := false
322 | Loose:
323 | for i, c := range blocks {
324 | c := c.(*Item)
325 | if i+1 < len(blocks) {
326 | if blocks[i+1].Pos().StartLine-c.EndLine > 1 {
327 | loose = true
328 | break Loose
329 | }
330 | }
331 | for j, d := range c.Blocks {
332 | endLine := d.Pos().EndLine
333 | if j+1 < len(c.Blocks) {
334 | if c.Blocks[j+1].Pos().StartLine-endLine > 1 {
335 | loose = true
336 | break Loose
337 | }
338 | }
339 | }
340 | }
341 |
342 | if !loose {
343 | // TODO: rethink whether this is correct.
344 | // Perhaps the blocks should still be Paragraph
345 | // and we just skip over the during formatting?
346 | // Then Text might not need to be a Block.
347 | for _, c := range blocks {
348 | c := c.(*Item)
349 | for i, d := range c.Blocks {
350 | if p, ok := d.(*Paragraph); ok {
351 | c.Blocks[i] = p.Text
352 | }
353 | }
354 | }
355 | }
356 |
357 | x := &List{
358 | pos,
359 | b.bullet,
360 | b.start,
361 | loose,
362 | p.blocks(),
363 | }
364 | listCorner(p, x)
365 | if p.TaskList {
366 | p.addFixup(func() {
367 | parseTaskList(p, x)
368 | })
369 | }
370 | return x
371 | }
372 |
373 | // listCorner checks whether list contains any corner cases
374 | // that other implementations mishandle, and if so sets p.corner.
375 | func listCorner(p *parser, list *List) {
376 | for _, item := range list.Items {
377 | item := item.(*Item)
378 | if len(item.Blocks) == 0 {
379 | // Goldmark mishandles what follows; see testdata/extra.txt 111.md.
380 | p.corner = true
381 | return
382 | }
383 | switch item.Blocks[0].(type) {
384 | case *List, *ThematicBreak, *CodeBlock:
385 | // Goldmark mishandles a list with various block items inside it.
386 | p.corner = true
387 | return
388 | }
389 | }
390 | }
391 |
392 | // GitHub task list extension
393 |
// A Task is an [Inline] for a [task list item marker] (a checkbox),
// a GitHub-flavored Markdown extension.
//
// [task list item marker]: https://github.github.com/gfm/#task-list-items-extension-
type Task struct {
	Checked bool // whether the marker is [x] or [X] rather than [ ]
}

// Inline marks Task as an Inline; it does nothing.
func (*Task) Inline() {}
403 |
404 | func (x *Task) printHTML(p *printer) {
405 | p.html(" `)
410 | }
411 |
412 | func (x *Task) printMarkdown(p *printer) {
413 | if x.Checked {
414 | p.text(`[x] `)
415 | } else {
416 | p.text(`[ ] `)
417 | }
418 | }
419 |
420 | func (x *Task) printText(p *printer) {
421 | // Unreachable: printText is only used to render the
422 | // alt text of an image, which can only contain inlines,
423 | // and while Task is an inline, it only appears inside
424 | // lists, and a list cannot appear in an alt text.
425 | // Even so, maybe someone will make malformed syntax trees.
426 | x.printMarkdown(p)
427 | }
428 |
429 | // taskList checks whether any items in list begin with task list markers.
430 | // If so, it replaces the markers with [Task]s.
431 | func parseTaskList(p *parser, list *List) {
432 | for _, item := range list.Items {
433 | item := item.(*Item)
434 | if len(item.Blocks) == 0 {
435 | continue
436 | }
437 | var text *Text
438 | switch b := item.Blocks[0].(type) {
439 | default:
440 | continue
441 | case *Paragraph:
442 | text = b.Text
443 | case *Text:
444 | text = b
445 | }
446 | if len(text.Inline) < 1 {
447 | // unreachable with standard parser
448 | continue
449 | }
450 | pl, ok := text.Inline[0].(*Plain)
451 | if !ok {
452 | continue
453 | }
454 | s := pl.Text
455 | if len(s) < 4 || s[0] != '[' || s[2] != ']' || (s[1] != ' ' && s[1] != 'x' && s[1] != 'X') {
456 | continue
457 | }
458 | if s[3] != ' ' && s[3] != '\t' {
459 | p.corner = true // goldmark does not require the space
460 | continue
461 | }
462 | text.Inline = append([]Inline{&Task{Checked: s[1] == 'x' || s[1] == 'X'},
463 | &Plain{Text: s[len("[x] "):]}}, text.Inline[1:]...)
464 | }
465 | }
466 |
--------------------------------------------------------------------------------
/testdata/autoext.txt:
--------------------------------------------------------------------------------
1 | -- parser.json --
2 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true}
3 | -- gfm622.md --
4 | www.commonmark.org
5 | -- gfm622.html --
6 |
7 | -- gfm623.md --
8 | Visit www.commonmark.org/help for more information.
9 | -- gfm623.html --
10 | Visit www.commonmark.org/help for more information.
11 | -- gfm624.md --
12 | Visit www.commonmark.org.
13 |
14 | Visit www.commonmark.org/a.b.
15 | -- gfm624.html --
16 | Visit www.commonmark.org.
17 | Visit www.commonmark.org/a.b.
18 | -- gfm625.md --
19 | www.google.com/search?q=Markup+(business)
20 |
21 | www.google.com/search?q=Markup+(business)))
22 |
23 | (www.google.com/search?q=Markup+(business))
24 |
25 | (www.google.com/search?q=Markup+(business)
26 | -- gfm625.html --
27 | www.google.com/search?q=Markup+(business)
28 | www.google.com/search?q=Markup+(business)))
29 | (www.google.com/search?q=Markup+(business))
30 | (www.google.com/search?q=Markup+(business)
31 | -- gfm626.md --
32 | www.google.com/search?q=(business))+ok
33 | -- gfm626.html --
34 | www.google.com/search?q=(business))+ok
35 | -- gfm627.md --
36 | www.google.com/search?q=commonmark&hl=en
37 |
38 | www.google.com/search?q=commonmark&hl;
39 | -- gfm627.html --
40 | www.google.com/search?q=commonmark&hl=en
41 | www.google.com/search?q=commonmark&hl;
42 | -- gfm628.md --
43 | www.commonmark.org/hewww.commonmark.org/he<lp
46 | -- gfm629.md --
47 | http://commonmark.org
48 |
49 | (Visit http://encrypted.google.com/search?q=Markup+(business))
50 | -- gfm629.html --
51 |
52 | (Visit http://encrypted.google.com/search?q=Markup+(business))
53 | -- gfm630.md --
54 | foo@bar.baz
55 | -- gfm630.html --
56 |
57 | -- gfm631.md --
58 | hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
59 | -- gfm631.html --
60 | hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
61 | -- gfm632.md --
62 | a.b-c_d@a.b
63 |
64 | a.b-c_d@a.b.
65 |
66 | a.b-c_d@a.b-
67 |
68 | a.b-c_d@a.b_
69 | -- gfm632.html --
70 |
71 |
72 | a.b-c_d@a.b-
73 | a.b-c_d@a.b_
74 | -- gfm633.md --
75 | mailto:foo@bar.baz
76 |
77 | mailto:a.b-c_d@a.b
78 |
79 | mailto:a.b-c_d@a.b.
80 |
81 | mailto:a.b-c_d@a.b/
82 |
83 | mailto:a.b-c_d@a.b-
84 |
85 | mailto:a.b-c_d@a.b_
86 |
87 | xmpp:foo@bar.baz
88 |
89 | xmpp:foo@bar.baz.
90 | -- gfm633.html --
91 |
92 |
93 |
94 |
95 | mailto:a.b-c_d@a.b-
96 | mailto:a.b-c_d@a.b_
97 |
98 |
99 | -- gfm634.md --
100 | xmpp:foo@bar.baz/txt
101 |
102 | xmpp:foo@bar.baz/txt@bin
103 |
104 | xmpp:foo@bar.baz/txt@bin.com
105 | -- gfm634.html --
106 |
107 |
108 |
109 | -- gfm635.md --
110 | xmpp:foo@bar.baz/txt/bin
111 | -- gfm635.html --
112 |
113 | -- 1.md --
114 | xhttp://go.dev y z
115 | αhttp://go.dev y z
116 | -- 1.html --
117 | xhttp://go.dev y z
118 | αhttp://go.dev y z
119 | -- 1a.md --
120 | xhttps://go.dev y z
121 | αhttps://go.dev y z
122 | -- 1a.html --
123 | xhttps://go.dev y z
124 | αhttps://go.dev y z
125 | -- 2.md --
126 | cannot follow ascii letter
127 | xhttp://go.dev y z
128 | x0http://go.dev
129 | αhttp://go.dev
130 | -- 2.html --
131 | cannot follow ascii letter
132 | xhttp://go.dev y z
133 | x0http://go.dev
134 | αhttp://go.dev
135 | -- 3.md --
136 | deviations - github would include the suffixes in the URLs
137 | www.go.dev@def.ghi is my email
138 | www.go.dev!wtf
139 | -- 3.html --
140 | deviations - github would include the suffixes in the URLs
141 | www.go.dev@def.ghi is my email
142 | www.go.dev!wtf
143 | -- 4.md --
144 | trimming
145 | www.google.com/search?q=Markup+(business)))
146 | -- 4.html --
147 | trimming
148 | www.google.com/search?q=Markup+(business)))
149 | -- 5.md --
150 | www.google.com/search?q=Markup+(business))).
151 | -- 5.html --
152 | www.google.com/search?q=Markup+(business))).
153 | -- 6.md --
154 | www.google.com/search?q=Markup+(business).
155 | -- 6.html --
156 | www.google.com/search?q=Markup+(business).
157 | -- 7.md --
158 | www.google.com/search?q=Markup+)()((business)
159 | -- 7.html --
160 | www.google.com/search?q=Markup+)()((business)
161 | -- 8.md --
162 | www.google.com/search?q=commonmark&hl;
163 | -- 8.html --
164 | www.google.com/search?q=commonmark&hl;
165 | -- 9.md --
166 | www.google.com/search?q=commonmark&hl;)
167 | -- 9.html --
168 | www.google.com/search?q=commonmark&hl;)
169 | -- 10.md --
170 | www.google.com/search?q=(commonmark&hl;)
171 | -- 10.html --
172 | www.google.com/search?q=(commonmark&hl;)
173 | -- 11.md --
174 | www.google.com/search?q=commonmark)&hl;
175 | -- 11.html --
176 | www.google.com/search?q=commonmark)&hl;
177 | -- 12.md --
178 | www.google.com/search?q=commonmark).&hl;
179 | -- 12.html --
180 | www.google.com/search?q=commonmark).&hl;
181 | -- 13.md --
182 | www.google.com/search?q=commonmark).&hl
183 | -- 13.html --
184 | www.google.com/search?q=commonmark).&hl
185 | -- 14.md --
186 | www.google.com/search?q=commonmark).&hl
187 | -- 14.html --
188 | www.google.com/search?q=commonmark).&hl
189 | -- 15.md --
190 | www.goo-gle.com/search
191 | -- 15.html --
192 |
193 | -- 16.md --
194 | www.goo_gle.com/search
195 | -- 16.html --
196 | www.goo_gle.com/search
197 | -- 17.md --
198 | www.foo_bar.google.com/search
199 | -- 17.html --
200 |
201 | -- 18.md --
202 | www./search
203 | -- 18.html --
204 |
205 | -- 19.md --
206 | www.google.com.foo_bar/search
207 | -- 19.html --
208 | www.google.com.foo_bar/search
209 | -- 20.md --
210 | www.search
211 | -- 20.html --
212 |
213 | -- 21.md --
214 | www.
215 | -- 21.html --
216 | www.
217 | -- 21a.md --
218 | www.!search
219 | -- 21a.html --
220 | www.!search
221 | -- 22.md --
222 | www.sea_rch
223 | -- 22.html --
224 | www.sea_rch
225 | -- 23.md --
226 | http://!search
227 | -- 23.html --
228 | http://!search
229 | -- 24.md --
230 | http://!search
231 | -- 24.html --
232 | http://!search
233 | -- 25.md --
234 | http://search
235 | -- 25.html --
236 |
237 | -- 26.md --
238 | https://search
239 | -- 26.html --
240 |
241 | -- 27.md --
242 | http://sea_rch
243 | -- 27.html --
244 | http://sea_rch
245 | -- 28.md --
246 | https://sea_rch
247 | -- 28.html --
248 | https://sea_rch
249 | -- 29.md --
250 | http://sea_rch.x
251 | -- 29.html --
252 | http://sea_rch.x
253 | -- 30.md --
254 | https://sea_rch.x
255 | -- 30.html --
256 | https://sea_rch.x
257 | -- 31.md --
258 | http://sea_rch.x.y
259 | -- 31.html --
260 |
261 | -- 32.md --
262 | http://sea_rch.x.y.http://www.google.com
263 | -- 32.html --
264 | http://sea_rch.x.y.http://www.google.com
265 | -- 33.md --
266 | http://sea_rch.http://www.google.com
267 | -- 33.html --
268 | http://sea_rch.http://www.google.com
269 | -- 34.md --
270 | _abc_@ghi.def is my email
271 | -- 34.html --
272 | abc@ghi.def is my email
273 | -- 35.md --
274 | _abc@ghi_.def is my email
275 | -- 35.html --
276 | abc@ghi.def is my email
277 | -- 36.md --
278 | `hello`abc@def.ghi is my email
279 | -- 36.html --
280 | helloabc@def.ghi is my email
281 | -- 37.md --
282 | `hello` abc@def.ghi is my email
283 | -- 37.html --
284 | hello abc@def.ghi is my email
285 | -- 38.md --
286 | *hello*abc@def.ghi is my email
287 | -- 38.html --
288 | helloabc@def.ghi is my email
289 | -- 39.md --
290 | [link](link)abc@def.ghi is my email
291 | -- 39.html --
292 | linkabc@def.ghi is my email
293 | -- 40.md --
294 | \!abc@def.ghi is my email
295 | -- 40.html --
296 | !abc@def.ghi is my email
297 | -- 41.md --
298 | $abc@def.ghi is my email
299 | -- 41.html --
300 | $abc@def.ghi is my email
301 | -- 42.md --
302 | www.go.dev@def.ghi is my email
303 | -- 42.html --
304 | www.go.dev@def.ghi is my email
305 | -- 43.md --
306 | abc@www.go.dev is my email
307 | -- 43.html --
308 | abc@www.go.dev is my email
309 | -- 44.md --
310 | αabc@def.ghi
311 | -- 44.html --
312 |
313 | -- 45.md --
314 | https://web.site:8080/~matloob
315 | -- 45.html --
316 | https://web.site:8080/~matloob
317 | -- parser.json --
318 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true, "Strikethrough": true}
319 | -- 46.md --
320 | https://web.site:8080/~matloob
321 | -- 46.html --
322 | https://web.site:8080/~matloob
323 | -- parser.json --
324 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true}
325 | -- 47.md --
326 | https://web.site:8080/*matlo_ob
327 | -- 47.html --
328 | https://web.site:8080/*matlo_ob
329 | -- parser.json --
330 | {"AutoLinkText": true, "Strikethrough": true}
331 | -- 48.md --
332 | *user@dom.org*
333 | -- 48.html --
334 |
335 | -- 49.md --
336 | **user@dom.org**
337 | -- 49.html --
338 |
339 | -- 50.md --
340 | ~~user@dom.org~~
341 | -- 50.html --
342 |
343 | -- 51.md --
344 | www.google.com/search?q=cmark&-hl;
345 | -- 51.html --
346 | www.google.com/search?q=cmark&-hl;
347 | -- 52.md --
348 | foo@.bar
349 | -- 52.html --
350 |
351 | -- 53.md --
352 | foo@..bar
353 | -- 53.html --
354 | foo@..bar
355 | -- 54.md --
356 | mailto:none
357 | mailto:none#
358 | -- 54.html --
359 | mailto:none
360 | mailto:none#
361 | -- 55.md --
362 | xmpp:none
363 | xmpp:none#
364 | xmpp:foo@..bar
365 | -- 55.html --
366 | xmpp:none
367 | xmpp:none#
368 | xmpp:foo@..bar
369 |
--------------------------------------------------------------------------------
/md_test.go:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package markdown
6 |
7 | import (
8 | "bytes"
9 | "encoding/json"
10 | "flag"
11 | "fmt"
12 | "go/token"
13 | "io"
14 | "net/url"
15 | "os"
16 | "path/filepath"
17 | "reflect"
18 | "strings"
19 | "testing"
20 |
21 | "github.com/yuin/goldmark"
22 | gext "github.com/yuin/goldmark/extension"
23 | gparser "github.com/yuin/goldmark/parser"
24 | ghtml "github.com/yuin/goldmark/renderer/html"
25 | "golang.org/x/tools/txtar"
26 | )
27 |
// goldmarkFlag enables the optional "goldmark/NAME" subtests in TestToHTML,
// which render each test input with Goldmark and compare the result with
// the expected HTML. It is off by default.
var goldmarkFlag = flag.Bool("goldmark", false, "run goldmark tests")
29 |
// roundTripFailures lists, by full subtest name, the TestToHTML cases whose
// Format → Parse → ToHTML round trip is known not to reproduce the expected
// HTML. TestToHTML tolerates a round-trip mismatch only for these names, and
// fails with "no longer failing" if a listed case starts to round-trip
// correctly, so that stale entries get removed.
// The trailing comment on each entry is a short reason for the failure.
var roundTripFailures = map[string]bool{
	"TestToHTML/extra/13":  true, // indentation of tag
	"TestToHTML/extra/75":  true, // weird list
	"TestToHTML/extra/76":  true, // weird list
	"TestToHTML/extra/115": true, // weird list

	"TestToHTML/gfm_ext/9":  true, // table
	"TestToHTML/gfm_ext/11": true, // table

	"TestToHTML/spec0.29/19":  true, // thematic break
	"TestToHTML/spec0.29/40":  true, // indentation of heading
	"TestToHTML/spec0.29/51":  true, // newline in heading
	"TestToHTML/spec0.29/52":  true, // newline in heading
	"TestToHTML/spec0.29/57":  true, // setext heading
	"TestToHTML/spec0.29/63":  true, // setext heading
	"TestToHTML/spec0.29/65":  true, // newline in heading
	"TestToHTML/spec0.29/171": true, // link ref def
	"TestToHTML/spec0.29/208": true, // weird list
	"TestToHTML/spec0.29/227": true, // weird list
	"TestToHTML/spec0.29/241": true, // weird list
	"TestToHTML/spec0.29/282": true, // weird list
	"TestToHTML/spec0.29/283": true, // weird list
	"TestToHTML/spec0.29/312": true, // escape plain
	"TestToHTML/spec0.29/323": true, // escape plain
	"TestToHTML/spec0.29/324": true, // escape plain
	"TestToHTML/spec0.29/325": true, // escape plain
	"TestToHTML/spec0.29/326": true, // escape plain
	"TestToHTML/spec0.29/327": true, // escape plain
	"TestToHTML/spec0.29/331": true, // backtick spaces
	"TestToHTML/spec0.29/349": true, // backticks
	"TestToHTML/spec0.29/502": true, // escape quotes

	"TestToHTML/spec0.30/26":  true, // escape plain
	"TestToHTML/spec0.30/37":  true, // escape plain
	"TestToHTML/spec0.30/38":  true, // escape plain
	"TestToHTML/spec0.30/39":  true, // escape plain
	"TestToHTML/spec0.30/40":  true, // escape plain
	"TestToHTML/spec0.30/41":  true, // escape plain
	"TestToHTML/spec0.30/49":  true, // thematic break
	"TestToHTML/spec0.30/70":  true, // indentation of heading
	"TestToHTML/spec0.30/81":  true, // newline in heading
	"TestToHTML/spec0.30/82":  true, // newline in heading
	"TestToHTML/spec0.30/87":  true, // setext heading
	"TestToHTML/spec0.30/93":  true, // setext heading
	"TestToHTML/spec0.30/95":  true, // newline in heading
	"TestToHTML/spec0.30/202": true, // link ref def
	"TestToHTML/spec0.30/238": true, // weird list
	"TestToHTML/spec0.30/257": true, // weird list
	"TestToHTML/spec0.30/271": true, // weird list
	"TestToHTML/spec0.30/312": true, // weird list
	"TestToHTML/spec0.30/313": true, // weird list
	"TestToHTML/spec0.30/331": true, // backtick spaces
	"TestToHTML/spec0.30/349": true, // backticks
	"TestToHTML/spec0.30/505": true, // escape quotes

	"TestToHTML/spec0.31.2/26":  true, // escape plain
	"TestToHTML/spec0.31.2/37":  true, // escape plain
	"TestToHTML/spec0.31.2/38":  true, // escape plain
	"TestToHTML/spec0.31.2/39":  true, // escape plain
	"TestToHTML/spec0.31.2/40":  true, // escape plain
	"TestToHTML/spec0.31.2/41":  true, // escape plain
	"TestToHTML/spec0.31.2/49":  true, // thematic break
	"TestToHTML/spec0.31.2/70":  true, // indentation of heading
	"TestToHTML/spec0.31.2/81":  true, // newline in heading
	"TestToHTML/spec0.31.2/82":  true, // newline in heading
	"TestToHTML/spec0.31.2/87":  true, // setext heading
	"TestToHTML/spec0.31.2/93":  true, // setext heading
	"TestToHTML/spec0.31.2/95":  true, // newline in heading
	"TestToHTML/spec0.31.2/202": true, // link ref def
	"TestToHTML/spec0.31.2/238": true, // weird list
	"TestToHTML/spec0.31.2/257": true, // weird list
	"TestToHTML/spec0.31.2/271": true, // weird list
	"TestToHTML/spec0.31.2/312": true, // weird list
	"TestToHTML/spec0.31.2/313": true, // weird list
	"TestToHTML/spec0.31.2/331": true, // backtick spaces
	"TestToHTML/spec0.31.2/349": true, // backticks
	"TestToHTML/spec0.31.2/506": true, // escape quotes

	"TestToHTML/table/gfm200": true, // table
	"TestToHTML/table/2":      true, // table
}
111 |
112 | func TestToHTML(t *testing.T) {
113 | files, err := filepath.Glob("testdata/*.txt")
114 | if err != nil {
115 | t.Fatal(err)
116 | }
117 | for _, file := range files {
118 | if strings.HasSuffix(file, "_fmt.txt") {
119 | continue
120 | }
121 | t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) {
122 | a, err := txtar.ParseFile(file)
123 | if err != nil {
124 | t.Fatal(err)
125 | }
126 |
127 | var p Parser
128 | var ncase, npass int
129 | for i := 0; i+2 <= len(a.Files); {
130 | if a.Files[i].Name == "parser.json" {
131 | p = parseParser(t, a.Files[i].Data)
132 | i++
133 | continue
134 | }
135 | ncase++
136 | md := a.Files[i]
137 | html := a.Files[i+1]
138 | i += 2
139 | name := strings.TrimSuffix(md.Name, ".md")
140 | if name != strings.TrimSuffix(html.Name, ".html") {
141 | t.Fatalf("mismatched file pair: %s and %s", md.Name, html.Name)
142 | }
143 |
144 | t.Run(name, func(t *testing.T) {
145 | doc := p.Parse(decode(string(md.Data)))
146 | h := encode(ToHTML(doc))
147 | if h != string(html.Data) {
148 | q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
149 | t.Fatalf("input %q\nparse:\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, dump(doc), h, html.Data, q, q)
150 | }
151 |
152 | // Make sure unexported types like emphPlain don't leak into result.
153 | if x, ok := findUnexported(reflect.ValueOf(doc)); ok {
154 | t.Fatalf("input %q\nparse:\n%s\nfound parsed value of unexported type %s", md.Data, dump(doc), x.Type())
155 | }
156 |
157 | // Make sure Format preserves the HTML.
158 | md1 := Format(doc)
159 | doc1 := p.Parse(md1)
160 | h1 := encode(ToHTML(doc1))
161 | if h1 != string(html.Data) && !roundTripFailures[t.Name()] {
162 | q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
163 | t.Fatalf("input %q\nreformat %q\n%s\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, md1, dump(doc), dump(doc1), h1, html.Data, q, q)
164 | }
165 | if h1 == string(html.Data) && roundTripFailures[t.Name()] {
166 | t.Fatalf("no longer failing")
167 | }
168 |
169 | npass++
170 | })
171 |
172 | if !*goldmarkFlag {
173 | continue
174 | }
175 | t.Run("goldmark/"+name, func(t *testing.T) {
176 | in := decode(string(md.Data))
177 | _, corner := p.parse(in)
178 | if corner {
179 | t.Skip("known corner case")
180 | }
181 | gm := goldmarkParser(&p)
182 | var buf bytes.Buffer
183 | if err := gm.Convert([]byte(in), &buf); err != nil {
184 | t.Fatal(err)
185 | }
186 | if buf.Len() > 0 && buf.Bytes()[buf.Len()-1] != '\n' {
187 | buf.WriteByte('\n')
188 | }
189 | want := decode(string(html.Data))
190 | want = strings.ReplaceAll(want, " />", ">")
191 | out := buf.String()
192 | out = strings.ReplaceAll(out, " />", ">")
193 | q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
194 | if out != want {
195 | t.Fatalf("\n - input: ``%q``\n - output: ``%q``\n - golden: ``%q``\n - [dingus](https://spec.commonmark.org/dingus/?text=%s)\n - [github](https://github.com/rsc/tmp/issues/new?body=%s)", in, out, want, q, q)
196 | }
197 | npass++
198 |
199 | })
200 | }
201 | t.Logf("%d/%d pass", npass, ncase)
202 | })
203 | }
204 | }
205 |
206 | func goldmarkParser(p *Parser) goldmark.Markdown {
207 | opts := []goldmark.Option{
208 | goldmark.WithRendererOptions(ghtml.WithUnsafe()),
209 | }
210 | if p.HeadingID {
211 | opts = append(opts, goldmark.WithParserOptions(gparser.WithHeadingAttribute()))
212 | }
213 | if p.Strikethrough {
214 | opts = append(opts, goldmark.WithExtensions(gext.Strikethrough))
215 | }
216 | if p.TaskList {
217 | opts = append(opts, goldmark.WithExtensions(gext.TaskList))
218 | }
219 | if p.AutoLinkText {
220 | opts = append(opts, goldmark.WithExtensions(gext.Linkify))
221 | }
222 | if p.Table {
223 | opts = append(opts, goldmark.WithExtensions(gext.Table))
224 | }
225 | return goldmark.New(opts...)
226 | }
227 |
// decode converts the escaped text used in testdata files back to raw text.
// The replacements are applied one after another, in this order:
//
//	"^J\n" → "\n"   (protects trailing spaces and tabs)
//	"^M"   → "\r"
//	"^D\n" → ""     (marks text with no final newline)
//	"^@"   → "\x00"
func decode(s string) string {
	for _, sub := range [...][2]string{
		{"^J\n", "\n"},
		{"^M", "\r"},
		{"^D\n", ""},
		{"^@", "\x00"},
	} {
		s = strings.ReplaceAll(s, sub[0], sub[1])
	}
	return s
}
235 |
// encode converts raw text to the escaped form used in testdata files,
// making carriage returns, trailing spaces and tabs, NUL bytes, and a
// missing final newline visible. The replacements are applied one after
// another, in order; finally "^D\n" is appended to non-empty text that
// does not end in a newline.
func encode(s string) string {
	for _, sub := range [...][2]string{
		{"\r\n", "^M\n"},
		{"\r", "^M^D\n"},
		{" \n", " ^J\n"},
		{"\t\n", "\t^J\n"},
		{"\x00", "^@"},
	} {
		s = strings.ReplaceAll(s, sub[0], sub[1])
	}
	if s == "" || strings.HasSuffix(s, "\n") {
		return s
	}
	return s + "^D\n"
}
247 |
248 | func parseParser(t *testing.T, data []byte) Parser {
249 | d := json.NewDecoder(bytes.NewReader(data))
250 | d.DisallowUnknownFields()
251 | var p Parser
252 | err := d.Decode(&p)
253 | if err != nil {
254 | t.Fatalf("reading parser.json: %v", err)
255 | }
256 | err = d.Decode(new(json.RawMessage))
257 | if err != io.EOF {
258 | t.Fatalf("junk on end of parser.json")
259 | }
260 | return p
261 | }
262 |
263 | func TestFormat(t *testing.T) {
264 | files, err := filepath.Glob(filepath.Join("testdata", "*_fmt.txt"))
265 | if err != nil {
266 | t.Fatal(err)
267 | }
268 | for _, file := range files {
269 | t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) {
270 | a, err := txtar.ParseFile(file)
271 | if err != nil {
272 | t.Fatal(err)
273 | }
274 | var p Parser
275 | for i := 0; i < len(a.Files); {
276 | if a.Files[i].Name == "parser.json" {
277 | p = parseParser(t, a.Files[i].Data)
278 | i++
279 | continue
280 | }
281 | // Each test case is a single markdown document that should render either as itself,
282 | // or if followed by a file named "want", then by that file.
283 | name := a.Files[i].Name
284 | in := a.Files[i].Data
285 | wantb := in
286 | i++
287 | if i < len(a.Files) && a.Files[i].Name == "want" {
288 | wantb = a.Files[i].Data
289 | i++
290 | }
291 | t.Run(name, func(t *testing.T) {
292 | doc := p.Parse(decode(string(in)))
293 | want := decode(string(wantb))
294 | docWant := p.Parse(want)
295 | if ToHTML(doc) != ToHTML(docWant) {
296 | t.Errorf("bad testdata: input and want are different markdown documents:\ninput:\n%s\n\nwant:\n%s", dump(doc), dump(docWant))
297 | }
298 | h := Format(doc)
299 | h = encode(h)
300 | if h != want {
301 | t.Errorf("input %q\nparse: \n%s\nhave %q\nwant %q", in, dump(doc), h, want)
302 | }
303 | })
304 | }
305 | })
306 | }
307 |
308 | // Files ending in ".md" should render as themselves.
309 | files, err = filepath.Glob(filepath.Join("testdata", "*.md"))
310 | if err != nil {
311 | t.Fatal(err)
312 | }
313 | for _, file := range files {
314 | t.Run(strings.TrimSuffix(filepath.Base(file), ".md"), func(t *testing.T) {
315 | data, err := os.ReadFile(file)
316 | if err != nil {
317 | t.Fatal(err)
318 | }
319 | w := string(data)
320 | var p Parser
321 | doc := p.Parse(w)
322 | h := Format(doc)
323 | if h != w {
324 | t.Errorf("have:\n%s\nwant:\n%s", h, w)
325 | outfile := file + ".have"
326 | t.Logf("writing have to %s", outfile)
327 | if err := os.WriteFile(outfile, []byte(h), 0666); err != nil {
328 | t.Fatal(err)
329 | }
330 | }
331 | })
332 | }
333 | }
334 |
335 | func TestInline(t *testing.T) {
336 | // Test that these don't crash,
337 | // and also "cover" the bodies.
338 | new(HardBreak).Inline()
339 | new(SoftBreak).Inline()
340 | new(HTMLTag).Inline()
341 | new(Plain).Inline()
342 | new(Code).Inline()
343 | new(Strong).Inline()
344 | new(Del).Inline()
345 | new(Emph).Inline()
346 | new(Emoji).Inline()
347 | new(AutoLink).Inline()
348 | new(Link).Inline()
349 | new(Image).Inline()
350 | new(Task).Inline()
351 | }
352 |
// findUnexported walks v looking for evidence of unexported API leaking
// into a value. It reports the first value found, in depth-first order,
// that either
//
//   - has a named type declared in a package (non-empty PkgPath) whose
//     name is not exported, or
//   - is a struct containing an unexported field (the struct value itself
//     is returned).
//
// Interfaces and pointers are followed when non-nil; struct fields and
// slice or array elements are visited in order. If nothing is found the
// result is (v, false).
func findUnexported(v reflect.Value) (reflect.Value, bool) {
	typ := v.Type()
	if typ.PkgPath() != "" && !token.IsExported(typ.Name()) {
		return v, true
	}

	switch v.Kind() {
	case reflect.Interface, reflect.Pointer:
		if v.IsNil() {
			break
		}
		if found, ok := findUnexported(v.Elem()); ok {
			return found, true
		}

	case reflect.Struct:
		for i, n := 0, typ.NumField(); i < n; i++ {
			if !typ.Field(i).IsExported() {
				return v, true
			}
			if found, ok := findUnexported(v.Field(i)); ok {
				return found, true
			}
		}

	case reflect.Slice, reflect.Array:
		for i, n := 0, v.Len(); i < n; i++ {
			if found, ok := findUnexported(v.Index(i)); ok {
				return found, true
			}
		}
	}
	return v, false
}
382 |
// Reflection types that printb compares struct field types against
// when deciding how to print each field.
var (
	blockType   = reflect.TypeOf(new(Block)).Elem()   // the Block interface type
	blocksType  = reflect.TypeOf(new([]Block)).Elem() // the type []Block
	inlinesType = reflect.TypeOf(new(Inlines)).Elem() // the type Inlines
)
388 |
389 | func printb(buf *bytes.Buffer, b Block, prefix string) {
390 | fmt.Fprintf(buf, "(%T", b)
391 | v := reflect.ValueOf(b)
392 | v = reflect.Indirect(v)
393 | if v.Kind() != reflect.Struct {
394 | fmt.Fprintf(buf, " %v", b)
395 | }
396 | t := v.Type()
397 | for i := 0; i < t.NumField(); i++ {
398 | tf := t.Field(i)
399 | if !tf.IsExported() {
400 | continue
401 | }
402 | if tf.Type == inlinesType {
403 | printis(buf, v.Field(i).Interface().(Inlines))
404 | } else if tf.Type.Kind() == reflect.Slice && tf.Type.Elem().Kind() == reflect.String {
405 | fmt.Fprintf(buf, " %s:%q", tf.Name, v.Field(i))
406 | } else if tf.Type != blocksType && !tf.Type.Implements(blockType) && tf.Type.Kind() != reflect.Slice {
407 | fmt.Fprintf(buf, " %s:%v", tf.Name, v.Field(i))
408 | }
409 | }
410 |
411 | prefix += "\t"
412 | for i := 0; i < t.NumField(); i++ {
413 | tf := t.Field(i)
414 | if !tf.IsExported() {
415 | continue
416 | }
417 | if tf.Type.Implements(blockType) {
418 | fmt.Fprintf(buf, "\n%s", prefix)
419 | printb(buf, v.Field(i).Interface().(Block), prefix)
420 | } else if tf.Type == blocksType {
421 | vf := v.Field(i)
422 | for i := 0; i < vf.Len(); i++ {
423 | fmt.Fprintf(buf, "\n%s", prefix)
424 | printb(buf, vf.Index(i).Interface().(Block), prefix)
425 | }
426 | } else if tf.Type.Kind() == reflect.Slice && tf.Type != inlinesType && tf.Type.Elem().Kind() != reflect.String {
427 | fmt.Fprintf(buf, "\n%s%s:", prefix, t.Field(i).Name)
428 | printslice(buf, v.Field(i), prefix)
429 | }
430 | }
431 | fmt.Fprintf(buf, ")")
432 | }
433 |
434 | func printslice(buf *bytes.Buffer, v reflect.Value, prefix string) {
435 | if v.Type().Elem().Kind() == reflect.Slice {
436 | for i := 0; i < v.Len(); i++ {
437 | fmt.Fprintf(buf, "\n%s#%d:", prefix, i)
438 | printslice(buf, v.Index(i), prefix+"\t")
439 | }
440 | return
441 | }
442 | for i := 0; i < v.Len(); i++ {
443 | fmt.Fprintf(buf, " ")
444 | printb(buf, v.Index(i).Interface().(Block), prefix+"\t")
445 | }
446 | }
447 |
448 | func printi(buf *bytes.Buffer, in Inline) {
449 | fmt.Fprintf(buf, "%T(", in)
450 | v := reflect.ValueOf(in).Elem()
451 | label := v.FieldByName("Label")
452 | if label.IsValid() {
453 | fmt.Fprintf(buf, "%q", label)
454 | }
455 | text := v.FieldByName("Text")
456 | if text.IsValid() {
457 | fmt.Fprintf(buf, "%q", text)
458 | }
459 | inner := v.FieldByName("Inner")
460 | if inner.IsValid() {
461 | printis(buf, inner.Interface().(Inlines))
462 | }
463 | buf.WriteString(")")
464 | }
465 |
466 | func printis(buf *bytes.Buffer, ins []Inline) {
467 | for _, in := range ins {
468 | buf.WriteByte(' ')
469 | printi(buf, in)
470 | }
471 | }
472 |
473 | func dump(b Block) string {
474 | var buf bytes.Buffer
475 | printb(&buf, b, "")
476 | return buf.String()
477 | }
478 |
--------------------------------------------------------------------------------
/html.go:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package markdown
6 |
7 | import (
8 | "strconv"
9 | "strings"
10 | "unicode"
11 | )
12 |
// An HTMLBlock is a [Block] representing an [HTML block].
//
// The block's content is kept as the raw source lines; printHTML emits
// each line followed by a newline.
//
// [HTML block]: https://spec.commonmark.org/0.31.2/#html-blocks
type HTMLBlock struct {
	Position
	// TODO should these be 'Text string'?
	Text []string // lines, without trailing newlines
}
21 |
// Block is an empty marker method; it gives *HTMLBlock a Block method.
func (*HTMLBlock) Block() {}
23 |
24 | func (b *HTMLBlock) printHTML(p *printer) {
25 | for _, s := range b.Text {
26 | p.html(s)
27 | p.html("\n")
28 | }
29 | }
30 |
31 | func (b *HTMLBlock) printMarkdown(p *printer) {
32 | p.maybeNL()
33 | for i, line := range b.Text {
34 | if i > 0 {
35 | p.nl()
36 | }
37 | p.WriteString(line)
38 | p.noTrim()
39 | }
40 | }
41 |
// An htmlBuilder is a [blockBuilder] for an [HTMLBlock].
// If endBlank is true, the block ends immediately before the first blank line.
// If endFunc is non-nil, the block ends immediately after the first line
// for which endFunc returns true.
//
// The starters in this file append the block's first line to text
// themselves; extend appends each following line.
type htmlBuilder struct {
	endBlank bool              // end the block before the first blank line
	endFunc  func(string) bool // if non-nil, reports whether a line ends the block
	text     []string          // accumulated text, one entry per line
}
51 |
52 | func (c *htmlBuilder) extend(p *parser, s line) (line, bool) {
53 | if c.endBlank && s.isBlank() {
54 | return s, false
55 | }
56 | t := s.string()
57 | c.text = append(c.text, t)
58 | if c.endFunc != nil && c.endFunc(t) {
59 | return line{}, false
60 | }
61 | return line{}, true
62 | }
63 |
64 | func (c *htmlBuilder) build(p *parser) Block {
65 | return &HTMLBlock{
66 | p.pos(),
67 | c.text,
68 | }
69 | }
70 |
// An HTMLTag is an [Inline] representing a [raw HTML tag].
//
// [raw HTML tag]: https://spec.commonmark.org/0.31.2/#raw-html
type HTMLTag struct {
	// Text is the tag's source text. The tag parsers in this file set it
	// to the input from the opening < through the closing >.
	Text string // TODO rename to HTML?
}
77 |
// Inline is an empty marker method; it gives *HTMLTag an Inline method.
func (*HTMLTag) Inline() {}
79 |
// printHTML passes the tag's text to p.html.
func (x *HTMLTag) printHTML(p *printer) {
	p.html(x.Text)
}
83 |
84 | func (x *HTMLTag) printMarkdown(p *printer) {
85 | // TODO are there newlines? probably not
86 | for i, line := range strings.Split(x.Text, "\n") {
87 | if i > 0 {
88 | p.nl()
89 | }
90 | p.WriteString(line)
91 | p.noTrim()
92 | }
93 | }
94 |
// printText does nothing: this method writes no output for an HTML tag.
func (x *HTMLTag) printText(p *printer) {}
96 |
97 | // startHTMLBlock is a [starter] for an [HTMLBlock].
98 | //
99 | // See https://spec.commonmark.org/0.31.2/#html-blocks.
100 | func startHTMLBlock(p *parser, s line) (line, bool) {
101 | // Early out: block must start with a <.
102 | tt := s
103 | tt.trimSpace(0, 3, false) // TODO figure out trimSpace final argument
104 | if tt.peek() != '<' {
105 | return s, false
106 | }
107 | t := tt.string()
108 |
109 | // Check all 7 block types.
110 | if startHTMLBlock1(p, s, t) ||
111 | startHTMLBlock2345(p, s, t) ||
112 | startHTMLBlock6(p, s, t) ||
113 | startHTMLBlock7(p, s, t) {
114 | return line{}, true
115 | }
116 |
117 | return s, false
118 | }
119 |
// forceLower is the bit that distinguishes ASCII upper-case letters from
// lower-case ones: ASCII letter | forceLower == ASCII lower-case.
// ORing changes some non-letter bytes too, so results are only meaningful
// when compared against lower-case ASCII letters.
const forceLower = 0x20
121 |
122 | // startHTMLBlock1 handles HTML block type 1:
123 | // line starting with or .
125 | //
126 | // s is the entire line, for saving if starting a block.
127 | // t is the line as a string, with leading spaces removed; it starts with <.
128 | func startHTMLBlock1(p *parser, s line, t string) bool {
129 | if len(t) < 2 {
130 | return false
131 | }
132 | if c := t[1] | forceLower; c != 'p' && c != 's' && c != 't' { // early out; check first letter
133 | return false
134 | }
135 | i := 2
136 | for i < len(t) && (t[i] != ' ' && t[i] != '\t' && t[i] != '>') {
137 | i++
138 | }
139 | if !isBlock1Tag(t[1:i]) {
140 | return false
141 | }
142 | b := &htmlBuilder{endFunc: endBlock1}
143 | p.addBlock(b)
144 | b.text = append(b.text, s.string())
145 | if endBlock1(t) {
146 | p.closeBlock()
147 | }
148 | return true
149 | }
150 |
151 | // endBlock1 reports whether the string contains
152 | // , , , or ,
153 | // using ASCII case-insensitive matching.
154 | func endBlock1(s string) bool {
155 | start := -1
156 | for i := 0; i < len(s); i++ {
157 | if s[i] == '<' && i+1 < len(s) && s[i+1] == '/' {
158 | start = i + 2
159 | }
160 | if s[i] == '>' && start >= 0 {
161 | if isBlock1Tag(s[start:i]) {
162 | return true
163 | }
164 | start = -1
165 | }
166 | }
167 | return false
168 | }
169 |
170 | // isBlock1Tag reports whether tag is a tag that can open or close
171 | // HTML block type 1.
172 | func isBlock1Tag(tag string) bool {
173 | return lowerEq(tag, "pre") || lowerEq(tag, "script") || lowerEq(tag, "style") || lowerEq(tag, "textarea")
174 | }
175 |
176 | // lowerEq reports whether strings.ToLower(s) == lower
177 | // assuming lower is entirely ASCII lower-case letters.
178 | func lowerEq(s, lower string) bool {
179 | if len(s) != len(lower) {
180 | return false
181 | }
182 | lower = lower[:len(s)]
183 | for i := 0; i < len(s); i++ {
184 | if s[i]|forceLower != lower[i] {
185 | return false
186 | }
187 | }
188 | return true
189 | }
190 |
191 | // startHTMLBlock2345 handles HTML blocks types 2, 3, 4, and 5,
192 | // the ones that start and end a specific string constant.
193 | //
194 | // s is the entire line, for saving if starting a block.
195 | // t is the line as a string, with leading spaces removed; it starts with <.
196 | func startHTMLBlock2345(p *parser, s line, t string) bool {
197 | var end string
198 | switch {
199 | default:
200 | return false
201 |
202 | // type 2: , or or because of simplistic parsing.
203 | case strings.HasPrefix(t, ""
205 |
206 | // type 3: ... ?>, or > because of simplistic parsing.
207 | case strings.HasPrefix(t, ""): // type 3
208 | end = "?>"
209 |
210 | // type 4:
211 | case strings.HasPrefix(t, ""
213 |
214 | // type 5:
215 | // The spec says nothing about requiring a leading upper-case letter,
216 | // only that it should be an ASCII letter, but cmark-gfm, Goldmark,
217 | // and the Dingus all require upper-case, so we do too.
218 | // Presumably this is because the actual goal is to recognize the few
219 | // XML definitions that can appear, and they are all upper-case.
220 | // The result is that is an HTMLBlock but is an HTMLTag.
221 | // That's inconsistent, but Markdown is full of them, so we prioritize
222 | // consistency with all the existing implementations.
223 | case strings.HasPrefix(t, "= 3 && 'A' <= t[2] && t[2] <= 'Z':
224 | end = ">"
225 | }
226 |
227 | b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
228 | p.addBlock(b)
229 | b.text = append(b.text, s.string())
230 | if b.endFunc(t) {
231 | // If terminator appears on the starting line, we're done.
232 | p.closeBlock()
233 | }
234 | return true
235 | }
236 |
237 | // startHTMLBlock6 handles HTML block type 6,
238 | // which starts with the start of a recognized tag
239 | // and ends at a blank line.
240 | //
241 | // s is the entire line, for saving if starting a block.
242 | // t is the line as a string, with leading spaces removed; it starts with <.
243 | func startHTMLBlock6(p *parser, s line, t string) bool {
244 | // Skip over < or 1 && t[1] == '/' {
247 | start = 2
248 | }
249 |
250 | // Scan ASCII alphanumeric tag name;
251 | // must be followed by space, tab, >, />, or end of line.
252 | end := start
253 | for end < len(t) && end < 16 && isLetterDigit(t[end]) {
254 | end++
255 | }
256 | if end < len(t) {
257 | switch t[end] {
258 | default:
259 | return false
260 | case ' ', '\t', '>':
261 | // ok
262 | case '/':
263 | if end+1 >= len(t) || t[end+1] != '>' {
264 | return false
265 | }
266 | }
267 | }
268 |
269 | // Check whether tag is a recognized name.
270 | tag := t[start:end]
271 | if tag == "" {
272 | return false
273 | }
274 | c := tag[0] | forceLower
275 | for _, name := range htmlTags {
276 | if name[0] == c && len(name) == len(tag) && lowerEq(tag, name) {
277 | if end < len(t) && t[end] == '\t' {
278 | // Goldmark recognizes space but not tab.
279 | // testdata/extra.txt 143.md
280 | p.corner = true
281 | }
282 | b := &htmlBuilder{endBlank: true}
283 | p.addBlock(b)
284 | b.text = append(b.text, s.string())
285 | return true
286 | }
287 | }
288 | return false
289 | }
290 |
291 | // startHTMLBlock7 handles HTML block type 7,
292 | // which starts with a complete tag on a line by itself
293 | // and ends at a blank line.
294 | //
295 | // s is the entire line, for saving if starting a block.
296 | // t is the line as a string, with leading spaces removed; it starts with <.
297 | func startHTMLBlock7(p *parser, s line, t string) bool {
298 | // Type 7 blocks cannot interrupt a paragraph,
299 | // so that rewrapping a paragraph with inline tags
300 | // cannot change them into starting an HTML block.
301 | if p.para() != nil {
302 | return false
303 | }
304 |
305 | if _, end, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, end) == len(t) {
306 | if end != len(t) {
307 | // Goldmark disallows trailing space
308 | p.corner = true
309 | }
310 | b := &htmlBuilder{endBlank: true}
311 | p.addBlock(b)
312 | b.text = append(b.text, s.string())
313 | return true
314 | }
315 | if _, end, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, end) == len(t) {
316 | b := &htmlBuilder{endBlank: true}
317 | p.addBlock(b)
318 | b.text = append(b.text, s.string())
319 | return true
320 | }
321 | return false
322 | }
323 |
324 | // parseHTMLTag is an [inlineParser] for an [HTMLTag].
325 | // The caller has has checked that s[start] is '<'.
326 | func parseHTMLTag(p *parser, s string, start int) (x Inline, end int, ok bool) {
327 | // “An HTML tag consists of an open tag, a closing tag, an HTML comment,
328 | // a processing instruction, a declaration, or a CDATA section.”
329 | if len(s)-start < 3 || s[start] != '<' {
330 | return
331 | }
332 | switch s[start+1] {
333 | default:
334 | return parseHTMLOpenTag(p, s, start)
335 | case '/':
336 | return parseHTMLClosingTag(p, s, start)
337 | case '!':
338 | switch s[start+2] {
339 | case '-':
340 | return parseHTMLComment(p, s, start)
341 | case '[':
342 | return parseHTMLCDATA(p, s, start)
343 | default:
344 | return parseHTMLDecl(p, s, start)
345 | }
346 | case '?':
347 | return parseHTMLProcInst(p, s, start)
348 | }
349 | }
350 |
351 | // parseHTMLOpenTag is an [inlineParser] for an HTML open tag.
352 | // The caller has has checked that s[start] is '<'.
353 | func parseHTMLOpenTag(p *parser, s string, i int) (x Inline, end int, ok bool) {
354 | // “An open tag consists of a < character, a tag name, zero or more attributes,
355 | // optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
356 |
357 | // < character
358 | if i >= len(s) || s[i] != '<' {
359 | // unreachable unless called wrong
360 | return
361 | }
362 |
363 | // tag name
364 | name, j, ok1 := parseTagName(s, i+1)
365 | if !ok1 {
366 | return
367 | }
368 | switch name {
369 | case "pre", "script", "style", "textarea":
370 | // Goldmark treats these as starting a new HTMLBlock
371 | // and ending the paragraph they appear in.
372 | p.corner = true
373 | }
374 |
375 | // zero or more attributes
376 | for {
377 | if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' {
378 | return
379 | }
380 | _, k, ok := parseAttr(p, s, skipSpace(s, j))
381 | if !ok {
382 | break
383 | }
384 | j = k
385 | }
386 |
387 | // optional spaces, tabs, and up to one line ending
388 | k := skipSpace(s, j)
389 | if k != j {
390 | // Goldmark mishandles spaces before >.
391 | p.corner = true
392 | }
393 | j = k
394 |
395 | // an optional / character
396 | if j < len(s) && s[j] == '/' {
397 | j++
398 | }
399 |
400 | // and a > character.
401 | if j >= len(s) || s[j] != '>' {
402 | return
403 | }
404 |
405 | return &HTMLTag{s[i : j+1]}, j + 1, true
406 | }
407 |
408 | // parseHTMLClosingTag is an [inlineParser] for an HTML closing tag.
409 | // The caller has has checked that s[start:] begins with "".
410 | func parseHTMLClosingTag(p *parser, s string, i int) (x Inline, end int, ok bool) {
411 | // “A closing tag consists of the string , a tag name,
412 | // optional spaces, tabs, and up to one line ending, and the character >.”
413 | if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' {
414 | return
415 | }
416 | if skipSpace(s, i+2) != i+2 {
417 | // Goldmark allows spaces here but the spec and the Dingus do not.
418 | p.corner = true
419 | }
420 |
421 | if _, j, ok := parseTagName(s, i+2); ok {
422 | j = skipSpace(s, j)
423 | if j < len(s) && s[j] == '>' {
424 | return &HTMLTag{s[i : j+1]}, j + 1, true
425 | }
426 | }
427 | return
428 | }
429 |
430 | // parseTagName parses a leading tag name from s[start:],
431 | // returning the tag and the end location.
432 | func parseTagName(s string, start int) (tag string, end int, ok bool) {
433 | // “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
434 | if start >= len(s) || !isLetter(s[start]) {
435 | return
436 | }
437 | end = start + 1
438 | for end < len(s) && isLDH(s[end]) {
439 | end++
440 | }
441 | return s[start:end], end, true
442 | }
443 |
444 | // parseAttr parses a leading attr (or attr=value) from s[start:],
445 | // returning the entire attribute (including the =value) and the end location.
446 | func parseAttr(p *parser, s string, start int) (attr string, end int, ok bool) {
447 | // “An attribute consists of spaces, tabs, and up to one line ending,
448 | // an attribute name, and an optional attribute value specification.”
449 | _, end, ok = parseAttrName(s, start)
450 | if !ok {
451 | return
452 | }
453 | if endVal, ok := parseAttrValueSpec(p, s, end); ok {
454 | end = endVal
455 | }
456 | return s[start:end], end, true
457 | }
458 |
459 | // parseAttrName parses a leading attribute name from s[start:],
460 | // returning the name and the end location.
461 | func parseAttrName(s string, start int) (name string, end int, ok bool) {
462 | // “An attribute name consists of an ASCII letter, _, or :,
463 | // followed by zero or more ASCII letters, digits, _, ., :, or -.”
464 | if start+1 >= len(s) || (!isLetter(s[start]) && s[start] != '_' && s[start] != ':') {
465 | return
466 | }
467 | end = start + 1
468 | for end < len(s) && (isLDH(s[end]) || s[end] == '_' || s[end] == '.' || s[end] == ':') {
469 | end++
470 | }
471 | return s[start:end], end, true
472 | }
473 |
474 | // parseAttrValueSpec parses a leading attribute value specification
475 | // from s[start:], returning the end location.
476 | func parseAttrValueSpec(p *parser, s string, start int) (end int, ok bool) {
477 | // “An attribute value specification consists of
478 | // optional spaces, tabs, and up to one line ending,
479 | // a = character,
480 | // optional spaces, tabs, and up to one line ending,
481 | // and an attribute value.”
482 | end = skipSpace(s, start)
483 | if end >= len(s) || s[end] != '=' {
484 | return
485 | }
486 | end = skipSpace(s, end+1)
487 |
488 | // “An attribute value consists of
489 | // an unquoted attribute value,
490 | // a single-quoted attribute value,
491 | // or a double-quoted attribute value.”
492 | // TODO: No escaping???
493 | if end < len(s) && (s[end] == '\'' || s[end] == '"') {
494 | // “A single-quoted attribute value consists of ',
495 | // zero or more characters not including ', and a final '.”
496 | // “A double-quoted attribute value consists of ",
497 | // zero or more characters not including ", and a final ".”
498 | i := strings.IndexByte(s[end+1:], s[end])
499 | if i < 0 {
500 | return
501 | }
502 | return end + 1 + i + 1, true
503 | }
504 |
505 | // “An unquoted attribute value is a nonempty string of characters
506 | // not including spaces, tabs, line endings, ", ', =, <, >, or `.”
507 | isAttrVal := func(c byte) bool {
508 | return c != ' ' && c != '\t' && c != '\n' &&
509 | c != '"' && c != '\'' &&
510 | c != '=' && c != '<' && c != '>' && c != '`'
511 | }
512 | i := end
513 | for i < len(s) && isAttrVal(s[i]) {
514 | i++
515 | }
516 | if i == end {
517 | return
518 | }
519 | return i, true
520 | }
521 |
522 | // parseHTMLComment is an [inlineParser] for an HTML comment.
523 | // The caller has has checked that s[start:] begins with ",
526 | // where text does not start with > or ->,
527 | // does not end with -, and does not contain --.”
528 | if strings.HasPrefix(s[start:], "") {
529 | end = start + len("")
530 | return &HTMLTag{s[start:end]}, end, true
531 | }
532 | if strings.HasPrefix(s[start:], "") {
533 | end = start + len("")
534 | return &HTMLTag{s[start:end]}, end, true
535 | }
536 | if x, end, ok := parseHTMLMarker(p, s, start, ""); ok {
537 | return x, end, ok
538 | }
539 | return
540 | }
541 |
542 | // parseHTMLCDATA is an [inlineParser] for an HTML CDATA section.
543 | // The caller has has checked that s[start:] begins with ", and the string ]]>.”
547 | return parseHTMLMarker(p, s, i, "")
548 | }
549 |
550 | // parseHTMLDecl is an [inlineParser] for an HTML declaration section.
551 | // The caller has has checked that s[start:] begins with ", and the character >.”
555 | if i+2 < len(s) && isLetter(s[i+2]) {
556 | if 'a' <= s[i+2] && s[i+2] <= 'z' {
557 | p.corner = true // goldmark requires uppercase
558 | }
559 | return parseHTMLMarker(p, s, i, "")
560 | }
561 | return
562 | }
563 |
564 | // parseHTMLDecl is an [inlineParser] for an HTML processing instruction.
565 | // The caller has has checked that s[start:] begins with "".
566 | func parseHTMLProcInst(p *parser, s string, i int) (x Inline, end int, ok bool) {
567 | // “A processing instruction consists of the string ,
568 | // a string of characters not including the string ?>, and the string ?>.”
569 | return parseHTMLMarker(p, s, i, "", "?>")
570 | }
571 |
572 | // parseHTMLMarker is a generalized parser for the
573 | // various prefix/suffix-denote HTML markers.
574 | // If s[start:] starts with prefix and is followed eventually by suffix,
575 | // then parseHTMLMarker returns an HTMLTag for that section of s
576 | // along with start, end, ok to implement the result of an [inlineParser].
577 | func parseHTMLMarker(p *parser, s string, start int, prefix, suffix string) (x Inline, end int, ok bool) {
578 | if strings.HasPrefix(s[start:], prefix) {
579 | // To avoid quadratic behavior looking at on line
612 | case '?':
613 | p.noProcInstEnd = true // no ?> on line
614 | }
615 | }
616 | return
617 | }
618 |
619 | // parseHTMLEntity is an [inlineParser] for an HTML entity reference,
620 | // such as ", {, or ካ.
621 | func parseHTMLEntity(_ *parser, s string, start int) (x Inline, end int, ok bool) {
622 | i := start
623 | if i+1 < len(s) && s[i+1] == '#' {
624 | i += 2
625 | var r int
626 | if i < len(s) && (s[i] == 'x' || s[i] == 'X') {
627 | // hex
628 | i++
629 | j := i
630 | for j < len(s) && isHexDigit(s[j]) {
631 | j++
632 | }
633 | if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' {
634 | return
635 | }
636 | r64, _ := strconv.ParseInt(s[i:j], 16, 0)
637 | r = int(r64)
638 | end = j + 1
639 | } else {
640 | // decimal
641 | j := i
642 | for j < len(s) && isDigit(s[j]) {
643 | j++
644 | }
645 | if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' {
646 | return
647 | }
648 | r, _ = strconv.Atoi(s[i:j])
649 | end = j + 1
650 | }
651 | if r > unicode.MaxRune || r == 0 {
652 | // Invalid code points and U+0000 are replaced by U+FFFD.
653 | r = unicode.ReplacementChar
654 | }
655 | return &Plain{string(rune(r))}, end, true
656 | }
657 |
658 | // Max name in list is 32 bytes. Try for 64 for good measure.
659 | for j := i + 1; j < len(s) && j-i < 64; j++ {
660 | if s[j] == '&' { // Stop possible quadratic search on &&&&&&&.
661 | break
662 | }
663 | if s[j] == ';' {
664 | if r, ok := htmlEntity[s[i:j+1]]; ok {
665 | return &Plain{r}, j + 1, true
666 | }
667 | break
668 | }
669 | }
670 |
671 | return
672 | }
673 |
--------------------------------------------------------------------------------