├── testdata
    ├── task_fmt.txt
    ├── fuzz
    │   └── Fuzz
    │   │   ├── 5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147
    │   │   ├── e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148
    │   │   ├── 900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046
    │   │   ├── 6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce
    │   │   ├── 99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906
    │   │   ├── b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f
    │   │   ├── 4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc
    │   │   ├── e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9
    │   │   ├── 38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71
    │   │   └── ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0
    ├── heading_fmt.txt
    ├── code_fmt.txt
    ├── smart.txt
    ├── emoji.txt
    ├── linkref_fmt.txt
    ├── table_fmt.txt
    ├── headings.txt
    ├── del.txt
    ├── footnote.txt
    ├── spec2txtar.go
    ├── task.txt
    ├── gfm_smart.txt
    ├── cmark2txtar.go
    ├── table.txt
    ├── gfm_regress.txt
    ├── basic_fmt.txt
    ├── gfm_ext.txt
    └── autoext.txt
├── go.mod
├── go.sum
├── README.md
├── block.go
├── doc.go
├── htmltags.go
├── emoji2gist.go
├── LICENSE
├── quote.go
├── mdfmt
    └── main.go
├── table_test.go
├── entity2go.go
├── md2html
    └── main.go
├── emoji2go.go
├── fuzz_test.go
├── break.go
├── htmlesc.go
├── lex.go
├── line.go
├── footnote.go
├── print.go
├── big_test.go
├── para.go
├── heading.go
├── code.go
├── parse.go
├── table.go
├── list.go
├── md_test.go
└── html.go


/testdata/task_fmt.txt:
--------------------------------------------------------------------------------
1 | -- parser.json --
2 | {"TaskList": true}
3 | -- gfm279.md --
4 |   - [ ] foo
5 |   - [x] bar
6 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/5d90cadcbf2fc0a05c34346f2e0d544de4e230b1a7b56412ab4b5fdbb413d147:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("*[_*]()\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/e2e384485b8d6c08f62211a6db9cf55e3582dfe088c6ffc6f3ee80446171e148:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("\\\\\nr\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/900d64f4df082a036ff8da05207cb0b00379ef7c2714addee6a3000b7d42f046:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("*[a*r*]()\n")
3 | 


--------------------------------------------------------------------------------
/testdata/heading_fmt.txt:
--------------------------------------------------------------------------------
1 | -- parser.json --
2 | {"HeadingID": true}
3 | -- 1 --
4 | #  H  {# id }
5 | -- want --
6 | # H {#id}
7 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/6e1ec98995f90b7237a488109ef07219eea37739b6fc18f69c0c61cfd43590ce:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("!][![[]()]()]()\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/99b7e429a4c90c1eddd1560b79014ab2442f3b5b9b80d84d0a04a96a4a8c9906:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("-     e\n\n  o\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/b6461168fb519180a65d1a230dc6c5cb03194e5817bf4a192c33b6fbd8eec65f:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("- a\n  > b\n  ` `\n  c'= ```\n; d\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/4f0397bfd8cdada4815be61da4ee7a80200dc512dc0bfc09dd086dad03b335dc:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("![foo](/\x10rl \"title%(/url \"titlU%(/url \"\")\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/e73b40d4a194f94ba52c4774f577ca9d90e71698cf76d2944c688c0b4a9927b9:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("1. a\n\n  \x05\x05\x05\x05\x052n b\n\n   3. c\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/38a2bce29a092521f5d1f873dd7bab598b72474bee79f396ac5d1515128baa71:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("![([foo]![([oo][i.)](u.\u007ffoo][i1)](u1.)](uri3)\n")
3 | 


--------------------------------------------------------------------------------
/testdata/fuzz/Fuzz/ce9879da2226220068fd4085fff503aa5ebf62c5879af3dc23fc47dd29e500f0:
--------------------------------------------------------------------------------
1 | go test fuzz v1
2 | string("1.  foo\n\n    ```\n    bar\n     ``\n\n  ` baz\n\n    > bo\n\n    `>m\n")
3 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module rsc.io/markdown
 2 | 
 3 | go 1.22.0
 4 | 
 5 | require (
 6 | 	github.com/yuin/goldmark v1.6.0 // for testing only
 7 | 	golang.org/x/text v0.3.7
 8 | 	golang.org/x/tools v0.1.5
 9 | )
10 | 


--------------------------------------------------------------------------------
/testdata/code_fmt.txt:
--------------------------------------------------------------------------------
 1 | -- 1 --
 2 | `x`
 3 | -- want --
 4 | `x`
 5 | -- 2 --
 6 | ```x```
 7 | -- want --
 8 | `x`
 9 | -- 3 --
10 | ```` `x` ````
11 | -- want --
12 | `` `x` ``
13 | -- 4 --
14 | `````a ``` b`` `````
15 | -- want --
16 | ````a ``` b`` ````
17 | 


--------------------------------------------------------------------------------
/testdata/smart.txt:
--------------------------------------------------------------------------------
 1 | -- parser.json --
 2 | {"SmartQuote": true}
 3 | -- 1.md --
 4 | 'hello'
 5 | -- 1.html --
 6 | <p>‘hello’</p>
 7 | -- 2.md --
 8 | my'hello'
 9 | -- 2.html --
10 | <p>my’hello’</p>
11 | -- 3.md --
12 | [my]'hello'
13 | -- 3.html --
14 | <p>[my]’hello’</p>
15 | 


--------------------------------------------------------------------------------
/testdata/emoji.txt:
--------------------------------------------------------------------------------
 1 | -- parser.json --
 2 | {"Emoji": true}
 3 | -- 1.md --
 4 | emojis
 5 | :+1:
 6 | :100:
 7 | :1st_place_medal:
 8 | :negative_squared_cross_mark:
 9 | :wales:
10 | :south_georgia_south_sandwich_islands:
11 | :woman_facepalming:
12 | end
13 | -- 1.html --
14 | <p>emojis
15 | 👍
16 | 💯
17 | 🥇
18 | ❎
19 | 🏴󠁧󠁢󠁷󠁬󠁳󠁿
20 | 🇬🇸
21 | 🤦‍♀️
22 | end</p>
23 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68=
2 | github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
3 | golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
4 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
5 | golang.org/x/tools v0.1.5 h1:ouewzE6p+/VEB31YYnTbEJdi8pFqKp4P4n85vwo3DHA=
6 | golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Package markdown is a CommonMark-compliant Markdown parser and
 2 | HTML generator. It does not have many bells and whistles, but it does
 3 | expose the parsed syntax in an easy-to-use form.
 4 | 
 5 | Work in progress.
 6 | 
 7 | TODO:
 8 |  - documentation
 9 |  - make Format always print valid markdown,
10 |    even when the tree was constructed manually and may
11 |    not correspond to something Parse would return.
12 |  - footnote support
13 |  - possibly math support
14 |  - would it be simpler to have a lexer generated from regexps?
15 | 


--------------------------------------------------------------------------------
/testdata/linkref_fmt.txt:
--------------------------------------------------------------------------------
 1 | Tests for rendering a document's link references in markdown.
 2 | -- simple --
 3 | A document.
 4 | 
 5 | [foo]: u
 6 | -- want --
 7 | A document.
 8 | 
 9 | [foo]: u
10 | -- sorted --
11 | A document.
12 | 
13 | [foo]: u1
14 | [bar]: u2
15 | -- want --
16 | A document.
17 | 
18 | [bar]: u2
19 | [foo]: u1
20 | -- interleaved --
21 | First.
22 | 
23 | [foo]: u1
24 | Second.
25 | 
26 | [bar]: u2
27 | -- want --
28 | First.
29 | 
30 | Second.
31 | 
32 | [bar]: u2
33 | [foo]: u1
34 | -- titles --
35 | A document.
36 | 
37 | [r1]: u1 (title1)
38 | [r2]: u2 "title2"
39 | [r3]: u3 'title3'
40 | 


--------------------------------------------------------------------------------
/block.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2024 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | // Block is implemented by:
 8 | //
 9 | //	CodeBlock
10 | //	Document
11 | //	Empty
12 | //	HTMLBlock
13 | //	Heading
14 | //	Item
15 | //	List
16 | //	Paragraph
17 | //	Quote
18 | //	Text
19 | //	ThematicBreak
20 | type Block interface {
21 | 	Block()
22 | 	Pos() Position
23 | 	printHTML(p *printer)
24 | 	printMarkdown(p *printer)
25 | }
26 | 
27 | type Position struct {
28 | 	StartLine int
29 | 	EndLine   int
30 | }
31 | 
32 | func (p Position) Pos() Position {
33 | 	return p
34 | }
35 | 


--------------------------------------------------------------------------------
/testdata/table_fmt.txt:
--------------------------------------------------------------------------------
 1 | -- parser.json --
 2 | {"Table": true}
 3 | -- padded --
 4 | |foo|bar|baz|
 5 | |--|--|--|
 6 | |1|2|3|
 7 | |a|b|c|
 8 | -- want --
 9 | | foo | bar | baz |
10 | | --- | --- | --- |
11 | | 1   | 2   | 3   |
12 | | a   | b   | c   |
13 | -- aligned --
14 | |foo|bär|baz|
15 | |:--|:-:|--:|
16 | |1|2|3|
17 | |a|b|c|
18 | -- want --
19 | | foo | bär | baz |
20 | | :-- | :-: | --: |
21 | | 1   |  2  |   3 |
22 | | a   |  b  |   c |
23 | -- with_normalized_inline --
24 | |[foo](u1 )|
25 | |---|
26 | |1|
27 | |a|
28 | -- want --
29 | | [foo](u1) |
30 | | --------- |
31 | | 1         |
32 | | a         |
33 | -- indented --
34 |   - item 1
35 | 
36 |     | col1 | col2 |
37 |     | ---- | ---- |
38 |     | 1    | 2    |
39 | -- bigvalues --
40 | | foo | bar      | baz |
41 | | --- | -------- | --- |
42 | | 1   | 22345678 | 3   |
43 | | a   | b        | c   |
44 | 


--------------------------------------------------------------------------------
/doc.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2024 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | type Document struct {
 8 | 	Position
 9 | 	Blocks []Block
10 | 	Links  map[string]*Link
11 | }
12 | 
13 | func (*Document) Block() {}
14 | 
15 | func (b *Document) printHTML(p *printer) {
16 | 	for _, c := range b.Blocks {
17 | 		c.printHTML(p)
18 | 	}
19 | }
20 | 
21 | func (b *Document) printMarkdown(p *printer) {
22 | 	printMarkdownBlocks(b.Blocks, p)
23 | 
24 | 	// Terminate with a single newline.
25 | 	text := p.buf.Bytes()
26 | 	w := len(text)
27 | 	for w > 0 && text[w-1] == '\n' {
28 | 		w--
29 | 	}
30 | 	p.buf.Truncate(w)
31 | 	if w > 0 {
32 | 		p.nl()
33 | 	}
34 | 
35 | 	// Add link reference definitions.
36 | 	if len(b.Links) > 0 {
37 | 		if p.buf.Len() > 0 {
38 | 			p.nl()
39 | 		}
40 | 		printLinks(p, b.Links)
41 | 	}
42 | }
43 | 
44 | func printMarkdownBlocks(bs []Block, p *printer) {
45 | 	for bn, b := range bs {
46 | 		if bn > 0 {
47 | 			p.nl() // end block
48 | 			if p.loose > 0 {
49 | 				p.nl()
50 | 			}
51 | 		}
52 | 		b.printMarkdown(p)
53 | 	}
54 | }
55 | 


--------------------------------------------------------------------------------
/testdata/headings.txt:
--------------------------------------------------------------------------------
 1 | Goldmark fails on 11 because it doesn't like slashes or spaces in ids.
 2 | -- parser.json --
 3 | {"HeadingID": true}
 4 | -- 1.md --
 5 | # Heading
 6 | -- 1.html --
 7 | <h1>Heading</h1>
 8 | -- 2.md --
 9 | # Heading ###
10 | -- 2.html --
11 | <h1>Heading</h1>
12 | -- 3.md --
13 | # Heading {#id}
14 | -- 3.html --
15 | <h1 id="id">Heading</h1>
16 | -- 4.md --
17 | # Heading {#id} ##
18 | -- 4.html --
19 | <h1 id="id">Heading</h1>
20 | -- 5.md --
21 | # Heading {#id} more
22 | -- 5.html --
23 | <h1>Heading {#id} more</h1>
24 | -- 6.md --
25 | # Heading {nope}
26 | -- 6.html --
27 | <h1>Heading {nope}</h1>
28 | -- 7.md --
29 | # Heading {uhuh
30 | -- 7.html --
31 | <h1>Heading {uhuh</h1>
32 | -- 8.md --
33 | # {#no} Heading
34 | -- 8.html --
35 | <h1>{#no} Heading</h1>
36 | -- 9.md --
37 | # Heading {#id1} {#id2}
38 | -- 9.html --
39 | <h1 id="id2">Heading {#id1}</h1>
40 | -- 10.md --
41 | # Heading {#id1} {#id2
42 | -- 10.html --
43 | <h1>Heading {#id1} {#id2</h1>
44 | -- 11.md --
45 | # Heading {#a/b c}
46 | -- 11.html --
47 | <h1 id="a/b c">Heading</h1>
48 | -- 12.md --
49 | # {}
50 | -- 12.html --
51 | <h1>{}</h1>
52 | -- 13.md --
53 | # {#}
54 | -- 13.html --
55 | <h1>{#}</h1>
56 | 


--------------------------------------------------------------------------------
/htmltags.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | // htmlTags lists the known HTML tags for HTML block type 6.
 8 | // See https://spec.commonmark.org/0.31.2/#html-blocks.
 9 | var htmlTags = []string{
10 | 	"address",
11 | 	"article",
12 | 	"aside",
13 | 	"base",
14 | 	"basefont",
15 | 	"blockquote",
16 | 	"body",
17 | 	"caption",
18 | 	"center",
19 | 	"col",
20 | 	"colgroup",
21 | 	"dd",
22 | 	"details",
23 | 	"dialog",
24 | 	"dir",
25 | 	"div",
26 | 	"dl",
27 | 	"dt",
28 | 	"fieldset",
29 | 	"figcaption",
30 | 	"figure",
31 | 	"footer",
32 | 	"form",
33 | 	"frame",
34 | 	"frameset",
35 | 	"h1",
36 | 	"h2",
37 | 	"h3",
38 | 	"h4",
39 | 	"h5",
40 | 	"h6",
41 | 	"head",
42 | 	"header",
43 | 	"hr",
44 | 	"html",
45 | 	"iframe",
46 | 	"legend",
47 | 	"li",
48 | 	"link",
49 | 	"main",
50 | 	"menu",
51 | 	"menuitem",
52 | 	"nav",
53 | 	"noframes",
54 | 	"ol",
55 | 	"optgroup",
56 | 	"option",
57 | 	"p",
58 | 	"param",
59 | 	"section",
60 | 	"source",
61 | 	"summary",
62 | 	"table",
63 | 	"tbody",
64 | 	"td",
65 | 	"tfoot",
66 | 	"th",
67 | 	"thead",
68 | 	"title",
69 | 	"tr",
70 | 	"track",
71 | 	"ul",
72 | }
73 | 


--------------------------------------------------------------------------------
/emoji2gist.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build ignore
 6 | 
 7 | package main
 8 | 
 9 | import (
10 | 	"bytes"
11 | 	"encoding/json"
12 | 	"flag"
13 | 	"fmt"
14 | 	"io"
15 | 	"log"
16 | 	"net/http"
17 | 	"os"
18 | 	"sort"
19 | )
20 | 
21 | var outfile = flag.String("o", "", "write output to `file`")
22 | 
23 | func main() {
24 | 	log.SetFlags(0)
25 | 	log.SetPrefix("emoji2gist: ")
26 | 	flag.Parse()
27 | 
28 | 	resp, err := http.Get("https://api.github.com/emojis")
29 | 	if err != nil {
30 | 		log.Fatal(err)
31 | 	}
32 | 	if resp.StatusCode != 200 {
33 | 		log.Fatal(resp.Status)
34 | 	}
35 | 	data, err := io.ReadAll(resp.Body)
36 | 	if err != nil {
37 | 		log.Fatal(err)
38 | 	}
39 | 
40 | 	list := make(map[string]string)
41 | 	err = json.Unmarshal(data, &list)
42 | 	if err != nil {
43 | 		log.Fatal(err)
44 | 	}
45 | 
46 | 	var names []string
47 | 	for name := range list {
48 | 		names = append(names, name)
49 | 	}
50 | 	sort.Strings(names)
51 | 
52 | 	var buf bytes.Buffer
53 | 	fmt.Fprintf(&buf, "code | emoji\n-|-\n")
54 | 	for _, name := range names {
55 | 		fmt.Fprintf(&buf, "`%s` | :%s:\n", name, name)
56 | 	}
57 | 
58 | 	if *outfile != "" {
59 | 		if err := os.WriteFile(*outfile, buf.Bytes(), 0666); err != nil {
60 | 			log.Fatal(err)
61 | 		}
62 | 	} else {
63 | 		os.Stdout.Write(buf.Bytes())
64 | 	}
65 | }
66 | 


--------------------------------------------------------------------------------
/testdata/del.txt:
--------------------------------------------------------------------------------
 1 | Strikethrough <del> tests.
 2 | 
 3 | gfm* from https://github.github.com/gfm/#strikethrough-extension-
 4 | (version 0.29-gfm (2019-04-06))
 5 | 
 6 | Others by hand, guessing based on GitHub behavior.
 7 | 
 8 | -- parser.json --
 9 | {"Strikethrough": true}
10 | -- gfm491.md --
11 | ~~Hi~~ Hello, ~there~ world!
12 | -- gfm491.html --
13 | <p><del>Hi</del> Hello, <del>there</del> world!</p>
14 | -- gfm492.md --
15 | This ~~has a
16 | 
17 | new paragraph~~.
18 | -- gfm492.html --
19 | <p>This ~~has a</p>
20 | <p>new paragraph~~.</p>
21 | -- gfm493.md --
22 | This will ~~~not~~~ strike.
23 | -- gfm493.html --
24 | <p>This will ~~~not~~~ strike.</p>
25 | -- 1.md --
26 | 5*6*78
27 | 5_6_78
28 | 5~6~78
29 | -- 1.html --
30 | <p>5<em>6</em>78
31 | 5_6_78
32 | 5<del>6</del>78</p>
33 | -- 2.md --
34 | ~~Hi~~ Hello, ~~there~~ world!
35 | 5~~6~~78
36 | -- 2.html --
37 | <p><del>Hi</del> Hello, <del>there</del> world!
38 | 5<del>6</del>78</p>
39 | -- 3.md --
40 | ~~___`this`___~~
41 | -- 3.html --
42 | <p><del><em><strong><code>this</code></strong></em></del></p>
43 | -- 4.md --
44 | ~~***`this`***~~
45 | -- 4.html --
46 | <p><del><em><strong><code>this</code></strong></em></del></p>
47 | -- 5.md --
48 | ~~*this*~~
49 | -- 5.html --
50 | <p><del><em>this</em></del></p>
51 | -- 6.md --
52 | ~~_this_~~
53 | -- 6.html --
54 | <p><del><em>this</em></del></p>
55 | -- 7.md --
56 | ~~___this___~~
57 | -- 7.html --
58 | <p><del><em><strong>this</strong></em></del></p>
59 | -- 8.md --
60 | ~~__this__~~
61 | -- 8.html --
62 | <p><del><strong>this</strong></del></p>
63 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/testdata/footnote.txt:
--------------------------------------------------------------------------------
 1 | -- parser.json --
 2 | {"Footnote": true}
 3 | -- 1.md --
 4 | Here is a simple footnote[^1][^4].
 5 | 
 6 | A footnote can also[^3] have multiple lines[^4].
 7 | 
 8 | [^1]: My reference.
 9 | [^4]: To add line breaks within a footnote, prefix new lines with 2 spaces.
10 |   This is a second line.
11 | -- 1.html --
12 | <p>Here is a simple footnote<sup class="fn"><a id="fnref-1" href="#fn-1">1</a></sup><sup class="fn"><a id="fnref-2" href="#fn-2">2</a></sup>.</p>
13 | <p>A footnote can also[^3] have multiple lines<sup class="fn"><a id="fnref-2-2" href="#fn-2">2</a></sup>.</p>
14 | <div class="footnotes">Footnotes</div>
15 | <ol>
16 | <li id="fn-1">
17 | <p>My reference.
18 | <a class="fnref" href="#fnref-1">↩</a></p>
19 | </li>
20 | <li id="fn-2">
21 | <p>To add line breaks within a footnote, prefix new lines with 2 spaces.
22 | This is a second line.
23 | <a class="fnref" href="#fnref-2">↩</a>
24 | <a class="fnref" href="#fnref-2-2">↩</a></p>
25 | </li>
26 | </ol>
27 | -- 2.md --
28 | Footnote[^abc].
29 | 
30 | [^aBc]: Hi.
31 | -- 2.html --
32 | <p>Footnote<sup class="fn"><a id="fnref-1" href="#fn-1">1</a></sup>.</p>
33 | <div class="footnotes">Footnotes</div>
34 | <ol>
35 | <li id="fn-1">
36 | <p>Hi.
37 | <a class="fnref" href="#fnref-1">↩</a></p>
38 | </li>
39 | </ol>
40 | -- 3.md --
41 | Footnote[^aBc].
42 | 
43 | [^abC]: Hi.
44 | -- 3.html --
45 | <p>Footnote<sup class="fn"><a id="fnref-1" href="#fn-1">1</a></sup>.</p>
46 | <div class="footnotes">Footnotes</div>
47 | <ol>
48 | <li id="fn-1">
49 | <p>Hi.
50 | <a class="fnref" href="#fnref-1">↩</a></p>
51 | </li>
52 | </ol>
53 | 


--------------------------------------------------------------------------------
/quote.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | // A Quote is a [Block] representing a [block quote].
 8 | //
 9 | // [block quote]: https://spec.commonmark.org/0.31.2/#block-quotes
10 | type Quote struct {
11 | 	Position
12 | 	Blocks []Block // content of quote
13 | }
14 | 
15 | func (*Quote) Block() {}
16 | 
17 | func (b *Quote) printHTML(p *printer) {
18 | 	p.html("<blockquote>\n")
19 | 	for _, c := range b.Blocks {
20 | 		c.printHTML(p)
21 | 	}
22 | 	p.html("</blockquote>\n")
23 | }
24 | 
25 | func (b *Quote) printMarkdown(p *printer) {
26 | 	p.maybeQuoteNL('>')
27 | 	p.WriteString("> ")
28 | 	defer p.pop(p.push("> "))
29 | 	printMarkdownBlocks(b.Blocks, p)
30 | }
31 | 
32 | // A quoteBuilder is a [blockBuilder] for a block quote.
33 | type quoteBuilder struct{}
34 | 
35 | // startBlockQuote is a [starter] for a [Quote].
36 | func startBlockQuote(p *parser, s line) (line, bool) {
37 | 	line, ok := trimQuote(s)
38 | 	if !ok {
39 | 		return s, false
40 | 	}
41 | 	p.addBlock(new(quoteBuilder))
42 | 	return line, true
43 | }
44 | 
45 | func trimQuote(s line) (line, bool) {
46 | 	t := s
47 | 	t.trimSpace(0, 3, false)
48 | 	if !t.trim('>') {
49 | 		return s, false
50 | 	}
51 | 	t.trimSpace(0, 1, true)
52 | 	return t, true
53 | }
54 | 
55 | func (b *quoteBuilder) extend(p *parser, s line) (line, bool) {
56 | 	return trimQuote(s)
57 | }
58 | 
59 | func (b *quoteBuilder) build(p *parser) Block {
60 | 	return &Quote{p.pos(), p.blocks()}
61 | }
62 | 


--------------------------------------------------------------------------------
/mdfmt/main.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Mdfmt reformats Markdown data.
 6 | //
 7 | // Usage:
 8 | //
 9 | //	mdfmt [-w] [file...]
10 | //
11 | // Mdfmt reads the named files, or else standard input, as Markdown documents
12 | // and then reprints the same Markdown documents to standard output.
13 | //
14 | // The -w flag rewrites the named files in place instead of printing to standard output.
15 | package main
16 | 
17 | import (
18 | 	"flag"
19 | 	"fmt"
20 | 	"io"
21 | 	"log"
22 | 	"os"
23 | 
24 | 	"rsc.io/markdown"
25 | )
26 | 
27 | var (
28 | 	wflag = flag.Bool("w", false, "write reformatted Markdown back to input files")
29 | 	exit  = 0
30 | )
31 | 
32 | func usage() {
33 | 	fmt.Fprintf(os.Stderr, "usage: mdfmt [-w] [file...]\n")
34 | 	flag.PrintDefaults()
35 | 	os.Exit(2)
36 | }
37 | 
38 | func main() {
39 | 	log.SetPrefix("mdfmt: ")
40 | 	log.SetFlags(0)
41 | 	flag.Usage = usage
42 | 	flag.Parse()
43 | 
44 | 	if flag.NArg() == 0 {
45 | 		data, err := io.ReadAll(os.Stdin)
46 | 		if err != nil {
47 | 			log.Fatal(err)
48 | 		}
49 | 		convert(data, "")
50 | 	} else {
51 | 		for _, file := range flag.Args() {
52 | 			data, err := os.ReadFile(file)
53 | 			if err != nil {
54 | 				log.Print(err)
55 | 				exit = 1
56 | 				continue
57 | 			}
58 | 			convert(data, file)
59 | 		}
60 | 	}
61 | 	os.Exit(exit)
62 | }
63 | 
64 | func convert(data []byte, file string) {
65 | 	var p markdown.Parser
66 | 	doc := p.Parse(string(data))
67 | 	out := []byte(markdown.Format(doc))
68 | 	if *wflag && file != "" {
69 | 		if err := os.WriteFile(file, out, 0666); err != nil {
70 | 			log.Print(err)
71 | 			exit = 1
72 | 			return
73 | 		}
74 | 	} else {
75 | 		os.Stdout.Write(out)
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/testdata/spec2txtar.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // go run spec2txtar.go https://spec.commonmark.org/0.30/spec.json > spec0.30.txt
 6 | 
 7 | package main
 8 | 
 9 | import (
10 | 	"encoding/json"
11 | 	"flag"
12 | 	"fmt"
13 | 	"io"
14 | 	"log"
15 | 	"net/http"
16 | 	"os"
17 | 	"strings"
18 | 
19 | 	"golang.org/x/tools/txtar"
20 | )
21 | 
22 | type specCase struct {
23 | 	Name     string
24 | 	Markdown string
25 | 	HTML     string
26 | 	Example  int
27 | }
28 | 
29 | func main() {
30 | 	log.SetFlags(0)
31 | 	log.SetPrefix("spec2txtar: ")
32 | 	flag.Usage = func() {
33 | 		fmt.Fprintf(os.Stderr, "usage: spec2txtar url\n")
34 | 		os.Exit(2)
35 | 	}
36 | 	flag.Parse()
37 | 	if flag.NArg() != 1 {
38 | 		flag.Usage()
39 | 	}
40 | 	url := flag.Arg(0)
41 | 
42 | 	resp, err := http.Get(url)
43 | 	if err != nil {
44 | 		log.Fatal(err)
45 | 	}
46 | 	if resp.StatusCode != 200 {
47 | 		log.Fatal(resp.Status)
48 | 	}
49 | 	data, err := io.ReadAll(resp.Body)
50 | 	if err != nil {
51 | 		log.Fatal(err)
52 | 	}
53 | 
54 | 	var spec []specCase
55 | 	err = json.Unmarshal(data, &spec)
56 | 	if err != nil {
57 | 		log.Fatal(err)
58 | 	}
59 | 
60 | 	a := &txtar.Archive{
61 | 		Comment: []byte("// go run spec2txtar.go " + url + "\n"),
62 | 	}
63 | 	for _, cas := range spec {
64 | 		name := fmt.Sprintf("%d", cas.Example)
65 | 		a.Files = append(a.Files,
66 | 			txtar.File{
67 | 				Name: name + ".md",
68 | 				Data: []byte(encode(cas.Markdown)),
69 | 			},
70 | 			txtar.File{
71 | 				Name: name + ".html",
72 | 				Data: []byte(encode(cas.HTML)),
73 | 			},
74 | 		)
75 | 	}
76 | 
77 | 	os.Stdout.Write(txtar.Format(a))
78 | }
79 | 
80 | func encode(s string) string {
81 | 	s = strings.ReplaceAll(s, " \n", " ^J\n")
82 | 	s = strings.ReplaceAll(s, "\t\n", "\t^J\n")
83 | 	if s != "" && !strings.HasSuffix(s, "\n") {
84 | 		s += "^D\n"
85 | 	}
86 | 	return s
87 | }
88 | 


--------------------------------------------------------------------------------
/table_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2023 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | var tableCountTests = []struct {
12 | 	row string
13 | 	n   int
14 | }{
15 | 	{"|", 1},
16 | 	{"|x|", 1},
17 | 	{"||", 1},
18 | 	{"| |", 1},
19 | 	{"| | |", 2},
20 | 	{"| | Foo | Bar |", 3},
21 | 	{"|          | Foo      | Bar      |", 3},
22 | 	{"", 1},
23 | 	{"|a|b", 2},
24 | 	{"|a| ", 1},
25 | 	{" |b", 1},
26 | 	{"a|b", 2},
27 | 	{`x\|y`, 1},
28 | 	{`x\\|y`, 1},
29 | 	{`x\\\|y`, 1},
30 | 	{`x\\\\|y`, 1},
31 | 	{`x\\\\\|y`, 1},
32 | 	{`| 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8  |`, 1},
33 | }
34 | 
35 | func TestTableCount(t *testing.T) {
36 | 	for _, tt := range tableCountTests {
37 | 		n := tableCount(tableTrimOuter(tt.row))
38 | 		if n != tt.n {
39 | 			t.Errorf("tableCount(%#q) = %d, want %d", tt.row, n, tt.n)
40 | 		}
41 | 	}
42 | }
43 | 
44 | func TestPad(t *testing.T) {
45 | 	testCases := []struct {
46 | 		raw, align string
47 | 		w          int
48 | 
49 | 		want string
50 | 	}{
51 | 		{"foo", "center", 8, "  foo   "},
52 | 		{"foo", "center", 6, " foo  "},
53 | 		{"foo", "center", 5, " foo "},
54 | 		{"föó", "center", 5, " föó "},
55 | 		{"foo", "center", 4, "foo "},
56 | 		{"foo", "center", 3, "foo"},
57 | 
58 | 		{"foo", "left", 8, "foo     "},
59 | 		{"foo", "right", 8, "     foo"},
60 | 		{"foo", "", 8, "foo     "},
61 | 
62 | 		{"foo", "left", 6, "foo   "},
63 | 		{"foo", "right", 6, "   foo"},
64 | 		{"foo", "", 6, "foo   "},
65 | 
66 | 		{"foo", "left", 5, "foo  "},
67 | 		{"foo", "right", 5, "  foo"},
68 | 		{"foo", "", 5, "foo  "},
69 | 
70 | 		{"foo", "left", 4, "foo "},
71 | 		{"foo", "right", 4, " foo"},
72 | 		{"foo", "", 4, "foo "},
73 | 
74 | 		{"foo", "left", 3, "foo"},
75 | 		{"foo", "right", 3, "foo"},
76 | 		{"foo", "", 3, "foo"},
77 | 	}
78 | 
79 | 	for _, tc := range testCases {
80 | 		in := tc.raw
81 | 		a := tc.align
82 | 		w := tc.w
83 | 		want := tc.want
84 | 		var p printer
85 | 		pad(&p, in, a, w)
86 | 		h := p.buf.String()
87 | 		if h != want {
88 | 			t.Errorf("\npad(%s, %s, %d)\n have %q\n want %q", in, a, w, h, want)
89 | 		}
90 | 	}
91 | }
92 | 


--------------------------------------------------------------------------------
/entity2go.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | //go:build ignore
 6 | 
 7 | package main
 8 | 
 9 | import (
10 | 	"bytes"
11 | 	"encoding/json"
12 | 	"flag"
13 | 	"fmt"
14 | 	"go/format"
15 | 	"io"
16 | 	"log"
17 | 	"net/http"
18 | 	"os"
19 | 	"sort"
20 | 	"strings"
21 | )
22 | 
23 | var outfile = flag.String("o", "", "write output to `file`")
24 | 
25 | // main fetches the WHATWG entity table and writes gofmt-formatted Go source
26 | // for the htmlEntity map to the -o file, or to standard output without -o.
27 | func main() {
28 | 	log.SetFlags(0)
29 | 	log.SetPrefix("entity2go: ")
30 | 	flag.Parse()
31 | 
32 | 	resp, err := http.Get("https://html.spec.whatwg.org/entities.json")
33 | 	if err != nil {
34 | 		log.Fatal(err)
35 | 	}
36 | 	defer resp.Body.Close()
37 | 	if resp.StatusCode != 200 {
38 | 		log.Fatal(resp.Status)
39 | 	}
40 | 	data, err := io.ReadAll(resp.Body)
41 | 	if err != nil {
42 | 		log.Fatal(err)
43 | 	}
44 | 
45 | 	var list map[string]struct{ Codepoints []rune }
46 | 	if err := json.Unmarshal(data, &list); err != nil {
47 | 		log.Fatal(err)
48 | 	}
49 | 
50 | 	// Keep only the names that end in ";" and sort them for stable output.
51 | 	var names []string
52 | 	for name := range list {
53 | 		if strings.HasSuffix(name, ";") {
54 | 			names = append(names, name)
55 | 		}
56 | 	}
57 | 	sort.Strings(names)
58 | 
59 | 	var buf bytes.Buffer
60 | 	buf.WriteString(hdr)
61 | 	fmt.Fprintf(&buf, "var htmlEntity = map[string]string{\n")
62 | 	for _, name := range names {
63 | 		fmt.Fprintf(&buf, "\t%q: \"", name)
64 | 		for _, r := range list[name].Codepoints {
65 | 			if r <= 0xFFFF {
66 | 				fmt.Fprintf(&buf, "\\u%04x", r)
67 | 			} else {
68 | 				fmt.Fprintf(&buf, "\\U%08x", r)
69 | 			}
70 | 		}
71 | 		fmt.Fprintf(&buf, "\",\n")
72 | 	}
73 | 	fmt.Fprintf(&buf, "}\n")
74 | 
75 | 	src, err := format.Source(buf.Bytes())
76 | 	if err != nil {
77 | 		log.Fatalf("reformatting output: %v", err)
78 | 	}
79 | 
80 | 	if *outfile != "" {
81 | 		if err := os.WriteFile(*outfile, src, 0666); err != nil {
82 | 			log.Fatal(err)
83 | 		}
84 | 	} else {
85 | 		os.Stdout.Write(src)
86 | 	}
87 | }
88 | 
// hdr is the fixed preamble that main writes at the top of the generated
// source: license header, go:generate directive, package clause, and the
// doc comment for the htmlEntity map declaration that main emits right after it.
var hdr = `// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate go run entity2go.go -o entity.go

package markdown

// htmlEntity maps known HTML entity sequences to their meanings.
`
99 | 


--------------------------------------------------------------------------------
/md2html/main.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2017 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Md2html converts Markdown to HTML.
 6 | //
 7 | // Usage:
 8 | //
 9 | //	md2html [file...]
10 | //
11 | // Md2html reads the named files, or else standard input, as Markdown documents
12 | // and then prints the corresponding HTML to standard output.
13 | package main
14 | 
15 | import (
16 | 	"bytes"
17 | 	"flag"
18 | 	"io/ioutil"
19 | 	"log"
20 | 	"os"
21 | 	"unicode/utf8"
22 | 
23 | 	"rsc.io/markdown"
24 | )
25 | 
26 | func main() {
27 | 	flag.Parse()
28 | 	args := flag.Args()
29 | 	if len(args) == 0 {
30 | 		do(os.Stdin)
31 | 	} else {
32 | 		for _, arg := range args {
33 | 			f, err := os.Open(arg)
34 | 			if err != nil {
35 | 				log.Fatal(err)
36 | 			}
37 | 			do(f)
38 | 			f.Close()
39 | 		}
40 | 	}
41 | }
42 | 
43 | func do(f *os.File) {
44 | 	data, err := ioutil.ReadAll(f)
45 | 	if err != nil {
46 | 		log.Fatal(err)
47 | 	}
48 | 	os.Stdout.WriteString(toHTML(data))
49 | }
50 | 
51 | // toHTML converts Markdown to HTML.
52 | func toHTML(md []byte) string {
53 | 	var p markdown.Parser
54 | 	p.Table = true
55 | 	return markdown.ToHTML(p.Parse(string(replaceTabs(md))))
56 | }
57 | 
// replaceTabs returns a copy of text in which every tab has been expanded
// to the spaces needed to reach the next 4-column tab stop.
//
// Markdown requires tabs used for indentation to be interpreted with
// 4-space tab stops (see https://spec.commonmark.org/0.30/#tabs).
// Go code also reads well at that width, whereas browsers often render
// tabs 8 columns wide. Expanding here keeps the output compact and
// consistent across browsers while remaining Markdown-compatible.
//
// Columns are counted one per decoded rune and reset at each newline,
// so multi-codepoint Unicode sequences are not measured correctly.
func replaceTabs(text []byte) []byte {
	const tabWidth = 4
	const spaces = "    " // tabWidth spaces

	var out bytes.Buffer
	col := 0
	for pos := 0; pos < len(text); {
		r, size := utf8.DecodeRune(text[pos:])
		pos += size

		if r == '\t' {
			// Always advance at least one column, up to the next stop.
			pad := tabWidth - col%tabWidth
			out.WriteString(spaces[:pad])
			col += pad
			continue
		}

		out.WriteRune(r)
		if r == '\n' {
			col = 0
		} else {
			col++
		}
	}
	return out.Bytes()
}
95 | 


--------------------------------------------------------------------------------
/testdata/task.txt:
--------------------------------------------------------------------------------
  1 | Task list items tests.
  2 | 
  3 | gfm* from https://github.github.com/gfm/#task-list-items-extension-
  4 | (version 0.29-gfm (2019-04-06))
  5 | 
  6 | Others by hand, guessing based on GitHub behavior.
  7 | 
  8 | -- parser.json --
  9 | {"TaskList": true}
 10 | -- gfm279.md --
 11 | - [ ] foo
 12 | - [x] bar
 13 | -- gfm279.html --
 14 | <ul>
 15 | <li><input disabled="" type="checkbox"> foo</li>
 16 | <li><input checked="" disabled="" type="checkbox"> bar</li>
 17 | </ul>
 18 | -- gfm280.md --
 19 | - [x] foo
 20 |   - [ ] bar
 21 |   - [x] baz
 22 | - [ ] bim
 23 | -- gfm280.html --
 24 | <ul>
 25 | <li><input checked="" disabled="" type="checkbox"> foo
 26 | <ul>
 27 | <li><input disabled="" type="checkbox"> bar</li>
 28 | <li><input checked="" disabled="" type="checkbox"> baz</li>
 29 | </ul>
 30 | </li>
 31 | <li><input disabled="" type="checkbox"> bim</li>
 32 | </ul>
 33 | -- spaces.md --
 34 | -  [ ] foo
 35 | -   [x] bar
 36 | - [ ]quux
 37 | -- spaces.html --
 38 | <ul>
 39 | <li><input disabled="" type="checkbox"> foo</li>
 40 | <li><input checked="" disabled="" type="checkbox"> bar</li>
 41 | <li>[ ]quux</li>
 42 | </ul>
 43 | -- wxyz.md --
 44 | - [w] woolloomooloo
 45 | - [x] foo
 46 | - [y] bar
 47 | - [z] baz
 48 | -- wxyz.html --
 49 | <ul>
 50 | <li>[w] woolloomooloo</li>
 51 | <li><input checked="" disabled="" type="checkbox"> foo</li>
 52 | <li>[y] bar</li>
 53 | <li>[z] baz</li>
 54 | </ul>
 55 | -- X.md --
 56 | - [x] foo
 57 | - [X] bar
 58 | - [ ] baz
 59 | -- X.html --
 60 | <ul>
 61 | <li><input checked="" disabled="" type="checkbox"> foo</li>
 62 | <li><input checked="" disabled="" type="checkbox"> bar</li>
 63 | <li><input disabled="" type="checkbox"> baz</li>
 64 | </ul>
 65 | -- 1.md --
 66 | - [x] foo
 67 | -
 68 | - [x] bar
 69 | - hello
 70 | - > quote
 71 | - *emph*
 72 | -- 1.html --
 73 | <ul>
 74 | <li><input checked="" disabled="" type="checkbox"> foo</li>
 75 | <li></li>
 76 | <li><input checked="" disabled="" type="checkbox"> bar</li>
 77 | <li>hello</li>
 78 | <li>
 79 | <blockquote>
 80 | <p>quote</p>
 81 | </blockquote>
 82 | </li>
 83 | <li><em>emph</em></li>
 84 | </ul>
 85 | -- 2.md --
 86 | - [x] foo
 87 | -
 88 | - [x] bar
 89 | 
 90 | - hello
 91 | -- 2.html --
 92 | <ul>
 93 | <li>
 94 | <p><input checked="" disabled="" type="checkbox"> foo</p>
 95 | </li>
 96 | <li></li>
 97 | <li>
 98 | <p><input checked="" disabled="" type="checkbox"> bar</p>
 99 | </li>
100 | <li>
101 | <p>hello</p>
102 | </li>
103 | </ul>
104 | 


--------------------------------------------------------------------------------
/testdata/gfm_smart.txt:
--------------------------------------------------------------------------------
  1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/smart_punct.txt
  2 | -- parser.json --
  3 | {"SmartQuote": true, "SmartDash": true, "SmartDot": true}
  4 | -- 1.md --
  5 | "Hello," said the spider.
  6 | "'Shelob' is my name."
  7 | -- 1.html --
  8 | <p>“Hello,” said the spider.
  9 | “‘Shelob’ is my name.”</p>
 10 | -- 2.md --
 11 | 'A', 'B', and 'C' are letters.
 12 | -- 2.html --
 13 | <p>‘A’, ‘B’, and ‘C’ are letters.</p>
 14 | -- 3.md --
 15 | 'Oak,' 'elm,' and 'beech' are names of trees.
 16 | So is 'pine.'
 17 | -- 3.html --
 18 | <p>‘Oak,’ ‘elm,’ and ‘beech’ are names of trees.
 19 | So is ‘pine.’</p>
 20 | -- 4.md --
 21 | 'He said, "I want to go."'
 22 | -- 4.html --
 23 | <p>‘He said, “I want to go.”’</p>
 24 | -- 5.md --
 25 | Were you alive in the 70's?
 26 | -- 5.html --
 27 | <p>Were you alive in the 70’s?</p>
 28 | -- 6.md --
 29 | Here is some quoted '`code`' and a "[quoted link](url)".
 30 | -- 6.html --
 31 | <p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>
 32 | -- 7.md --
 33 | 'tis the season to be 'jolly'
 34 | -- 7.html --
 35 | <p>’tis the season to be ‘jolly’</p>
 36 | -- 8.md --
 37 | 'We'll use Jane's boat and John's truck,' Jenna said.
 38 | -- 8.html --
 39 | <p>‘We’ll use Jane’s boat and John’s truck,’ Jenna said.</p>
 40 | -- 9.md --
 41 | "A paragraph with no closing quote.
 42 | 
 43 | "Second paragraph by same speaker, in fiction."
 44 | -- 9.html --
 45 | <p>“A paragraph with no closing quote.</p>
 46 | <p>“Second paragraph by same speaker, in fiction.”</p>
 47 | -- 10.md --
 48 | [a]'s b'
 49 | -- 10.html --
 50 | <p>[a]’s b’</p>
 51 | -- 11.md --
 52 | \"This is not smart.\"
 53 | This isn\'t either.
 54 | 5\'8\"
 55 | -- 11.html --
 56 | <p>&quot;This is not smart.&quot;
 57 | This isn't either.
 58 | 5'8&quot;</p>
 59 | -- 12.md --
 60 | Some dashes:  em---em
 61 | en--en
 62 | em --- em
 63 | en -- en
 64 | 2--3
 65 | -- 12.html --
 66 | <p>Some dashes:  em—em
 67 | en–en
 68 | em — em
 69 | en – en
 70 | 2–3</p>
 71 | -- 13.md --
 72 | one-
 73 | two--
 74 | three---
 75 | four----
 76 | five-----
 77 | six------
 78 | seven-------
 79 | eight--------
 80 | nine---------
 81 | thirteen-------------.
 82 | -- 13.html --
 83 | <p>one-
 84 | two–
 85 | three—
 86 | four––
 87 | five—–
 88 | six——
 89 | seven—––
 90 | eight––––
 91 | nine———
 92 | thirteen———––.</p>
 93 | -- 14.md --
 94 | Escaped hyphens: \-- \-\-\-.
 95 | -- 14.html --
 96 | <p>Escaped hyphens: -- ---.</p>
 97 | -- 15.md --
 98 | Ellipses...and...and....
 99 | -- 15.html --
100 | <p>Ellipses…and…and….</p>
101 | -- 16.md --
102 | No ellipses\.\.\.
103 | -- 16.html --
104 | <p>No ellipses...</p>
105 | 


--------------------------------------------------------------------------------
/testdata/cmark2txtar.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"log"
 11 | 	"os"
 12 | 	"strings"
 13 | 
 14 | 	"golang.org/x/tools/txtar"
 15 | 	"rsc.io/markdown"
 16 | )
 17 | 
 18 | var parsers = map[string]string{
 19 | 	"example autolink":      `{"AutoLinkText": true, "AutoLinkAssumeHTTP": true}`,
 20 | 	"example disabled":      `{"TaskListItems": true}`,
 21 | 	"example strikethrough": `{"Strikethrough": true}`,
 22 | 	"example table":         `{"Table": true}`,
 23 | }
 24 | 
 25 | func main() {
 26 | 	log.SetFlags(0)
 27 | 	log.SetPrefix("cmark2txtar: ")
 28 | 	flag.Usage = func() {
 29 | 		fmt.Fprintf(os.Stderr, "usage: cmark2txtar file\n")
 30 | 		os.Exit(2)
 31 | 	}
 32 | 	flag.Parse()
 33 | 	if flag.NArg() != 1 {
 34 | 		flag.Usage()
 35 | 	}
 36 | 	file := flag.Arg(0)
 37 | 
 38 | 	data, err := os.ReadFile(file)
 39 | 	if err != nil {
 40 | 		log.Fatal(err)
 41 | 	}
 42 | 
 43 | 	a := &txtar.Archive{
 44 | 		Comment: []byte("// go run cmark2txtar.go " + file + "\n"),
 45 | 	}
 46 | 
 47 | 	var p markdown.Parser
 48 | 	doc := p.Parse(string(data))
 49 | 	n := 0
 50 | 	for _, b := range doc.Blocks {
 51 | 		var in, out []string
 52 | 		b, ok := b.(*markdown.CodeBlock)
 53 | 		if !ok || !strings.HasPrefix(b.Info, "example") {
 54 | 			continue
 55 | 		}
 56 | 		for i := 0; i < len(b.Text); i++ {
 57 | 			if b.Text[i] == "." {
 58 | 				in, out = b.Text[:i], b.Text[i+1:]
 59 | 				goto Found
 60 | 			}
 61 | 		}
 62 | 		log.Fatalf("did not find . in pre block:\n%s", strings.Join(b.Text, "\n"))
 63 | 	Found:
 64 | 		parserChange := false
 65 | 		if b.Info != "example" {
 66 | 			js, ok := parsers[b.Info]
 67 | 			if !ok {
 68 | 				log.Printf("skipping %s", b.Info)
 69 | 				continue
 70 | 			}
 71 | 			parserChange = true
 72 | 			a.Files = append(a.Files, txtar.File{Name: "parser.json", Data: []byte(js)})
 73 | 		}
 74 | 		n++
 75 | 		name := fmt.Sprintf("%d", n)
 76 | 		a.Files = append(a.Files,
 77 | 			txtar.File{
 78 | 				Name: name + ".md",
 79 | 				Data: []byte(encode(join(in))),
 80 | 			},
 81 | 			txtar.File{
 82 | 				Name: name + ".html",
 83 | 				Data: []byte(encode(join(out))),
 84 | 			},
 85 | 		)
 86 | 		if parserChange {
 87 | 			a.Files = append(a.Files, txtar.File{Name: "parser.json", Data: []byte(`{}`)})
 88 | 		}
 89 | 	}
 90 | 
 91 | 	os.Stdout.Write(txtar.Format(a))
 92 | }
 93 | 
 94 | func encode(s string) string {
 95 | 	s = strings.ReplaceAll(s, " \n", " ^J\n")
 96 | 	s = strings.ReplaceAll(s, "\t\n", "\t^J\n")
 97 | 	if s != "" && !strings.HasSuffix(s, "\n") {
 98 | 		s += "^D\n"
 99 | 	}
100 | 	return s
101 | }
102 | 
// join returns the lines in s, each terminated by a newline, with every
// "→" replaced by a tab. An empty s yields the empty string.
func join(s []string) string {
	if len(s) == 0 {
		return ""
	}
	var b strings.Builder
	for _, ln := range s {
		b.WriteString(ln)
		b.WriteByte('\n')
	}
	return strings.ReplaceAll(b.String(), "→", "\t")
}
111 | 


--------------------------------------------------------------------------------
/emoji2go.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | //go:build ignore
  6 | 
  7 | package main
  8 | 
  9 | import (
 10 | 	"bytes"
 11 | 	"encoding/json"
 12 | 	"flag"
 13 | 	"fmt"
 14 | 	"go/format"
 15 | 	"io"
 16 | 	"log"
 17 | 	"net/http"
 18 | 	"os"
 19 | 	"regexp"
 20 | 	"sort"
 21 | 	"strconv"
 22 | 	"strings"
 23 | )
 24 | 
// outfile is the -o flag: the file that main writes the generated Go source to.
// When it is empty, main writes to standard output instead.
var outfile = flag.String("o", "", "write output to `file`")
 26 | 
 27 | func get(url string) []byte {
 28 | 	resp, err := http.Get(url)
 29 | 	if err != nil {
 30 | 		log.Fatal(err)
 31 | 	}
 32 | 	if resp.StatusCode != 200 {
 33 | 		log.Fatal(resp.Status)
 34 | 	}
 35 | 	data, err := io.ReadAll(resp.Body)
 36 | 	if err != nil {
 37 | 		log.Fatal(err)
 38 | 	}
 39 | 	return data
 40 | }
 41 | 
// gemojiRE matches an opening or closing <g-emoji ...> tag.
// main uses it to strip those wrappers from the gist's table cells.
var gemojiRE = regexp.MustCompile(`</?g-emoji[^<>]*>`)
 43 | 
 44 | func main() {
 45 | 	log.SetFlags(0)
 46 | 	log.SetPrefix("emoji2go: ")
 47 | 	flag.Parse()
 48 | 
 49 | 	emojiJSON := get("https://api.github.com/emojis")
 50 | 	list := make(map[string]string)
 51 | 	err := json.Unmarshal(emojiJSON, &list)
 52 | 	if err != nil {
 53 | 		log.Fatal(err)
 54 | 	}
 55 | 
 56 | 	var names []string
 57 | 	for name := range list {
 58 | 		names = append(names, name)
 59 | 	}
 60 | 	sort.Strings(names)
 61 | 
 62 | 	emojiHTML := string(get("https://gist.github.com/rsc/316bc98c066ad111973634d435203aac"))
 63 | 
 64 | 	bad := false
 65 | 	var buf bytes.Buffer
 66 | 	buf.WriteString(hdr)
 67 | 	fmt.Fprintf(&buf, "var emoji = map[string]string{\n")
 68 | 	n := 0
 69 | 	for _, name := range names {
 70 | 		n = max(n, len(name))
 71 | 		_, val, ok := strings.Cut(emojiHTML, "<td><code>"+name+"</code></td>\n<td>")
 72 | 		if !ok {
 73 | 			log.Printf("gist missing :%s:", name)
 74 | 			bad = true
 75 | 			continue
 76 | 		}
 77 | 		val, _, ok = strings.Cut(val, "</td>")
 78 | 		if !ok {
 79 | 			log.Printf("gist missing :%s:", name)
 80 | 			bad = true
 81 | 			continue
 82 | 		}
 83 | 		val = gemojiRE.ReplaceAllString(val, "")
 84 | 		if strings.Contains(val, "<") {
 85 | 			log.Printf("skipping %s: non-unicode: %s", name, val)
 86 | 			continue
 87 | 		}
 88 | 		fmt.Fprintf(&buf, "\t%q: %s,\n", name, strconv.QuoteToASCII(val))
 89 | 	}
 90 | 	fmt.Fprintf(&buf, "}\n\n")
 91 | 
 92 | 	fmt.Fprintf(&buf, "const maxEmojiLen = %d\n", n)
 93 | 
 94 | 	if bad {
 95 | 		os.Exit(1)
 96 | 	}
 97 | 
 98 | 	src, err := format.Source(buf.Bytes())
 99 | 	if err != nil {
100 | 		log.Fatalf("reformatting output: %v", err)
101 | 	}
102 | 
103 | 	if *outfile != "" {
104 | 		if err := os.WriteFile(*outfile, src, 0666); err != nil {
105 | 			log.Fatal(err)
106 | 		}
107 | 	} else {
108 | 		os.Stdout.Write(src)
109 | 	}
110 | }
111 | 
// hdr is the fixed preamble that main writes at the top of the generated
// source: license header, go:generate directive, package clause, and the
// doc comment for the emoji map declaration that main emits right after it.
var hdr = `// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:generate go run emoji2go.go -o emoji.go

package markdown

// emoji maps known emoji names to their UTF-8 emoji forms.
`
122 | 


--------------------------------------------------------------------------------
/fuzz_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"net/url"
 11 | 	"path/filepath"
 12 | 	"strings"
 13 | 	"testing"
 14 | 	"unicode/utf8"
 15 | 
 16 | 	"golang.org/x/tools/txtar"
 17 | )
 18 | 
// FuzzGoldmark fuzzes this package's HTML output against Goldmark's.
// It is skipped unless the -goldmark flag is set.
//
// The seed corpus is every Markdown input found in testdata/*.txt
// (except files ending in to_markdown.txt). Each fuzz input is
// normalized, then parsed under several Parser configurations, and the
// canonicalized HTML from ToHTML must equal the canonicalized HTML
// produced by the corresponding Goldmark converter.
func FuzzGoldmark(f *testing.F) {
	if !*goldmarkFlag {
		f.Skip("-goldmark not set")
	}
	files, err := filepath.Glob("testdata/*.txt")
	if err != nil {
		f.Fatal(err)
	}
	for _, file := range files {
		if strings.HasSuffix(file, "to_markdown.txt") {
			continue
		}
		a, err := txtar.ParseFile(file)
		if err != nil {
			f.Fatal(err)
		}
		// Archive entries come in (name.md, name.html) pairs,
		// with parser.json entries interleaved; only the .md text is seeded.
		for i := 0; i+2 <= len(a.Files); {
			if a.Files[i].Name == "parser.json" {
				i++
				continue
			}
			md := a.Files[i]
			html := a.Files[i+1]
			i += 2
			name := strings.TrimSuffix(md.Name, ".md")
			if name != strings.TrimSuffix(html.Name, ".html") {
				f.Fatalf("mismatched file pair: %s and %s", md.Name, html.Name)
			}
			f.Add(decode(string(md.Data)))
		}
	}
	f.Fuzz(func(t *testing.T, s string) {
		// Too many corner cases involving non-terminated lines.
		if !strings.HasSuffix(s, "\n") {
			s += "\n"
		}
		// Goldmark does not convert \r to \n.
		s = strings.ReplaceAll(s, "\r", "\n")
		// Goldmark treats \v as isUnicodeSpace for deciding emphasis.
		// Not unreasonable, but not what the spec says.
		s = strings.ReplaceAll(s, "\v", "\f")
		if !utf8.ValidString(s) {
			s = string([]rune(s)) // coerce to valid UTF8
		}
		var parsers = []Parser{
			{},
			{HeadingID: true},
			{Strikethrough: true},
			{TaskList: true},
			{HeadingID: true, Strikethrough: true, TaskList: true},
		}
		for i, p := range parsers {
			// Stop at the first configuration that fails.
			if t.Failed() {
				break
			}
			t.Run(fmt.Sprintf("p%d", i), func(t *testing.T) {
				doc, corner := p.parse(s)
				// NOTE(review): corner presumably marks an input that hit a
				// known corner case where the outputs are expected to differ;
				// such inputs are not compared. Confirm against p.parse.
				if corner {
					return
				}
				out := ToHTML(doc)

				gm := goldmarkParser(&p)
				var buf bytes.Buffer
				if err := gm.Convert([]byte(s), &buf); err != nil {
					t.Fatal(err)
				}
				// Make sure Goldmark's non-empty output ends in a newline.
				if buf.Len() > 0 && buf.Bytes()[buf.Len()-1] != '\n' {
					buf.WriteByte('\n')
				}
				gout := buf.String()

				// Goldmark uses <br />, <hr />, and <img />.
				// Goldmark also escapes | as %7C.
				// Apply rewrites to out as well as gout to handle these appearing
				// as literals in the input.
				canon := func(s string) string {
					s = strings.ReplaceAll(s, " />", ">")
					s = strings.ReplaceAll(s, "%7C", "|")
					return s
				}
				out = canon(out)
				gout = canon(gout)

				if out != gout {
					// Include ready-to-click links that reproduce the input
					// in the CommonMark dingus and in a GitHub issue form.
					q := strings.ReplaceAll(url.QueryEscape(s), "+", "%20")
					t.Fatalf("in: %q\nparse:\n%s\nout: %q\ngout: %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", s, dump(doc), out, gout, q, q)
				}
			})
		}
	})
}
111 | 


--------------------------------------------------------------------------------
/break.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
// A ThematicBreak is a [Block] representing a [thematic break],
// usually displayed as a horizontal rule (<hr> tag).
//
// [thematic break]: https://spec.commonmark.org/0.31.2/#thematic-breaks
type ThematicBreak struct {
	Position
}

// Block is an empty marker method; it does nothing when called.
func (*ThematicBreak) Block() {}

// printHTML writes the break to p as "<hr />" followed by a newline.
func (b *ThematicBreak) printHTML(p *printer) {
	p.html("<hr />\n")
}

// printMarkdown writes the break to p as "***", after calling p.maybeNL.
func (b *ThematicBreak) printMarkdown(p *printer) {
	p.maybeNL()
	p.md("***")
}
 25 | 
 26 | // startThematicBreak is a [starter] for a [ThematicBreak].
 27 | func startThematicBreak(p *parser, s line) (line, bool) {
 28 | 	if !trimThematicBreak(&s) {
 29 | 		return s, false
 30 | 	}
 31 | 	p.doneBlock(&ThematicBreak{Position{p.lineno, p.lineno}})
 32 | 	return line{}, true
 33 | }
 34 | 
 35 | // trimThematicBreak attempts to trim a thematic break from s,
 36 | // reporting whether it was successful.
 37 | // See https://spec.commonmark.org/0.31.2/#thematic-breaks.
 38 | func trimThematicBreak(s *line) bool {
 39 | 	t := s
 40 | 	t.trimSpace(0, 3, false)
 41 | 	c := t.peek()
 42 | 	if c != '-' && c != '_' && c != '*' {
 43 | 		return false
 44 | 	}
 45 | 	for i := 0; ; i++ {
 46 | 		if !t.trim(c) {
 47 | 			if i < 3 {
 48 | 				return false
 49 | 			}
 50 | 			break
 51 | 		}
 52 | 		t.skipSpace()
 53 | 	}
 54 | 	if !t.eof() {
 55 | 		return false
 56 | 	}
 57 | 	*s = line{}
 58 | 	return true
 59 | }
 60 | 
// A HardBreak is an Inline representing a hard line break (<br> tag).
type HardBreak struct{}

// Inline is an empty marker method; it does nothing when called.
func (*HardBreak) Inline() {}

// printHTML writes the break to p as "<br />" followed by a newline.
func (x *HardBreak) printHTML(p *printer) {
	p.html("<br />\n")
}

// printMarkdown writes the break to p as a backslash followed by p.nl.
func (x *HardBreak) printMarkdown(p *printer) {
	p.md(`\`)
	p.nl()
}

// printText writes the break to p as a newline character.
func (x *HardBreak) printText(p *printer) {
	p.text("\n")
}
 78 | 
// A SoftBreak is an Inline representing a soft line break (newline character).
type SoftBreak struct{}

// Inline is an empty marker method; it does nothing when called.
func (*SoftBreak) Inline() {}

// printHTML writes the break to p as a newline character.
func (x *SoftBreak) printHTML(p *printer) {
	// TODO: If printer config says to, print <br> instead.
	p.html("\n")
}

// printMarkdown writes the break to p by calling p.nl.
func (x *SoftBreak) printMarkdown(p *printer) {
	p.nl()
}

// printText writes the break to p as a newline character.
func (x *SoftBreak) printText(p *printer) {
	p.text("\n")
}
 96 | 
 97 | // parseBreak is an [inlineParser] for a [SoftBreak] or [HardBreak].
 98 | // The caller has checked that s[start] is a newline.
 99 | func parseBreak(p *parser, s string, start int) (x Inline, end int, ok bool) {
100 | 	// Back up to remove trailing spaces and tabs.
101 | 	i := start
102 | 	for i > 0 && (s[i-1] == ' ' || s[i-1] == '\t') {
103 | 		i--
104 | 	}
105 | 	if i < start {
106 | 		// The caller will do p.emit(start), but we want to skip
107 | 		// the spaces and tabs between i and start, so do the
108 | 		// emit ourselves followed by skipping to start.
109 | 		p.emit(i)
110 | 		p.skip(start)
111 | 	}
112 | 
113 | 	end = start + 1
114 | 	// TODO: Do tabs count? That would be a mess.
115 | 	if start >= 2 && s[start-1] == ' ' && s[start-2] == ' ' {
116 | 		return &HardBreak{}, end, true
117 | 	}
118 | 	return &SoftBreak{}, end, true
119 | }
120 | 


--------------------------------------------------------------------------------
/htmlesc.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2021 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package markdown
 6 | 
 7 | import "strings"
 8 | 
// htmlEscaper is a strings.Replacer that escapes text for inclusion in HTML.
// It escapes " & < > only, replacing each with its named entity
// (&quot; &amp; &lt; &gt;). In particular it does not escape ' so any
// generated HTML should use " for attribute quoting.
var htmlEscaper = strings.NewReplacer(
	"\"", "&quot;",
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)
18 | 
// htmlLinkEscaper is a strings.Replacer that escapes URLs
// for inclusion in an <a href="..."> tag.
//
// It rewrites & as &amp; and percent-encodes, with uppercase hex digits,
// each of the following bytes:
//
//   - the ASCII characters " < > \ space ` [ ] ^ { }
//   - the control bytes 0x00-0x1F, except tab (0x09), newline (0x0A),
//     and carriage return (0x0D), which are left alone
//   - 0x7F and every byte 0x80-0xFF
var htmlLinkEscaper = func() *strings.Replacer {
	const hexDigits = "0123456789ABCDEF"

	pairs := []string{"&", "&amp;"}
	percent := func(b byte) {
		old := string([]byte{b}) // a single raw byte, not a UTF-8 encoded rune
		esc := string([]byte{'%', hexDigits[b>>4], hexDigits[b&0x0F]})
		pairs = append(pairs, old, esc)
	}

	for _, b := range []byte("\"<>\\ `[]^{}") {
		percent(b)
	}
	for b := 0; b <= 0xFF; b++ {
		switch {
		case b == '\t', b == '\n', b == '\r':
			// Left unescaped.
		case b < 0x20, b >= 0x7F:
			percent(byte(b))
		}
	}
	return strings.NewReplacer(pairs...)
}()
87 | 


--------------------------------------------------------------------------------
/testdata/table.txt:
--------------------------------------------------------------------------------
  1 | -- parser.json --
  2 | {"Table": true}
  3 | -- gfm198.md --
  4 | | foo | bar |
  5 | | --- | --- |
  6 | | baz | bim |
  7 | -- gfm198.html --
  8 | <table>
  9 | <thead>
 10 | <tr>
 11 | <th>foo</th>
 12 | <th>bar</th>
 13 | </tr>
 14 | </thead>
 15 | <tbody>
 16 | <tr>
 17 | <td>baz</td>
 18 | <td>bim</td>
 19 | </tr>
 20 | </tbody>
 21 | </table>
 22 | -- gfm199.md --
 23 | | abc | defghi |
 24 | :-: | -----------:
 25 | bar | baz
 26 | -- gfm199.html --
 27 | <table>
 28 | <thead>
 29 | <tr>
 30 | <th align="center">abc</th>
 31 | <th align="right">defghi</th>
 32 | </tr>
 33 | </thead>
 34 | <tbody>
 35 | <tr>
 36 | <td align="center">bar</td>
 37 | <td align="right">baz</td>
 38 | </tr>
 39 | </tbody>
 40 | </table>
 41 | -- gfm200.md --
 42 | | f\|oo  |
 43 | | ------ |
 44 | | b `\|` az |
 45 | | b **\|** im |
 46 | -- gfm200.html --
 47 | <table>
 48 | <thead>
 49 | <tr>
 50 | <th>f|oo</th>
 51 | </tr>
 52 | </thead>
 53 | <tbody>
 54 | <tr>
 55 | <td>b <code>|</code> az</td>
 56 | </tr>
 57 | <tr>
 58 | <td>b <strong>|</strong> im</td>
 59 | </tr>
 60 | </tbody>
 61 | </table>
 62 | -- gfm201.md --
 63 | | abc | def |
 64 | | --- | --- |
 65 | | bar | baz |
 66 | > bar
 67 | -- gfm201.html --
 68 | <table>
 69 | <thead>
 70 | <tr>
 71 | <th>abc</th>
 72 | <th>def</th>
 73 | </tr>
 74 | </thead>
 75 | <tbody>
 76 | <tr>
 77 | <td>bar</td>
 78 | <td>baz</td>
 79 | </tr>
 80 | </tbody>
 81 | </table>
 82 | <blockquote>
 83 | <p>bar</p>
 84 | </blockquote>
 85 | -- gfm202.md --
 86 | | abc | def |
 87 | | --- | --- |
 88 | | bar | baz |
 89 | bar
 90 | 
 91 | bar
 92 | -- gfm202.html --
 93 | <table>
 94 | <thead>
 95 | <tr>
 96 | <th>abc</th>
 97 | <th>def</th>
 98 | </tr>
 99 | </thead>
100 | <tbody>
101 | <tr>
102 | <td>bar</td>
103 | <td>baz</td>
104 | </tr>
105 | <tr>
106 | <td>bar</td>
107 | <td></td>
108 | </tr>
109 | </tbody>
110 | </table>
111 | <p>bar</p>
112 | -- gfm203.md --
113 | | abc | def |
114 | | --- |
115 | | bar |
116 | -- gfm203.html --
117 | <p>| abc | def |
118 | | --- |
119 | | bar |</p>
120 | -- gfm204.md --
121 | | abc | def |
122 | | --- | --- |
123 | | bar |
124 | | bar | baz | boo |
125 | -- gfm204.html --
126 | <table>
127 | <thead>
128 | <tr>
129 | <th>abc</th>
130 | <th>def</th>
131 | </tr>
132 | </thead>
133 | <tbody>
134 | <tr>
135 | <td>bar</td>
136 | <td></td>
137 | </tr>
138 | <tr>
139 | <td>bar</td>
140 | <td>baz</td>
141 | </tr>
142 | </tbody>
143 | </table>
144 | -- gfm205.md --
145 | | abc | def |
146 | | --- | --- |
147 | -- gfm205.html --
148 | <table>
149 | <thead>
150 | <tr>
151 | <th>abc</th>
152 | <th>def</th>
153 | </tr>
154 | </thead>
155 | </table>
156 | -- 1.md --
157 | hello world
158 | this is a test
159 | 
160 | > a
161 | b
162 | > |-
163 | > d
164 | e
165 | > e
166 | c
167 | -- 1.html --
168 | <p>hello world
169 | this is a test</p>
170 | <blockquote>
171 | <p>a</p>
172 | <table>
173 | <thead>
174 | <tr>
175 | <th>b</th>
176 | </tr>
177 | </thead>
178 | <tbody>
179 | <tr>
180 | <td>d</td>
181 | </tr>
182 | </tbody>
183 | </table>
184 | </blockquote>
185 | <p>e</p>
186 | <blockquote>
187 | <p>e
188 | c</p>
189 | </blockquote>
190 | -- 2.md --
191 | | 0\|1\\|2\\\|3\\\\|4\\\\\|5\\\\\\|6\\\\\\\|7\\\\\\\\|8  |
192 | | ------ |
193 | -- 2.html --
194 | <table>
195 | <thead>
196 | <tr>
197 | <th>0|1|2\|3\|4\\|5\\|6\\\|7\\\|8</th>
198 | </tr>
199 | </thead>
200 | </table>
201 | -- 3.md --
202 | |          | Foo      | Bar      |
203 | | -------- | -------- | -------- |
204 | | a        | value1   | value2   |
205 | | b        | value3   | value4   |
206 | -- 3.html --
207 | <table>
208 | <thead>
209 | <tr>
210 | <th></th>
211 | <th>Foo</th>
212 | <th>Bar</th>
213 | </tr>
214 | </thead>
215 | <tbody>
216 | <tr>
217 | <td>a</td>
218 | <td>value1</td>
219 | <td>value2</td>
220 | </tr>
221 | <tr>
222 | <td>b</td>
223 | <td>value3</td>
224 | <td>value4</td>
225 | </tr>
226 | </tbody>
227 | </table>
228 | -- 4.md --
229 | |
230 | |-
231 | |x
232 | |
233 | -- 4.html --
234 | <p>|
235 | |-
236 | |x
237 | |</p>
238 | -- 5.md --
239 | ||
240 | |-
241 | |x
242 | |
243 | -- 5.html --
244 | <table>
245 | <thead>
246 | <tr>
247 | <th></th>
248 | </tr>
249 | </thead>
250 | <tbody>
251 | <tr>
252 | <td>x</td>
253 | </tr>
254 | </tbody>
255 | </table>
256 | <p>|</p>
257 | 


--------------------------------------------------------------------------------
/lex.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2024 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strings"
  9 | 	"unicode"
 10 | )
 11 | 
 12 | // isPunct reports whether c is Markdown punctuation.
 13 | func isPunct(c byte) bool {
 14 | 	return '!' <= c && c <= '/' || ':' <= c && c <= '@' || '[' <= c && c <= '`' || '{' <= c && c <= '~'
 15 | }
 16 | 
 17 | // isLetter reports whether c is an ASCII letter.
 18 | func isLetter(c byte) bool {
 19 | 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
 20 | }
 21 | 
 22 | // isDigit reports whether c is an ASCII digit.
 23 | func isDigit(c byte) bool {
 24 | 	return '0' <= c && c <= '9'
 25 | }
 26 | 
 27 | // isLetterDigit reports whether c is an ASCII letter or digit.
 28 | func isLetterDigit(c byte) bool {
 29 | 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9'
 30 | }
 31 | 
 32 | // isLDH reports whether c is an ASCII letter, digit, or hyphen.
 33 | func isLDH(c byte) bool {
 34 | 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '-'
 35 | }
 36 | 
 37 | // isHexDigit reports whether c is an ASCII hexadecimal digit.
 38 | func isHexDigit(c byte) bool {
 39 | 	return 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' || '0' <= c && c <= '9'
 40 | }
 41 | 
 42 | // isUnocdeSpace reports whether r is a Unicode space as defined by Markdown.
 43 | // This is not the same as unicode.IsSpace.
 44 | // For example, U+0085 does not satisfy isUnicodeSpace
 45 | // but does satisfy unicode.IsSpace.
 46 | func isUnicodeSpace(r rune) bool {
 47 | 	if r < 0x80 {
 48 | 		return r == ' ' || r == '\t' || r == '\f' || r == '\n'
 49 | 	}
 50 | 	return unicode.In(r, unicode.Zs)
 51 | }
 52 | 
 53 | // isUnocdeSpace reports whether r is Unicode punctuation as defined by Markdown.
 54 | // This is not the same as unicode.Punct; it also includes unicode.Symbol.
 55 | func isUnicodePunct(r rune) bool {
 56 | 	if r < 0x80 {
 57 | 		return isPunct(byte(r))
 58 | 	}
 59 | 	return unicode.In(r, unicode.Punct, unicode.Symbol)
 60 | }
 61 | 
 62 | // skipSpace returns i + the number of spaces, tabs, carriage returns, and newlines
 63 | // at the start of s[i:]. That is, it skips i past any such characters, returning the new i.
 64 | func skipSpace(s string, i int) int {
 65 | 	// Note: Blank lines have already been removed.
 66 | 	for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n') {
 67 | 		i++
 68 | 	}
 69 | 	return i
 70 | }
 71 | 
// mdEscaper escapes symbols that are used in inline Markdown sequences
// by inserting a backslash before each of ( ) [ ] * _ < >.
// TODO(rsc): There is a better way to do this.
var mdEscaper = strings.NewReplacer(
	`(`, `\(`,
	`)`, `\)`,
	`[`, `\[`,
	`]`, `\]`,
	`*`, `\*`,
	`_`, `\_`,
	`<`, `\<`,
	`>`, `\>`,
)
 84 | 
// mdLinkEscaper escapes symbols that have meaning inside a link target
// by inserting a backslash before each of ( ) < >.
var mdLinkEscaper = strings.NewReplacer(
	`(`, `\(`,
	`)`, `\)`,
	`<`, `\<`,
	`>`, `\>`,
)
 92 | 
 93 | // mdUnscape returns the Markdown unescaping of s.
 94 | func mdUnescape(s string) string {
 95 | 	if !strings.Contains(s, `\`) && !strings.Contains(s, `&`) {
 96 | 		return s
 97 | 	}
 98 | 	return mdUnescaper.Replace(s)
 99 | }
100 | 
// mdUnescaper unescapes Markdown escape sequences and HTML entities.
// It maps a backslash followed by an ASCII punctuation character
// to that character, and each key of htmlEntity to its value.
// TODO(rsc): Perhaps there is a better way to do this.
var mdUnescaper = func() *strings.Replacer {
	// Backslash escapes, listed as (old, new) pairs.
	var list = []string{
		`\!`, `!`,
		`\"`, `"`,
		`\#`, `#`,
		`\$`, `$`,
		`\%`, `%`,
		`\&`, `&`,
		`\'`, `'`,
		`\(`, `(`,
		`\)`, `)`,
		`\*`, `*`,
		`\+`, `+`,
		`\,`, `,`,
		`\-`, `-`,
		`\.`, `.`,
		`\/`, `/`,
		`\:`, `:`,
		`\;`, `;`,
		`\<`, `<`,
		`\=`, `=`,
		`\>`, `>`,
		`\?`, `?`,
		`\@`, `@`,
		`\[`, `[`,
		`\\`, `\`,
		`\]`, `]`,
		`\^`, `^`,
		`\_`, `_`,
		"\\`", "`",
		`\{`, `{`,
		`\|`, `|`,
		`\}`, `}`,
		`\~`, `~`,
	}

	// NOTE(review): map iteration order is random, and strings.NewReplacer
	// compares old strings in argument order; this presumably relies on no
	// htmlEntity key being a prefix of another — confirm.
	for name, repl := range htmlEntity {
		list = append(list, name, repl)
	}
	return strings.NewReplacer(list...)
}()
144 | 


--------------------------------------------------------------------------------
/testdata/gfm_regress.txt:
--------------------------------------------------------------------------------
  1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/regression.txt
  2 | -- 1.md --
  3 | line1
  4 | 
  5 | line2
  6 | -- 1.html --
  7 | <p>line1</p>
  8 | <p>line2</p>
  9 | -- 2.md --
 10 | By taking it apart
 11 | 
 12 | - alternative solutions
 13 | 	^J
 14 | Repeatedly solving
 15 | 	^J
 16 | - how techniques
 17 | -- 2.html --
 18 | <p>By taking it apart</p>
 19 | <ul>
 20 | <li>alternative solutions</li>
 21 | </ul>
 22 | <p>Repeatedly solving</p>
 23 | <ul>
 24 | <li>how techniques</li>
 25 | </ul>
 26 | -- 3.md --
 27 | <h1>lorem</h1>
 28 | 
 29 | <h2>lorem</h2>
 30 | 
 31 | <h3>lorem</h3>
 32 | 
 33 | <h4>lorem</h4>
 34 | 
 35 | <h5>lorem</h5>
 36 | 
 37 | <h6>lorem</h6>
 38 | -- 3.html --
 39 | <h1>lorem</h1>
 40 | <h2>lorem</h2>
 41 | <h3>lorem</h3>
 42 | <h4>lorem</h4>
 43 | <h5>lorem</h5>
 44 | <h6>lorem</h6>
 45 | -- 4.md --
 46 | hi
 47 | --	^J
 48 | -- 4.html --
 49 | <h2>hi</h2>
 50 | -- 5.md --
 51 | a***b* c*
 52 | -- 5.html --
 53 | <p>a*<em><em>b</em> c</em></p>
 54 | -- 6.md --
 55 | [a]
 56 | 
 57 | [a]: <te<st>
 58 | -- 6.html --
 59 | <p>[a]</p>
 60 | <p>[a]: &lt;te<st></p>
 61 | -- 7.md --
 62 | [a](te\ st)
 63 | -- 7.html --
 64 | <p>[a](te\ st)</p>
 65 | -- parser.json --
 66 | {"Strikethrough": true}
 67 | -- 8.md --
 68 | ~~**_`this`_**~~  ^J
 69 | ~~***`this`***~~  ^J
 70 | ~~___`this`___~~
 71 | 
 72 | **_`this`_**  ^J
 73 | ***`this`***  ^J
 74 | ___`this`___
 75 | 
 76 | ~~**_this_**~~  ^J
 77 | ~~***this***~~  ^J
 78 | ~~___this___~~
 79 | 
 80 | **_this_**  ^J
 81 | ***this***  ^J
 82 | ___this___
 83 | -- 8.html --
 84 | <p><del><strong><em><code>this</code></em></strong></del><br />
 85 | <del><em><strong><code>this</code></strong></em></del><br />
 86 | <del><em><strong><code>this</code></strong></em></del></p>
 87 | <p><strong><em><code>this</code></em></strong><br />
 88 | <em><strong><code>this</code></strong></em><br />
 89 | <em><strong><code>this</code></strong></em></p>
 90 | <p><del><strong><em>this</em></strong></del><br />
 91 | <del><em><strong>this</strong></em></del><br />
 92 | <del><em><strong>this</strong></em></del></p>
 93 | <p><strong><em>this</em></strong><br />
 94 | <em><strong>this</strong></em><br />
 95 | <em><strong>this</strong></em></p>
 96 | -- parser.json --
 97 | {}
 98 | -- 9.md --
 99 | City:
100 | <span itemprop="contentLocation" itemscope itemtype="https://schema.org/City">
101 |   <meta itemprop="name" content="Springfield">
102 | </span>
103 | -- 9.html --
104 | <p>City:
105 | <span itemprop="contentLocation" itemscope itemtype="https://schema.org/City">
106 | <meta itemprop="name" content="Springfield">
107 | </span></p>
108 | -- parser.json --
109 | {"Strikethrough": true}
110 | -- 10.md --
111 | ~Hi~ Hello, world!
112 | -- 10.html --
113 | <p><del>Hi</del> Hello, world!</p>
114 | -- parser.json --
115 | {}
116 | -- parser.json --
117 | {"Strikethrough": true}
118 | -- 11.md --
119 | This ~text~ ~~is~~ ~~~curious~~~.
120 | -- 11.html --
121 | <p>This <del>text</del> <del>is</del> ~~~curious~~~.</p>
122 | -- parser.json --
123 | {}
124 | -- 12.md --
125 | [x](http://members.aon.at/~nkehrer/ibm_5110/emu5110.html)
126 | -- 12.html --
127 | <p><a href="http://members.aon.at/~nkehrer/ibm_5110/emu5110.html">x</a></p>
128 | -- 13.md --
129 | City:
130 | <span itemprop="contentLocation" itemscope itemtype="https://schema.org/City">
131 |   <meta itemprop="name" content="Springfield">
132 | </span>
133 | -- 13.html --
134 | <p>City:
135 | <span itemprop="contentLocation" itemscope itemtype="https://schema.org/City">
136 | <meta itemprop="name" content="Springfield">
137 | </span></p>
138 | -- 14.md --
139 | [a](\ b)
140 | 
141 | [a](<<b)
142 | 
143 | [a](<b
144 | )
145 | -- 14.html --
146 | <p>[a](\ b)</p>
147 | <p>[a](&lt;&lt;b)</p>
148 | <p>[a](&lt;b
149 | )</p>
150 | -- 15.md --
151 | [link](url ((title))
152 | -- 15.html --
153 | <p>[link](url ((title))</p>
154 | -- 16.md --
155 | </script>
156 | 
157 | </pre>
158 | 
159 | </style>
160 | -- 16.html --
161 | </script>
162 | </pre>
163 | </style>
164 | -- 17.md --
165 | [a](<b) c>
166 | -- 17.html --
167 | <p>[a](&lt;b) c&gt;</p>
168 | -- parser.json --
169 | {"Table": true}
170 | -- 18.md --
171 | |
172 | -|
173 | -- 18.html --
174 | <p>|
175 | -|</p>
176 | -- parser.json --
177 | {}
178 | -- 19.md --
179 | *text* [link](#section)
180 | -- 19.html --
181 | <p><em>text</em> <a href="#section">link</a></p>
182 | 


--------------------------------------------------------------------------------
/line.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2024 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
// A line is a cursor over the text of a single input line.
// The unconsumed remainder is spaces virtual space characters
// followed by text[i:].
type line struct {
	// spaces is the number of pending virtual spaces before text[i:],
	// left over when trimSpace consumes only part of a tab.
	spaces int
	// i is the index in text of the next unconsumed byte.
	i int
	// tab is the index in text used as the origin for tab-stop arithmetic;
	// trimSpace sets it to just past each tab it consumes.
	tab int
	// text is the text of the line.
	text     string
	nl       byte // newline character ending this line: \r or \n or \r+\n or zero for EOF
	nonblank int  // index of first non-space, non-tab char in text; len(text) if none
}
 15 | 
 16 | func makeLine(text string, nl byte) line {
 17 | 	s := line{text: text, nl: nl}
 18 | 	s.setNonblank()
 19 | 	return s
 20 | }
 21 | 
 22 | func (s *line) setNonblank() {
 23 | 	i := s.i
 24 | 	for i < len(s.text) && (s.text[i] == ' ' || s.text[i] == '\t') {
 25 | 		i++
 26 | 	}
 27 | 	s.nonblank = i
 28 | }
 29 | 
 30 | func (s *line) peek() byte {
 31 | 	if s.spaces > 0 {
 32 | 		return ' '
 33 | 	}
 34 | 	if s.i >= len(s.text) {
 35 | 		return 0
 36 | 	}
 37 | 	return s.text[s.i]
 38 | }
 39 | 
 40 | func (s *line) skipSpace() {
 41 | 	s.spaces = 0
 42 | 	if s.nonblank < s.i {
 43 | 		panic("nonblank")
 44 | 	}
 45 | 	s.i = s.nonblank
 46 | }
 47 | 
// trimSpace removes up to max columns of leading white space from s.
// It reports whether at least min columns were available;
// on failure s is left unchanged.
// A tab counts as the columns up to the next multiple of 4 measured
// from s.tab; columns of a tab that are not consumed stay pending in s.spaces.
// If eolOK is true, the end of the text counts as an unlimited supply of space.
func (s *line) trimSpace(min, max int, eolOK bool) bool {
	// Work on a copy so that a failed trim has no effect on s.
	t := *s

	for n := 0; n < max; n++ {
		// Use up virtual spaces from an earlier tab first.
		if t.spaces > 0 {
			t.spaces--
			continue
		}
		if t.i >= len(t.text) && eolOK {
			continue
		}
		// TODO performance bottleneck here using trimSpace with list extensions?
		// but each only fails once?
		if t.i < len(t.text) {
			switch t.text[t.i] {
			case '\t':
				// The tab spans 4 - (offset mod 4) columns; one of them
				// is consumed by this iteration and the rest stay pending.
				t.spaces = 4 - (t.i-t.tab)&3 - 1
				t.i++
				t.tab = t.i // TODO seems wrong
				continue
			case ' ':
				t.i++
				continue
			}
		}
		// Out of white space: succeed only if the minimum was met.
		if n >= min {
			break
		}
		return false
	}
	if t.nonblank < t.i {
		t.setNonblank()
	}
	*s = t
	return true
}
 84 | 
 85 | func (s *line) trim(c byte) bool {
 86 | 	if s.spaces > 0 {
 87 | 		if c == ' ' {
 88 | 			s.spaces--
 89 | 			return true
 90 | 		}
 91 | 		return false
 92 | 	}
 93 | 	if s.i < len(s.text) && s.text[s.i] == c {
 94 | 		s.i++
 95 | 		if s.nonblank < s.i {
 96 | 			s.setNonblank()
 97 | 		}
 98 | 		return true
 99 | 	}
100 | 	return false
101 | }
102 | 
103 | func (s *line) skip(n int) {
104 | 	s.i += n
105 | 	if s.nonblank < s.i {
106 | 		s.setNonblank()
107 | 	}
108 | }
109 | 
110 | func (s *line) string() string {
111 | 	switch s.spaces {
112 | 	case 0:
113 | 		return s.text[s.i:]
114 | 	case 1:
115 | 		return " " + s.text[s.i:]
116 | 	case 2:
117 | 		return "  " + s.text[s.i:]
118 | 	case 3:
119 | 		return "   " + s.text[s.i:]
120 | 	}
121 | 	// unreachable
122 | 	panic("bad spaces")
123 | }
124 | 
// trimLeftSpaceTab returns s with all leading spaces and tabs removed.
func trimLeftSpaceTab(s string) string {
	for len(s) > 0 && (s[0] == ' ' || s[0] == '\t') {
		s = s[1:]
	}
	return s
}
132 | 
// trimRightSpaceTab returns s with all trailing spaces and tabs removed.
func trimRightSpaceTab(s string) string {
	for n := len(s); n > 0 && (s[n-1] == ' ' || s[n-1] == '\t'); n = len(s) {
		s = s[:n-1]
	}
	return s
}
140 | 
// trimSpaceTab returns s with all leading and trailing
// spaces and tabs removed.
func trimSpaceTab(s string) string {
	lo, hi := 0, len(s)
	for lo < hi && (s[lo] == ' ' || s[lo] == '\t') {
		lo++
	}
	for hi > lo && (s[hi-1] == ' ' || s[hi-1] == '\t') {
		hi--
	}
	return s[lo:hi]
}
153 | 
// trimSpace returns s with all leading and trailing spaces and tabs removed.
// Despite its name it trims no other kind of white space (newlines stay).
func trimSpace(s string) string {
	blank := func(c byte) bool { return c == ' ' || c == '\t' }
	for len(s) > 0 && blank(s[0]) {
		s = s[1:]
	}
	for len(s) > 0 && blank(s[len(s)-1]) {
		s = s[:len(s)-1]
	}
	return s
}
166 | 
// trimSpaceTabNewline returns s with all leading and trailing
// spaces, tabs, and newlines removed.
func trimSpaceTabNewline(s string) string {
	blank := func(c byte) bool {
		switch c {
		case ' ', '\t', '\n':
			return true
		}
		return false
	}
	lo, hi := 0, len(s)
	for lo < hi && blank(s[lo]) {
		lo++
	}
	for hi > lo && blank(s[hi-1]) {
		hi--
	}
	return s[lo:hi]
}
179 | 
180 | func (s *line) isBlank() bool {
181 | 	return s.nonblank == len(s.text)
182 | }
183 | 
184 | func (s *line) eof() bool {
185 | 	return s.i >= len(s.text)
186 | }
187 | 
188 | func (s *line) trimSpaceString() string {
189 | 	return s.text[s.nonblank:]
190 | }
191 | 
192 | func (s *line) trimString() string {
193 | 	if s.nonblank < s.i {
194 | 		panic("bad blank")
195 | 	}
196 | 	return trimSpaceTab(s.text[s.nonblank:])
197 | }
198 | 


--------------------------------------------------------------------------------
/footnote.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2024 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strconv"
  9 | 	"strings"
 10 | )
 11 | 
// A Footnote is a footnote definition:
// the label it was declared with and the blocks that make up its content.
type Footnote struct {
	Position
	Label  string  // label as written in the definition, not normalized
	Blocks []Block // content of the footnote
}
 17 | 
// A FootnoteLink is an [Inline] representing a reference,
// written [^label], to a footnote.
type FootnoteLink struct {
	Label    string    // label as written in the reference
	Footnote *Footnote // footnote being referred to
}
 22 | 
// A printedNote records the printing state of one footnote
// that has been referenced during output.
type printedNote struct {
	num  string    // decimal footnote number, assigned in order of first reference, starting at 1
	note *Footnote // the footnote itself
	refs []string  // one id per reference printed so far: num, then num-2, num-3, ...
}
 28 | 
// Inline marks FootnoteLink as an [Inline]; it does nothing.
func (*FootnoteLink) Inline() {}
 30 | 
 31 | func (x *Footnote) printed(p *printer) *printedNote {
 32 | 	if p.footnotes == nil {
 33 | 		p.footnotes = make(map[*Footnote]*printedNote)
 34 | 	}
 35 | 	pr, ok := p.footnotes[x]
 36 | 	if !ok {
 37 | 		pr = &printedNote{
 38 | 			num:  strconv.Itoa(len(p.footnotes) + 1),
 39 | 			note: x,
 40 | 		}
 41 | 		p.footnotes[x] = pr
 42 | 		p.footnotelist = append(p.footnotelist, pr)
 43 | 	}
 44 | 	ref := pr.num
 45 | 	if len(pr.refs) > 0 {
 46 | 		ref += "-" + strconv.Itoa(len(pr.refs)+1)
 47 | 	}
 48 | 	pr.refs = append(pr.refs, ref)
 49 | 	return pr
 50 | }
 51 | 
 52 | func (x *FootnoteLink) printHTML(p *printer) {
 53 | 	note := x.Footnote
 54 | 	if note == nil {
 55 | 		return
 56 | 	}
 57 | 	pr := note.printed(p)
 58 | 	ref := pr.refs[len(pr.refs)-1]
 59 | 	p.html(`<sup class="fn"><a id="fnref-`, ref, `" href="#fn-`, pr.num, `">`, pr.num, `</a></sup>`)
 60 | }
 61 | 
 62 | func (x *FootnoteLink) printMarkdown(p *printer) {
 63 | 	note := x.Footnote
 64 | 	if note == nil {
 65 | 		return
 66 | 	}
 67 | 	note.printed(p) // add to list for printFootnoteMarkdown
 68 | 	p.text(`[^`, x.Label, `]`)
 69 | }
 70 | 
 71 | func (x *FootnoteLink) printText(p *printer) {
 72 | 	p.text(`[^`, x.Label, `]`)
 73 | }
 74 | 
 75 | func printFootnoteHTML(p *printer) {
 76 | 	if len(p.footnotelist) == 0 {
 77 | 		return
 78 | 	}
 79 | 
 80 | 	p.html(`<div class="footnotes">Footnotes</div>`, "\n")
 81 | 	p.html("<ol>\n")
 82 | 	for num, note := range p.footnotelist {
 83 | 		num++
 84 | 		str := strconv.Itoa(num)
 85 | 		p.html(`<li id="fn-`, str, `">`, "\n")
 86 | 		for _, b := range note.note.Blocks {
 87 | 			b.printHTML(p)
 88 | 		}
 89 | 		if !p.eraseCloseP() {
 90 | 			p.html("<p>\n")
 91 | 		}
 92 | 		for _, ref := range note.refs {
 93 | 			p.html("\n", `<a class="fnref" href="#fnref-`, ref, `">↩</a>`)
 94 | 		}
 95 | 		p.html("</p>\n")
 96 | 		p.html("</li>\n")
 97 | 	}
 98 | 	p.html("</ol>\n")
 99 | }
100 | 
// printMarkdown prints x as a footnote definition:
// "[^label]: " followed by its blocks, with two spaces added to the
// line prefix for the duration of the blocks.
func (x *Footnote) printMarkdown(p *printer) {
	p.md(`[^`, x.Label, `]: `)
	// push runs now, extending the prefix; only pop is deferred.
	defer p.pop(p.push("  "))
	printMarkdownBlocks(x.Blocks, p)
}
106 | 
107 | func printFootnoteMarkdown(p *printer) {
108 | 	if len(p.footnotelist) == 0 {
109 | 		return
110 | 	}
111 | 
112 | 	p.maybeNL()
113 | 	for _, note := range p.footnotelist {
114 | 		p.nl()
115 | 		note.note.printMarkdown(p)
116 | 	}
117 | }
118 | 
119 | func parseFootnoteRef(p *parser, s string, start int) (x Inline, end int, ok bool) {
120 | 	if !p.Footnote || start+1 >= len(s) || s[start+1] != '^' {
121 | 		return
122 | 	}
123 | 	end = strings.Index(s[start:], "]")
124 | 	if end < 0 {
125 | 		return
126 | 	}
127 | 	end += start + 1
128 | 	label := s[start+2 : end-1]
129 | 	note, ok := p.footnotes[normalizeLabel(label)]
130 | 	if !ok {
131 | 		return
132 | 	}
133 | 	return &FootnoteLink{label, note}, end, true
134 | }
135 | 
136 | func startFootnote(p *parser, s line) (line, bool) {
137 | 	t := s
138 | 	t.trimSpace(0, 3, false)
139 | 	if !t.trim('[') || !t.trim('^') {
140 | 		return s, false
141 | 	}
142 | 	label := t.string()
143 | 	i := strings.Index(label, "]")
144 | 	if i < 0 || i+1 >= len(label) && label[i+1] != ':' {
145 | 		return s, false
146 | 	}
147 | 	label = label[:i]
148 | 	for j := 0; j < i; j++ {
149 | 		c := label[j]
150 | 		if c == ' ' || c == '\r' || c == '\n' || c == 0x00 || c == '\t' {
151 | 			return s, false
152 | 		}
153 | 	}
154 | 	t.skip(i + 2)
155 | 
156 | 	if _, ok := p.footnotes[normalizeLabel(label)]; ok {
157 | 		// Already have a footnote with this label.
158 | 		// cmark-gfm ignores all future references,
159 | 		// dropping them from the document,
160 | 		// but it seems more helpful to not treat it
161 | 		// as a footnote.
162 | 		p.corner = true
163 | 		return s, false
164 | 	}
165 | 
166 | 	fb := &footnoteBuilder{label}
167 | 	p.addBlock(fb)
168 | 	return t, true
169 | }
170 | 
// A footnoteBuilder is the block builder for a [Footnote] definition.
type footnoteBuilder struct {
	label string // label as written in the definition
}
174 | 
175 | func (b *footnoteBuilder) extend(p *parser, s line) (line, bool) {
176 | 	if !s.trimSpace(4, 4, true) {
177 | 		return s, false
178 | 	}
179 | 	return s, true
180 | }
181 | 
// build records the finished footnote in p.footnotes under its
// normalized label and returns an [Empty] in its place.
func (b *footnoteBuilder) build(p *parser) Block {
	if p.footnotes == nil {
		p.footnotes = make(map[string]*Footnote)
	}
	p.footnotes[normalizeLabel(b.label)] = &Footnote{p.pos(), b.label, p.blocks()}
	return &Empty{}
}
189 | 


--------------------------------------------------------------------------------
/print.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2024 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import "bytes"
  8 | 
// Values for printer.writeMode, selecting the output format.
// writeMarkdown is the zero value.
const (
	writeMarkdown = iota
	writeHTML
	writeText
)
 14 | 
// A printer accumulates formatted output in buf.
type printer struct {
	writeMode   int          // output format: writeMarkdown, writeHTML, or writeText
	buf         bytes.Buffer // output so far
	prefix      []byte       // written by nl at the start of each new line
	prefixOld   []byte       // prefix in effect at the most recent call to nl
	prefixOlder []byte       // prefix in effect at the call to nl before that
	trimLimit   int          // nl does not trim trailing spaces below this buffer length
	listOut
	footnotes    map[*Footnote]*printedNote // printing state of each footnote referenced so far
	footnotelist []*printedNote             // the same notes, in order of first reference
}
 26 | 
// A listOut holds list-printing state embedded in printer.
// NOTE(review): these fields are not used in this file; presumably they
// are maintained by the list printing code — confirm there.
type listOut struct {
	bullet rune
	num    int
	loose  int
	tight  int
}
 33 | 
 34 | func (w *printer) WriteStrings(list ...string) {
 35 | 	for _, s := range list {
 36 | 		w.WriteString(s)
 37 | 	}
 38 | }
 39 | 
 40 | func cutLastNL(text []byte) (prefix, last []byte) {
 41 | 	i := bytes.LastIndexByte(text, '\n')
 42 | 	if i < 0 {
 43 | 		return nil, text
 44 | 	}
 45 | 	return text[:i], text[i+1:]
 46 | }
 47 | 
 48 | func (b *printer) noTrim() {
 49 | 	b.trimLimit = len(b.buf.Bytes())
 50 | }
 51 | 
 52 | func (b *printer) nl() {
 53 | 	text := b.buf.Bytes()
 54 | 	for len(text) > b.trimLimit && text[len(text)-1] == ' ' {
 55 | 		text = text[:len(text)-1]
 56 | 	}
 57 | 	b.buf.Truncate(len(text))
 58 | 
 59 | 	b.buf.WriteByte('\n')
 60 | 	b.buf.Write(b.prefix)
 61 | 	b.prefixOlder, b.prefixOld = b.prefixOld, b.prefix
 62 | }
 63 | 
 64 | func (b *printer) maybeNL() bool {
 65 | 	// Starting a new block that may need a blank line before it
 66 | 	// to avoid being mixed into a previous block
 67 | 	// as paragraph continuation text.
 68 | 	//
 69 | 	// If the prefix on the current line (all of cur)
 70 | 	// is the same as the current continuation prefix
 71 | 	// (not first line of a list item)
 72 | 	// and the previous line started with the same prefix,
 73 | 	// then we need a blank line to avoid looking like
 74 | 	// paragraph continuation text.
 75 | 	before, cur := cutLastNL(b.buf.Bytes())
 76 | 	before, prev := cutLastNL(before)
 77 | 	if b.buf.Len() > 0 && bytes.Equal(cur, b.prefix) && bytes.HasPrefix(prev, b.prefix) {
 78 | 		b.nl()
 79 | 		return true
 80 | 	}
 81 | 	return true
 82 | }
 83 | 
 84 | func ToHTML(b Block) string {
 85 | 	var p printer
 86 | 	p.writeMode = writeHTML
 87 | 	b.printHTML(&p)
 88 | 	printFootnoteHTML(&p)
 89 | 	return p.buf.String()
 90 | }
 91 | 
// Format returns b printed as Markdown,
// followed by the definitions of the footnotes it references, if any.
func Format(b Block) string {
	var p printer // zero writeMode is writeMarkdown
	b.printMarkdown(&p)
	printFootnoteMarkdown(&p)
	// TODO footnotes?
	return p.buf.String()
}
 99 | 
100 | var closeP = []byte("</p>\n")
101 | 
102 | func (b *printer) eraseCloseP() bool {
103 | 	if bytes.HasSuffix(b.buf.Bytes(), closeP) {
104 | 		b.buf.Truncate(b.buf.Len() - len(closeP))
105 | 		return true
106 | 	}
107 | 	return false
108 | }
109 | 
110 | func (b *printer) maybeQuoteNL(quote byte) bool {
111 | 	// Starting a new quote block.
112 | 	// Make sure it doesn't look like it is part of a preceding quote block.
113 | 	before, cur := cutLastNL(b.buf.Bytes())
114 | 	before, prev := cutLastNL(before)
115 | 	if len(prev) >= len(cur)+1 && bytes.HasPrefix(prev, cur) && prev[len(cur)] == quote {
116 | 		b.nl()
117 | 		return true
118 | 	}
119 | 	return false
120 | }
121 | 
122 | func (b *printer) WriteByte(c byte) error {
123 | 	if c == '\n' {
124 | 		panic("Write \\n")
125 | 	}
126 | 	return b.buf.WriteByte(c)
127 | }
128 | 
129 | func (p *printer) Write(text []byte) (int, error) {
130 | 	if p.writeMode == writeMarkdown {
131 | 		for i := range text {
132 | 			if text[i] == '\n' {
133 | 				panic("Write \\n")
134 | 			}
135 | 		}
136 | 	}
137 | 	return p.buf.Write(text)
138 | }
139 | 
140 | func (p *printer) html(list ...string) {
141 | 	if p.writeMode != writeHTML {
142 | 		panic("raw HTML in non-HTML output")
143 | 	}
144 | 	for _, s := range list {
145 | 		p.buf.WriteString(s)
146 | 	}
147 | }
148 | 
149 | func (p *printer) text(list ...string) {
150 | 	if p.writeMode == writeHTML {
151 | 		for _, s := range list {
152 | 			htmlEscaper.WriteString(&p.buf, s)
153 | 		}
154 | 		return
155 | 	}
156 | 	for _, s := range list {
157 | 		p.buf.WriteString(s)
158 | 	}
159 | 
160 | }
161 | 
162 | func (p *printer) md(list ...string) {
163 | 	if p.writeMode != writeMarkdown {
164 | 		panic("markdown in non-markdown output")
165 | 	}
166 | 	for _, s := range list {
167 | 		p.buf.WriteString(s)
168 | 	}
169 | }
170 | 
171 | func (b *printer) WriteString(s string) (int, error) {
172 | 	if b.writeMode == writeMarkdown {
173 | 		for i := 0; i < len(s); i++ {
174 | 			if s[i] == '\n' {
175 | 				panic("Write \\n")
176 | 			}
177 | 		}
178 | 	}
179 | 	return b.buf.WriteString(s)
180 | }
181 | 
182 | func (b *printer) push(s string) int {
183 | 	n := len(b.prefix)
184 | 	b.prefix = append(b.prefix, s...)
185 | 	return n
186 | }
187 | 
// pop truncates the line prefix to length n,
// typically a value returned by an earlier push.
func (b *printer) pop(n int) {
	b.prefix = b.prefix[:n]
}
191 | 


--------------------------------------------------------------------------------
/big_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2023 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | 	"testing"
 11 | )
 12 | 
 13 | var rep = strings.Repeat
 14 | 
 15 | func repf(f func(int) string, n int) string {
 16 | 	out := make([]string, n)
 17 | 	for i := 0; i < n; i++ {
 18 | 		out[i] = f(i)
 19 | 	}
 20 | 	return strings.Join(out, "")
 21 | }
 22 | 
// Many cases here derived from cmark-gfm/test/pathological_tests.py

// bigTests lists large, mostly pathological inputs and the HTML they
// must render to. An empty out means the input is expected to render
// as a single paragraph holding the space-trimmed input unchanged
// (see TestBig).
var bigTests = []struct {
	name string
	in   string
	out  string
}{
	{
		"nested strong emph",
		rep("*a **a ", 65000) + "b" + rep(" a** a*", 65000),
		"<p>" + rep("<em>a <strong>a ", 65000) + "b" + rep(" a</strong> a</em>", 65000) + "</p>\n",
	},
	{
		"many emph closers with no openers",
		rep("a_ ", 65000),
		"",
	},
	{
		"many emph openers with no closers",
		rep("_a ", 65000),
		"",
	},
	{
		"many link closers with no openers",
		rep("a]", 65000),
		"",
	},
	{
		"many link openers with no closers",
		rep("[a", 65000),
		"",
	},
	{
		"mismatched openers and closers",
		rep("*a_ ", 50000),
		"",
	},
	{
		"openers and closers multiple of 3",
		"a**b" + rep("c* ", 50000),
		"",
	},
	{
		"link openers and emph closers",
		rep("[ a_", 50000),
		"",
	},
	{
		"pattern [ (]( repeated",
		rep("[ (](", 80000),
		"",
	},
	{
		"pattern ![[]() repeated",
		rep("![[]()", 160000),
		"<p>" + rep(`![<a href=""></a>`, 160000) + "</p>\n",
	},
	{
		"hard link/emph case",
		"**x [a*b**c*](d)",
		`<p>**x <a href="d">a<em>b**c</em></a></p>` + "\n",
	},
	{
		"nested brackets",
		rep("[", 50000) + "a" + rep("]", 50000),
		"",
	},
	{
		"nested block quotes",
		rep("> ", 50000) + "a",
		rep("<blockquote>\n", 50000) + "<p>a</p>\n" + rep("</blockquote>\n", 50000),
	},
	{
		"deeply nested lists",
		repf(func(x int) string { return rep("  ", x) + "* a\n" }, 4000),
		"<ul>\n" + rep("<li>a\n<ul>\n", 4000-1) + "<li>a</li>\n" + rep("</ul>\n</li>\n", 4000-1) + "</ul>\n",
	},
	{
		"backticks",
		repf(func(x int) string { return "e" + rep("`", x) }, 5000),
		"",
	},
	{
		"backticks2",
		repf(func(x int) string { return "e" + rep("`", 5000-x) }, 5000),
		"",
	},
	{
		"unclosed links A",
		rep("[a](<b", 30000),
		"<p>" + rep("[a](&lt;b", 30000) + "</p>\n",
	},
	{
		"unclosed links B",
		rep("[a](b", 30000),
		"",
	},
	{
		"unclosed links C",
		rep("[a](b\\#", 30000),
		"<p>" + rep("[a](b#", 30000) + "</p>\n",
	},
	{
		"unclosed <!--",
		"</" + rep(" <!--", 30000),
		"<p>&lt;/" + rep(" &lt;!--", 30000) + "</p>\n",
	},
	{
		"unclosed <?",
		"</" + rep(" <?", 30000),
		"<p>&lt;/" + rep(" &lt;?", 30000) + "</p>\n",
	},
	{
		"unclosed <!X",
		"</" + rep(" <!X", 30000),
		"<p>&lt;/" + rep(" &lt;!X", 30000) + "</p>\n",
	},
	{
		"unclosed <![CDATA[",
		"</" + rep(" <![CDATA[", 30000),
		"<p>&lt;/" + rep(" &lt;![CDATA[", 30000) + "</p>\n",
	},
	{
		"tables",
		rep("abc\ndef\n|-\n", 30000),
		"<p>abc</p>\n<table>\n<thead>\n<tr>\n<th>def</th>\n</tr>\n</thead>\n<tbody>\n" +
			rep("<tr>\n<td>abc</td>\n</tr>\n<tr>\n<td>def</td>\n</tr>\n<tr>\n<td>-</td>\n</tr>\n", 30000-1) +
			"</tbody>\n</table>\n",
	},
}
153 | 
// compress returns a shortened form of s for use in test failure messages.
// A run of more than two consecutive copies of a substring
// is replaced by «n:substring», where n is the number of copies.
// Only substrings shorter than 100 bytes are considered.
func compress(s string) string {
	var out []byte
	start := 0 // start of the text not yet copied to out
S:
	for i := 0; i+4 < len(s); i++ {
		c := s[i]
		// Each later occurrence of c at j suggests a repeating unit s[i:j].
		for j := i + 1; j < i+100 && j < len(s); j++ {
			if s[j] == c {
				n := 1     // copies found so far, counting the one at i
				w := j - i // width of the candidate unit
				for j+w <= len(s) && s[i:i+w] == s[j:j+w] {
					j += w
					n++
				}
				if n > 2 {
					out = append(out, s[start:i]...)
					out = fmt.Appendf(out, "«%d:%s»", n, s[i:i+w])
					// Resume scanning just past the run.
					start = j
					i = start - 1
					continue S
				}
			}
		}
	}
	out = append(out, s[start:]...)
	return string(out)
}
181 | 
182 | func TestBig(t *testing.T) {
183 | 	if testing.Short() {
184 | 		t.Skip("skipping in -short mode")
185 | 	}
186 | 	for _, tt := range bigTests {
187 | 		t.Run(tt.name, func(t *testing.T) {
188 | 			var p Parser
189 | 			p.Table = true
190 | 			doc := p.Parse(tt.in)
191 | 			out := ToHTML(doc)
192 | 			if tt.out == "" {
193 | 				tt.out = "<p>" + strings.TrimSpace(tt.in) + "</p>\n"
194 | 			}
195 | 			if out != tt.out {
196 | 				t.Fatalf("%s: ToHTML(%q):\nhave %q\nwant %q", tt.name, compress(tt.in), compress(out), compress(tt.out))
197 | 			}
198 | 		})
199 | 	}
200 | }
201 | 
202 | func bench(b *testing.B, text string) {
203 | 	for i := 0; i < b.N; i++ {
204 | 		var p Parser
205 | 		_ = ToHTML(p.Parse(text))
206 | 	}
207 | 	b.SetBytes(int64(len(text)))
208 | }
209 | 
210 | func BenchmarkBrackets(b *testing.B) {
211 | 	bench(b, rep("[", 10000)+"a"+rep("]", 10000))
212 | }
213 | 
214 | func BenchmarkDeepList(b *testing.B) {
215 | 	bench(b, repf(func(x int) string { return rep("  ", x) + "* a\n" }, 1000))
216 | }
217 | 
218 | func BenchmarkList(b *testing.B) {
219 | 	bench(b, repf(func(x int) string { return "* a\n" }, 1000))
220 | }
221 | 


--------------------------------------------------------------------------------
/para.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strings"
  9 | )
 10 | 
 11 | // TODO: unexport Empty?
 12 | 
// An Empty is a [Block] representing no block at all.
// The parser never returns a parse tree containing an Empty,
// but it can be useful during syntax editing.
// It does not render as anything at all.
type Empty struct {
	Position
}

// Block marks Empty as a [Block]; it does nothing.
func (*Empty) Block() {}

// printHTML prints nothing.
func (b *Empty) printHTML(p *printer) {}

// printMarkdown prints nothing.
func (b *Empty) printMarkdown(*printer) {}
 26 | 
// A Text is a [Block] holding a sequence of inline elements.
// It prints as those elements alone, with no markup of its own.
type Text struct {
	Position
	Inline Inlines // the inline content
}

// Block marks Text as a [Block]; it does nothing.
// TODO: This is only a Block for tight lists. Maybe keep the Paragraphs for those?
func (*Text) Block() {}
 34 | 
 35 | func (b *Text) printHTML(p *printer) {
 36 | 	for _, x := range b.Inline {
 37 | 		x.printHTML(p)
 38 | 	}
 39 | }
 40 | 
 41 | func (b *Text) printMarkdown(p *printer) {
 42 | 	for _, x := range b.Inline {
 43 | 		x.printMarkdown(p)
 44 | 	}
 45 | }
 46 | 
// A Paragraph is a [Block] representing a [paragraph].
// Except when they appear as top-level blocks in an item of a tight list,
// paragraphs render in <p>...</p> tags.
//
// [paragraph]: https://spec.commonmark.org/0.31.2/#paragraphs
type Paragraph struct {
	Position
	Text *Text // content of the paragraph
}

// Block marks Paragraph as a [Block]; it does nothing.
func (*Paragraph) Block() {}
 58 | 
 59 | func (b *Paragraph) printHTML(p *printer) {
 60 | 	p.html("<p>")
 61 | 	b.Text.printHTML(p)
 62 | 	p.html("</p>\n")
 63 | }
 64 | 
 65 | func (b *Paragraph) printMarkdown(p *printer) {
 66 | 	p.maybeNL()
 67 | 	b.Text.printMarkdown(p)
 68 | }
 69 | 
 70 | // A paraBuilder is a [blockBuilder] for a [Paragraph] or, when table is set, for a table.
 71 | type paraBuilder struct {
 72 | 	text  []string      // each line of the paragraph, as returned by line.trimSpaceString
 73 | 	table *tableBuilder // non-nil when collecting a table; build then returns table.build(p)
 74 | }
 75 | 
 76 | // startParagraph is a [starter] for a [Paragraph]. Every path consumes s and returns line{}, true.
 77 | func startParagraph(p *parser, s line) (line, bool) {
 78 | 	// Process paragraph continuation text or start new paragraph.
 79 | 	b := p.para()
 80 | 	indented := p.lineDepth == len(p.stack)-2 // fully indented, not playing "paragraph continuation text" games
 81 | 	text := s.trimSpaceString()
 82 | 
 83 | 	if b != nil && b.table != nil {
 84 | 		if indented && text != "" && text != "|" {
 85 | 			// Continue table.
 86 | 			b.table.addRow(text)
 87 | 			return line{}, true
 88 | 		}
 89 | 		// Blank or unindented line ends table.
 90 | 		// (So does a new block structure, but the caller has checked that already.)
 91 | 		// So does a line with just a pipe:
 92 | 		// https://github.com/github/cmark-gfm/pull/127 and
 93 | 		// https://github.com/github/cmark-gfm/pull/128
 94 | 		// fixed a buffer overread by rejecting | by itself as a table line.
 95 | 		// That seems to violate the spec, but we will play along.
 96 | 		b = nil // forget the table; this line then starts a new paragraph below
 97 | 	}
 98 | 
 99 | 	// If we are looking for tables and this is a table start, start a table.
100 | 	if p.Table && b != nil && indented && len(b.text) > 0 && isTableStart(b.text[len(b.text)-1], text) {
101 | 		// The current line s is the delimiter line.
102 | 		// The previous line in the paragraph is the header line.
103 | 		// Take the header line out of the current paragraph and
104 | 		// start a new paragraph that will be only the table.
105 | 		// Removing the last line from b may result in an empty paragraph.
106 | 		// That is handled by [paraBuilder.build].
107 | 		//
108 | 		// TODO: Why not make tableBuilder its own builder?
109 | 		// It seems like that would work (tables don't get paragraph continuation text).
110 | 		hdr := b.text[len(b.text)-1]
111 | 		b.text = b.text[:len(b.text)-1]
112 | 		tb := new(paraBuilder)
113 | 		p.addBlock(tb)
114 | 		tb.table = new(tableBuilder)
115 | 		tb.table.start(hdr, text)
116 | 		return line{}, true
117 | 	}
118 | 
119 | 	if b != nil {
120 | 		for i := p.lineDepth; i < len(p.stack); i++ {
121 | 			p.stack[i].pos.EndLine = p.lineno // stack entries from lineDepth on now end at the current line
122 | 		}
123 | 	} else {
124 | 		// Note: Ends anything without a matching prefix.
125 | 		b = new(paraBuilder)
126 | 		p.addBlock(b)
127 | 	}
128 | 	b.text = append(b.text, text)
129 | 	return line{}, true
130 | }
131 | 
132 | // extend would normally extend the paragraph with the line s,
133 | // but it always returns s, false so that startParagraph handles extension,
134 | // which it must for “paragraph continuation text” anyway.
135 | func (b *paraBuilder) extend(p *parser, s line) (line, bool) {
136 | 	return s, false
137 | }
138 | 
139 | func (b *paraBuilder) build(p *parser) Block { // returns b.table's block, an *Empty, or a *Paragraph
140 | 	// If this paragraph is actually a table, build the table instead.
141 | 	if b.table != nil {
142 | 		return b.table.build(p)
143 | 	}
144 | 
145 | 	// Join all the lines (leading framing already removed)
146 | 	// to produce the full string of the paragraph.
147 | 	// In theory the join could be avoided by having [parser.inline]
148 | 	// handle a slice of lines, but then all the [inlineParser] implementations
149 | 	// would need to do that too, which would complicate them.
150 | 	// The join is simple.
151 | 	s := strings.Join(b.text, "\n")
152 | 
153 | 	// Parse and remove any link reference definitions at the start of s.
154 | 	for s != "" {
155 | 		end, ok := parseLinkRefDef(p, s)
156 | 		if !ok {
157 | 			break
158 | 		}
159 | 		s = s[skipSpace(s, end):] // presumably resumes after the definition and the space following it
160 | 	}
161 | 
162 | 	// If the paragraph is empty, return an Empty.
163 | 	// This can happen if the text was entirely link reference definitions,
164 | 	// but it can also happen if there is no paragraph text before a table.
165 | 	if s == "" {
166 | 		return &Empty{p.pos()}
167 | 	}
168 | 
169 | 	// Recompute EndLine because the last line of b.text
170 | 	// might have been removed to start a table.
171 | 	pos := p.pos()
172 | 	pos.EndLine = pos.StartLine + len(b.text) - 1
173 | 	return &Paragraph{
174 | 		pos,
175 | 		p.newText(pos, s),
176 | 	}
177 | }
178 | 


--------------------------------------------------------------------------------
/testdata/basic_fmt.txt:
--------------------------------------------------------------------------------
  1 | Test cases for Format.
  2 | -- parser.json --
  3 | {"Strikethrough": true}
  4 | -- one --
  5 | A single line.
  6 | -- one_blank --
  7 | A single line with a blank line.
  8 | 
  9 | -- want --
 10 | A single line with a blank line.
 11 | -- paragraphs --
 12 | This is the first paragraph
 13 | spanning multiple lines.
 14 | 
 15 | Here is another paragraph.
 16 | -- emph --
 17 | one _two_ *three* __four__ plain text **five** _*six*_ _seven 8 9_
 18 | -- escaped --
 19 | one \_two_ *three\* \\ \[text]
 20 | -- code --
 21 | The output is `hello,` `world`.
 22 | -- link --
 23 | A [link with no title](http://a)
 24 | [with title](http://b "title")
 25 | [single quoted](http://c 'title')
 26 | [parens](https://d (title))
 27 | -- link_spacing --
 28 | [with title](http://b   "title")
 29 | [single quoted](http://c	'title')
 30 | [parens](https://d   (title))
 31 | -- want --
 32 | [with title](http://b "title")
 33 | [single quoted](http://c 'title')
 34 | [parens](https://d (title))
 35 | -- autolink --
 36 | Contact <mailto:me@gmail.com>.
 37 | -- image --
 38 | ![image](http://a "title")
 39 | -- htmltag --
 40 | Using <i>italics</i> and <code>code</code>.
 41 | -- hardbreak --
 42 | foo\
 43 | baz
 44 | -- hardbreak_emph --
 45 | *foo\
 46 | bar*
 47 | -- thematic --
 48 | ***
 49 | -- thematic2 --
 50 |    ------
 51 | -- want --
 52 | ***
 53 | -- thematic3 --
 54 | First theme.
 55 | 
 56 | ***
 57 | 
 58 | Second theme.
 59 | -- paragraphs --
 60 | First paragraph,
 61 | spanning two lines.
 62 | 
 63 | Here's a second one.
 64 | -- heading1 --
 65 | # H1
 66 | 
 67 | Content.
 68 | -- heading2 --
 69 | ## H2 some  `code` ####
 70 | 
 71 | More.
 72 | 
 73 | ### H3
 74 | No space.
 75 | -- want --
 76 | ## H2 some  `code`
 77 | 
 78 | More.
 79 | 
 80 | ### H3
 81 | 
 82 | No space.
 83 | -- heading3 --
 84 | ****
 85 | ## foo
 86 | ****
 87 | -- want --
 88 | ***
 89 | 
 90 | ## foo
 91 | 
 92 | ***
 93 | -- codeblock1 --
 94 | As shown here:
 95 | ```
 96 | func f(int)
 97 | ```
 98 | And more.
 99 | -- want --
100 | As shown here:
101 | 
102 | ```
103 | func f(int)
104 | ```
105 | 
106 | And more.
107 | -- codeblock2 --
108 | As shown here:
109 | 
110 | ```
111 | func f(int)
112 | ```
113 | 
114 | And more.
115 | -- codeblock3 --
116 | Indented
117 | 
118 |     func f(int)
119 | 
120 | done.
121 | -- codeblock4 --
122 | Indented
123 | 
124 | 	func f(int)
125 | 
126 | done.
127 | -- want --
128 | Indented
129 | 
130 |     func f(int)
131 | 
132 | done.
133 | -- codeblock5 --
134 | tildes
135 | ~~~~~
136 | func f(int)
137 | ~~~~~
138 | done
139 | -- want --
140 | tildes
141 | 
142 | ~~~~~
143 | func f(int)
144 | ~~~~~
145 | 
146 | done
147 | -- codeblock6 --
148 | ```go
149 | func f(int)
150 | ```
151 | -- htmlblock --
152 | Literally,
153 | <pre>
154 |   something
155 | </pre>
156 | Done.
157 | -- want --
158 | Literally,
159 | 
160 | <pre>
161 |   something
162 | </pre>
163 | 
164 | Done.
165 | -- list_tight --
166 |   - one
167 |   - two
168 |   - three
169 | -- list_loose --
170 |   - one
171 | 
172 |   - two
173 | 
174 |   - three
175 | -- numlist_tight --
176 |  1. Lather.
177 |  2. Rinse.
178 |  3. Repeat.
179 | -- numlist_tight_renumber --
180 |  1) Lather.
181 |  1) Rinse.
182 |  2) Repeat.
183 | -- want --
184 |  1) Lather.
185 |  2) Rinse.
186 |  3) Repeat.
187 | -- numlist_loose --
188 |  1. Lather
189 |  2. Rinse
190 | 
191 |  3. Repeat
192 | -- want --
193 |  1. Lather
194 | 
195 |  2. Rinse
196 | 
197 |  3. Repeat
198 | -- list_lines_tight --
199 |   - A first
200 |     item.
201 |   - Items can span
202 |     lines.
203 |   - Or not.
204 | -- list_lines_loose --
205 |   - A first
206 |     item.
207 | 
208 |   - Items can span
209 |     lines.
210 | 
211 |   - Or not.
212 | -- list_nest --
213 |   - Intro
214 |      1. first
215 |      2. second
216 |   - Middle
217 |       * alpha
218 |   - End
219 | -- list_indent --
220 | We have:
221 | 
222 |   - First.
223 |   - Second.
224 | -- quote1 --
225 | As Celine said:
226 | > Love is the infinite
227 | placed within the reach of poodles.
228 | 
229 | The end.
230 | -- want --
231 | As Celine said:
232 | > Love is the infinite
233 | > placed within the reach of poodles.
234 | 
235 | The end.
236 | -- quote2 --
237 | > Anything can be quoted.
238 | >
239 | > ### like a heading
240 | >
241 | > Or a list:
242 | >
243 | >   - a
244 | >   - b
245 | >
246 | > ```
247 | > some code
248 | > ```
249 | -- quote3 --
250 |    > drop indentation
251 | -- want --
252 | > drop indentation
253 | -- inline_code1 --
254 | ` \[\` `
255 | -- inline_code2 --
256 | ``\[\` ``
257 | -- want --
258 | ``\[\` ``
259 | -- inline_code3 --
260 | `` \[\` ``
261 | -- want --
262 | `` \[\` ``
263 | -- parser.json --
264 | {"Strikethrough": true}
265 | -- strike1 --
266 | hello ~~world~~
267 | -- parser.json --
268 | {"Strikethrough": false}
269 | -- strike2 --
270 | hello ~~world~~
271 | -- parser.json --
272 | {"Emoji": true}
273 | -- fire1 --
274 | hello :fire:
275 | 
276 | ![:fire:](fire.jpg)
277 | -- want --
278 | hello 🔥
279 | 
280 | ![🔥](fire.jpg)
281 | -- parser.json --
282 | {"Emoji": false}
283 | -- fire2 --
284 | hello :fire:
285 | -- tabspace --
286 | >	tab
287 | > 	space tab
288 | >  	space space tab
289 | >   	space space space tab
290 | 
291 | - list
292 |  space
293 |  	space tab
294 | 	tab
295 | 
296 |  - list
297 |  space
298 |  	space tab
299 | 	tab
300 | 
301 |   - list
302 |  space
303 |  	space tab
304 | 	tab
305 | 
306 | - list
307 |   - sublist
308 | 	tab
309 | -- want --
310 | > tab
311 | > space tab
312 | > space space tab
313 | > space space space tab
314 | 
315 |   - list
316 |     space
317 |     space tab
318 |     tab
319 | 
320 |   - list
321 |     space
322 |     space tab
323 |     tab
324 | 
325 |   - list
326 |     space
327 |     space tab
328 |     tab
329 | 
330 |   - list
331 | 
332 |       - sublist
333 |         tab
334 | -- escaping --
335 | <!-- https://go.dev/issue/50332 -->
336 | The `go` subcommands now accept
337 | `-C` `<dir>` to change directory to \<dir>
338 | before performing the command, which may be useful for scripts that need to
339 | execute commands in multiple different modules.
340 | -- want --
341 | <!-- https://go.dev/issue/50332 -->
342 | 
343 | The `go` subcommands now accept
344 | `-C` `<dir>` to change directory to \<dir>
345 | before performing the command, which may be useful for scripts that need to
346 | execute commands in multiple different modules.
347 | 


--------------------------------------------------------------------------------
/heading.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strings"
 10 | )
 11 | 
 12 | // A Heading is a [Block] representing an [ATX heading] or
 13 | // [Setext heading], usually displayed with the <h1> through <h6> tags.
 14 | //
 15 | // [ATX heading]: https://spec.commonmark.org/0.31.2/#atx-headings
 16 | // [Setext heading]: https://spec.commonmark.org/0.31.2/#setext-headings
 17 | type Heading struct {
 18 | 	Position
 19 | 
 20 | 	// Level is the heading level: 1 through 6.
 21 | 	// Other values are clamped to the valid range when printing.
 22 | 	Level int
 23 | 
 24 | 	// Text is the text of the heading.
 25 | 	Text *Text
 26 | 
 27 | 	// ID is the HTML id attribute; when empty, no id is printed.
 28 | 	// The parser populates this field if [Parser.HeadingID] is true
 29 | 	// and the heading ends with text like "{#id}".
 30 | 	ID string
 31 | }
 32 | 
 33 | func (*Heading) Block() {} // empty body; presumably satisfies the Block interface
 34 | 
 35 | // level reports the level used for printing: Level limited to the range [1, 6].
 36 | func (b *Heading) level() int {
 37 | 	return min(max(b.Level, 1), 6)
 38 | }
 39 | 
 40 | // printHTML prints the heading as an <hN> element, where N is b.level().
 41 | // When ID is not empty it is passed through htmlEscaper and printed as the id attribute.
 42 | func (b *Heading) printHTML(p *printer) {
 43 | 	level := b.level()
 44 | 	fmt.Fprintf(p, "<h%d", level)
 45 | 	if id := b.ID; id != "" {
 46 | 		fmt.Fprintf(p, ` id="%s"`, htmlEscaper.Replace(id))
 47 | 	}
 48 | 	p.WriteByte('>')
 49 | 	b.Text.printHTML(p)
 50 | 	fmt.Fprintf(p, "</h%d>\n", level)
 51 | }
 49 | 
 50 | // printMarkdown prints the heading in ATX form: b.level() '#' characters,
 51 | // a space, the heading text, and " {#ID}" when ID is not empty.
 52 | func (b *Heading) printMarkdown(p *printer) {
 53 | 	p.maybeNL()
 54 | 
 55 | 	// TODO: handle setext headings properly.
 56 | 	for range b.level() {
 57 | 		p.WriteByte('#')
 58 | 	}
 59 | 	p.WriteByte(' ')
 60 | 	b.Text.printMarkdown(p)
 61 | 	if id := b.ID; id != "" {
 62 | 		fmt.Fprintf(p, " {#%s}", id)
 63 | 	}
 64 | }
 63 | 
 64 | // startATXHeading is a [starter] for an ATX [Heading], like "## Heading".
 65 | // If s does not begin with an ATX prefix it returns s, false;
 66 | // otherwise it records the finished heading and returns line{}, true.
 67 | //
 68 | // See https://spec.commonmark.org/0.31.2/#atx-headings.
 69 | func startATXHeading(p *parser, s line) (line, bool) {
 70 | 	level, ok := trimATX(&s)
 71 | 	if !ok {
 72 | 		return s, false
 73 | 	}
 74 | 
 75 | 	// Drop trailing spaces and tabs, then a closing run of '#'s,
 76 | 	// but only if that run is preceded by a space or tab
 77 | 	// or makes up the whole heading text.
 78 | 	text := trimRightSpaceTab(s.string())
 79 | 	if stripped := strings.TrimRight(text, "#"); stripped == "" || stripped != trimRightSpaceTab(stripped) {
 80 | 		text = stripped
 81 | 	}
 82 | 
 83 | 	// Extension: parse and remove the ID attribute when enabled.
 84 | 	// It must come before trailing '#'s to more closely follow the spec:
 85 | 	//    The optional closing sequence of #s must be preceded by spaces or tabs
 86 | 	//    and may be followed by spaces or tabs only.
 87 | 	// But Goldmark allows it to come after.
 88 | 	var id string
 89 | 	if p.HeadingID {
 90 | 		text, id = trimHeadingID(p, text)
 91 | 	}
 92 | 
 93 | 	pos := Position{StartLine: p.lineno, EndLine: p.lineno}
 94 | 	heading := &Heading{
 95 | 		Position: pos,
 96 | 		Level:    level,
 97 | 		Text:     p.newText(pos, text),
 98 | 		ID:       id,
 99 | 	}
100 | 	p.doneBlock(heading) // TODO rename doneBlock?
101 | 	return line{}, true
102 | }
 94 | 
 95 | // trimHeadingID trims an {#id} suffix from s if one is present,
 96 | // returning the prefix before the {#id} and the id.
 97 | // If there is no {#id} suffix, trimHeadingID returns s, "".
 98 | // The {#id} suffix can be followed by spaces or tabs, which are
 99 | // ignored and discarded.
100 | // It sets p.corner for the forms {} and {#} and for ids that contain
101 | // anything other than ASCII letters and digits.
102 | func trimHeadingID(p *parser, s string) (text, id string) {
103 | 	open := strings.LastIndexByte(s, '{')
104 | 	if open < 0 {
105 | 		return s, ""
106 | 	}
107 | 	rel := strings.IndexByte(s[open:], '}')
108 | 	if rel < 0 {
109 | 		return s, ""
110 | 	}
111 | 	end := open + rel
112 | 	if trimRightSpaceTab(s[end+1:]) != "" {
113 | 		return s, ""
114 | 	}
115 | 
116 | 	inside := s[open+1 : end] // text between the braces
117 | 	if inside == "" || inside == "#" {
118 | 		p.corner = true // goldmark accepts {} and {#}
119 | 		return s, ""
120 | 	}
121 | 	if inside[0] != '#' {
122 | 		return s, ""
123 | 	}
124 | 	text, id = s[:open], strings.TrimSpace(inside[1:]) // TODO maybe trimSpace?
125 | 
126 | 	// Goldmark is strict about the id syntax.
127 | 	for k := 0; k < len(id); k++ {
128 | 		if ch := id[k]; ch >= 0x80 || !isLetterDigit(ch) {
129 | 			p.corner = true
130 | 		}
131 | 	}
132 | 	return text, id
133 | }
128 | 
129 | // startSetextHeading is a [starter] for a Setext [Heading], which is an
130 | // underlined paragraph of text. The paragraph is assumed to have
131 | // been parsed already; startSetextHeading looks for the underline.
132 | //
133 | // See https://spec.commonmark.org/0.31.2/#setext-headings.
134 | func startSetextHeading(p *parser, s line) (line, bool) {
135 | 	// Topmost block must be a paragraph.
136 | 	if p.nextB() != p.para() {
137 | 		return s, false
138 | 	}
139 | 
140 | 	// Need Setext underline.
141 | 	t := s // work on a copy so s can be returned untouched on failure
142 | 	level, ok := trimSetext(&t)
143 | 	if !ok {
144 | 		return s, false
145 | 	}
146 | 
147 | 	// The Setext heading forces an end-of-paragraph,
148 | 	// but this still may not be a Setext heading if the paragraph
149 | 	// closer decides this wasn't a paragraph after all.
150 | 	// Might turn out to be a link reference, for example.
151 | 	// Close active paragraph to find out.
152 | 	p.closeBlock()
153 | 	para, ok := p.last().(*Paragraph)
154 | 	if !ok {
155 | 		// Paragraph text didn't end in a paragraph after all.
156 | 		// Leave underline text for processing by something else.
157 | 		return s, false
158 | 	}
159 | 
160 | 	p.deleteLast() // the heading built from para's text replaces the paragraph
161 | 	p.doneBlock(&Heading{Position{para.StartLine, p.lineno}, level, para.Text, ""})
162 | 	return line{}, true
163 | }
164 | 
165 | // trimATX trims an ATX heading prefix
166 | // (optional spaces and then 1-6 #s followed by a space) from s,
167 | // reporting the heading level and whether it was successful.
168 | // If trimATX is unsuccessful, it leaves s unmodified.
169 | func trimATX(s *line) (level int, ok bool) {
170 | 	rest := *s
171 | 	rest.trimSpace(0, 3, false)
172 | 
173 | 	// Count up to six '#' characters; at least one is required.
174 | 	for level < 6 && rest.trim('#') {
175 | 		level++
176 | 	}
177 | 	if level == 0 {
178 | 		return 0, false
179 | 	}
180 | 	if !rest.trimSpace(1, 1, true) {
181 | 		return 0, false
182 | 	}
183 | 
184 | 	*s = rest
185 | 	return level, true
186 | }
185 | 
186 | // trimSetext trims a Setext heading underline
187 | // (optional spaces and then only -'s or ='s
188 | // followed by optional spaces and EOL) from s,
189 | // reporting the heading level and whether it was successful:
190 | // an underline of ='s is level 1 and an underline of -'s is level 2.
191 | // If trimSetext is unsuccessful, it leaves s unmodified.
192 | func trimSetext(s *line) (level int, ok bool) {
193 | 	rest := *s
194 | 	rest.trimSpace(0, 3, false)
195 | 
196 | 	marker := rest.peek()
197 | 	switch marker {
198 | 	case '=':
199 | 		level = 1
200 | 	case '-':
201 | 		level = 2
202 | 	default:
203 | 		return 0, false
204 | 	}
205 | 
206 | 	// Consume the whole run of marker characters.
207 | 	for rest.trim(marker) {
208 | 	}
209 | 	rest.skipSpace()
210 | 	if !rest.eof() {
211 | 		return 0, false
212 | 	}
213 | 
214 | 	*s = line{}
215 | 	return level, true
216 | }
211 | 


--------------------------------------------------------------------------------
/code.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strings"
  9 | )
 10 | 
 11 | // A CodeBlock is a [Block] representing an [indented code block]
 12 | // or [fenced code block],
 13 | // usually displayed in <pre><code> tags.
 14 | //
 15 | // When printing a CodeBlock as Markdown, the Fence field is meant to be
 16 | // a starting hint that is made longer as needed if the suggested fence text
 17 | // appears in Text; printMarkdown currently writes Fence unchanged (see its TODO).
 18 | //
 19 | // [indented code block]: https://spec.commonmark.org/0.31.2/#indented-code-blocks
 20 | // [fenced code block]: https://spec.commonmark.org/0.31.2/#fenced-code-blocks
 21 | type CodeBlock struct {
 22 | 	Position
 23 | 	Fence string   // fence to use; empty for an indented code block
 24 | 	Info  string   // info following open fence
 25 | 	Text  []string // lines of code block
 26 | }
 27 | 
 28 | func (*CodeBlock) Block() {} // empty body; presumably satisfies the Block interface
 29 | 
 30 | // printHTML prints the code block as <pre><code>...</code></pre>,
 31 | // writing each line of Text followed by a newline.
 32 | // When Info is not empty, the part of it before the first Unicode space
 33 | // is printed in a class="language-..." attribute.
 34 | func (b *CodeBlock) printHTML(p *printer) {
 35 | 	p.html("<pre><code")
 36 | 	if lang := b.Info; lang != "" {
 37 | 		// https://spec.commonmark.org/0.31.2/#info-string
 38 | 		// “The first word of the info string is typically used to
 39 | 		// specify the language of the code sample...”
 40 | 		// No definition of what “first word” means though.
 41 | 		// The Dingus splits on isUnicodeSpace, but Goldmark only uses space.
 42 | 		if end := strings.IndexFunc(lang, isUnicodeSpace); end >= 0 {
 43 | 			lang = lang[:end]
 44 | 		}
 45 | 		p.html(` class="language-`)
 46 | 		p.text(lang)
 47 | 		p.html(`"`)
 48 | 	}
 49 | 	p.WriteString(">")
 50 | 	for _, text := range b.Text {
 51 | 		p.text(text, "\n")
 52 | 	}
 53 | 	p.html("</code></pre>\n")
 54 | }
 55 | 
 56 | // printMarkdown prints the code block as Markdown.
 57 | // A block with a Fence is printed between two copies of that fence,
 58 | // with Info after the opening one; a block without a Fence is printed
 59 | // as an indented code block, each line prefixed by four spaces.
 60 | func (b *CodeBlock) printMarkdown(p *printer) {
 61 | 	if b.Fence != "" {
 62 | 		// TODO compute correct fence
 63 | 		if p.tight == 0 {
 64 | 			p.maybeNL()
 65 | 		}
 66 | 		p.md(b.Fence)
 67 | 		p.md(b.Info)
 68 | 		for _, text := range b.Text {
 69 | 			p.nl()
 70 | 			p.md(text)
 71 | 			p.noTrim()
 72 | 		}
 73 | 		p.nl()
 74 | 		p.md(b.Fence)
 75 | 		return
 76 | 	}
 77 | 
 78 | 	// Indented code block.
 79 | 	p.maybeNL()
 80 | 	for i, text := range b.Text {
 81 | 		if i > 0 {
 82 | 			p.nl()
 83 | 		}
 84 | 		p.md("    ")
 85 | 		p.md(text)
 86 | 		p.noTrim()
 87 | 	}
 88 | }
 83 | 
 84 | // startIndentedCodeBlock is a [starter] for an indented [CodeBlock].
 85 | // It declines (returning s, false) while a paragraph is open, or when
 86 | // the line does not start with 4 spaces or is blank after them.
 87 | // See https://spec.commonmark.org/0.31.2/#indented-code-blocks.
 88 | func startIndentedCodeBlock(p *parser, s line) (line, bool) {
 89 | 	if p.para() != nil {
 90 | 		return s, false
 91 | 	}
 92 | 
 93 | 	// Line must start with 4 spaces and then not be blank.
 94 | 	rest := s
 95 | 	if !rest.trimSpace(4, 4, false) {
 96 | 		return s, false
 97 | 	}
 98 | 	if rest.isBlank() {
 99 | 		return s, false
100 | 	}
101 | 
102 | 	b := new(indentBuilder)
103 | 	p.addBlock(b)
104 | 	p.corner = p.corner || rest.nl != '\n' // goldmark does not normalize to \n
105 | 	b.text = []string{rest.string()}
106 | 	return line{}, true
107 | }
101 | 
102 | // startFencedCodeBlock is a [starter] for a fenced [CodeBlock].
103 | // If s does not start with a fence it returns s, false; otherwise it
104 | // opens a fenceBuilder for the block and returns line{}, true.
105 | // See https://spec.commonmark.org/0.31.2/#fenced-code-blocks.
106 | func startFencedCodeBlock(p *parser, s line) (line, bool) {
107 | 	// Line must start with fence.
108 | 	indent, fence, info, ok := trimFence(&s)
109 | 	if !ok {
110 | 		return s, false
111 | 	}
112 | 
113 | 	// Note presence of corner cases, for testing.
114 | 	// All of them involve a non-empty info string.
115 | 	if info != "" {
116 | 		switch {
117 | 		case fence[0] == '~':
118 | 			// goldmark does not handle info after ~~~
119 | 			p.corner = true
120 | 		case !isLetter(info[0]):
121 | 			// goldmark does not allow numbered info.
122 | 			// goldmark does not treat a tab as introducing a new word.
123 | 			p.corner = true
124 | 		}
125 | 		// goldmark only breaks on space
126 | 		if i := strings.IndexFunc(info, isUnicodeSpace); i >= 0 && info[i] != ' ' {
127 | 			p.corner = true
128 | 		}
129 | 	}
130 | 
131 | 	p.addBlock(&fenceBuilder{
132 | 		indent: indent,
133 | 		fence:  fence,
134 | 		info:   info,
135 | 	})
136 | 	return line{}, true
137 | }
133 | 
134 | // trimFence attempts to trim leading indentation (up to 3 spaces),
135 | // a code fence (3 or more of the same ` or ~), and an info string from s.
136 | // If successful, it returns those values and ok=true, leaving s empty.
137 | // If unsuccessful, it leaves s unmodified and returns ok=false.
138 | func trimFence(s *line) (indent int, fence, info string, ok bool) {
139 | 	t := *s
140 | 	indent = 0
141 | 	for indent < 3 && t.trimSpace(1, 1, false) {
142 | 		indent++
143 | 	}
144 | 	c := t.peek()
145 | 	if c != '`' && c != '~' {
146 | 		return // ok is false; note that indent may already be nonzero
147 | 	}
148 | 
149 | 	f := t.string() // text starting at the fence characters; fence is sliced from it below
150 | 	n := 0
151 | 	for t.trim(c) {
152 | 		n++
153 | 	}
154 | 	if n < 3 {
155 | 		return
156 | 	}
157 | 
158 | 	txt := mdUnescaper.Replace(t.trimString())
159 | 	if c == '`' && strings.Contains(txt, "`") {
160 | 		return // after a backtick fence, text containing a backtick is rejected
161 | 	}
162 | 	info = trimSpaceTab(txt)
163 | 	fence = f[:n]
164 | 	ok = true
165 | 	*s = line{}
166 | 	return
167 | }
168 | 
169 | // An indentBuilder is a [blockBuilder] for an indented (unfenced) [CodeBlock].
170 | type indentBuilder struct {
171 | 	indent string   // NOTE(review): not set or read by the code in this file; confirm it is still needed
172 | 	text   []string // code lines, each with its leading 4 spaces already trimmed
173 | }
174 | 
175 | // extend adds s to the code block when s starts with 4 spaces or is blank,
176 | // returning line{}, true. Any other line is returned with false.
177 | func (b *indentBuilder) extend(p *parser, s line) (line, bool) {
178 | 	// Extension lines must start with 4 spaces or be blank.
179 | 	if ok := s.trimSpace(4, 4, true); !ok {
180 | 		return s, false
181 | 	}
182 | 	p.corner = p.corner || s.nl != '\n' // goldmark does not normalize to \n
183 | 	b.text = append(b.text, s.string())
184 | 	return line{}, true
185 | }
186 | 
187 | // build returns the indented [CodeBlock] (empty Fence and Info)
188 | // holding the collected lines without trailing blank lines.
189 | func (b *indentBuilder) build(p *parser) Block {
190 | 	// Trailing blank lines are often used just to separate
191 | 	// the indented code block from what follows, so drop them.
192 | 	n := len(b.text)
193 | 	for n > 0 && b.text[n-1] == "" {
194 | 		n--
195 | 	}
196 | 	b.text = b.text[:n]
197 | 	return &CodeBlock{Position: p.pos(), Text: b.text}
198 | }
195 | 
196 | // A fenceBuilder is a [blockBuilder] for a fenced [CodeBlock].
197 | type fenceBuilder struct {
198 | 	indent int      // indentation of the opening fence, 0 to 3, as counted by trimFence
199 | 	fence  string   // the opening fence characters
200 | 	info   string   // info string that followed the opening fence
201 | 	text   []string // lines collected so far
202 | }
203 | 
204 | func (c *fenceBuilder) extend(p *parser, s line) (line, bool) {
205 | 	// Check for closing fence, which must be at least as long as opening fence, with no info.
206 | 	// The closing fence can be indented less than the opening one.
207 | 	peek := s
208 | 	if _, fence, info, ok := trimFence(&peek); ok && strings.HasPrefix(fence, c.fence) && info == "" {
209 | 		return line{}, false // NOTE(review): presumably consumes the closing fence and ends the block; confirm with the caller
210 | 	}
211 | 
212 | 	// Otherwise trim the indentation from the fence line, if present.
213 | 	if !s.trimSpace(c.indent, c.indent, false) {
214 | 		p.corner = true // goldmark mishandles fenced blank lines with not enough spaces
215 | 		s.trimSpace(0, c.indent, false) // trim however much of the indentation is there
216 | 	}
217 | 
218 | 	c.text = append(c.text, s.string())
219 | 	p.corner = p.corner || s.nl != '\n' // goldmark does not normalize to \n
220 | 	return line{}, true
221 | }
222 | 
223 | // build returns the fenced [CodeBlock] made from the recorded fence, info, and lines.
224 | func (c *fenceBuilder) build(p *parser) Block {
225 | 	return &CodeBlock{
226 | 		Position: p.pos(),
227 | 		Fence:    c.fence,
228 | 		Info:     c.info,
229 | 		Text:     c.text,
230 | 	}
231 | }
226 | 


--------------------------------------------------------------------------------
/testdata/gfm_ext.txt:
--------------------------------------------------------------------------------
  1 | // go run cmark2txtar.go /users/rsc/pub/cmark-gfm/test/extensions.txt
  2 | -- parser.json --
  3 | {"Strikethrough": true, "Table": true}
  4 | -- 1.md --
  5 | | abc | def |
  6 | | --- | --- |
  7 | | ghi | jkl |
  8 | | mno | pqr |
  9 | -- 1.html --
 10 | <table>
 11 | <thead>
 12 | <tr>
 13 | <th>abc</th>
 14 | <th>def</th>
 15 | </tr>
 16 | </thead>
 17 | <tbody>
 18 | <tr>
 19 | <td>ghi</td>
 20 | <td>jkl</td>
 21 | </tr>
 22 | <tr>
 23 | <td>mno</td>
 24 | <td>pqr</td>
 25 | </tr>
 26 | </tbody>
 27 | </table>
 28 | -- 2.md --
 29 | Hello!
 30 | 
 31 | | _abc_ | セン |
 32 | | ----- | ---- |
 33 | | 1. Block elements inside cells don't work. | |
 34 | | But _**inline elements do**_. | x |
 35 | 
 36 | Hi!
 37 | -- 2.html --
 38 | <p>Hello!</p>
 39 | <table>
 40 | <thead>
 41 | <tr>
 42 | <th><em>abc</em></th>
 43 | <th>セン</th>
 44 | </tr>
 45 | </thead>
 46 | <tbody>
 47 | <tr>
 48 | <td>1. Block elements inside cells don't work.</td>
 49 | <td></td>
 50 | </tr>
 51 | <tr>
 52 | <td>But <em><strong>inline elements do</strong></em>.</td>
 53 | <td>x</td>
 54 | </tr>
 55 | </tbody>
 56 | </table>
 57 | <p>Hi!</p>
 58 | -- 3.md --
 59 | | Not enough table | to be considered table |
 60 | 
 61 | | Not enough table | to be considered table |
 62 | | Not enough table | to be considered table |
 63 | 
 64 | | Just enough table | to be considered table |
 65 | | ----------------- | ---------------------- |
 66 | 
 67 | | ---- | --- |
 68 | 
 69 | |x|
 70 | |-|
 71 | 
 72 | | xyz |
 73 | | --- |
 74 | -- 3.html --
 75 | <p>| Not enough table | to be considered table |</p>
 76 | <p>| Not enough table | to be considered table |
 77 | | Not enough table | to be considered table |</p>
 78 | <table>
 79 | <thead>
 80 | <tr>
 81 | <th>Just enough table</th>
 82 | <th>to be considered table</th>
 83 | </tr>
 84 | </thead>
 85 | </table>
 86 | <p>| ---- | --- |</p>
 87 | <table>
 88 | <thead>
 89 | <tr>
 90 | <th>x</th>
 91 | </tr>
 92 | </thead>
 93 | </table>
 94 | <table>
 95 | <thead>
 96 | <tr>
 97 | <th>xyz</th>
 98 | </tr>
 99 | </thead>
100 | </table>
101 | -- 4.md --
102 | abc | def
103 | --- | ---
104 | xyz | ghi
105 | -- 4.html --
106 | <table>
107 | <thead>
108 | <tr>
109 | <th>abc</th>
110 | <th>def</th>
111 | </tr>
112 | </thead>
113 | <tbody>
114 | <tr>
115 | <td>xyz</td>
116 | <td>ghi</td>
117 | </tr>
118 | </tbody>
119 | </table>
120 | -- 5.md --
121 | Hello!
122 | 
123 | | _abc_ | セン |
124 | | ----- | ---- |
125 | | this row has a space at the end | | ^J
126 | | But _**inline elements do**_. | x |
127 | 
128 | Hi!
129 | -- 5.html --
130 | <p>Hello!</p>
131 | <table>
132 | <thead>
133 | <tr>
134 | <th><em>abc</em></th>
135 | <th>セン</th>
136 | </tr>
137 | </thead>
138 | <tbody>
139 | <tr>
140 | <td>this row has a space at the end</td>
141 | <td></td>
142 | </tr>
143 | <tr>
144 | <td>But <em><strong>inline elements do</strong></em>.</td>
145 | <td>x</td>
146 | </tr>
147 | </tbody>
148 | </table>
149 | <p>Hi!</p>
150 | -- 6.md --
151 | aaa | bbb | ccc | ddd | eee
152 | :-- | --- | :-: | --- | --:
153 | fff | ggg | hhh | iii | jjj
154 | -- 6.html --
155 | <table>
156 | <thead>
157 | <tr>
158 | <th align="left">aaa</th>
159 | <th>bbb</th>
160 | <th align="center">ccc</th>
161 | <th>ddd</th>
162 | <th align="right">eee</th>
163 | </tr>
164 | </thead>
165 | <tbody>
166 | <tr>
167 | <td align="left">fff</td>
168 | <td>ggg</td>
169 | <td align="center">hhh</td>
170 | <td>iii</td>
171 | <td align="right">jjj</td>
172 | </tr>
173 | </tbody>
174 | </table>
175 | -- 7.md --
176 | | a | b | c |
177 | | --- | --- |
178 | | this | isn't | okay |
179 | -- 7.html --
180 | <p>| a | b | c |
181 | | --- | --- |
182 | | this | isn't | okay |</p>
183 | -- 8.md --
184 | | a | b | c |
185 | | --- | --- | ---
186 | | x
187 | | a | b
188 | | 1 | 2 | 3 | 4 | 5 |
189 | -- 8.html --
190 | <table>
191 | <thead>
192 | <tr>
193 | <th>a</th>
194 | <th>b</th>
195 | <th>c</th>
196 | </tr>
197 | </thead>
198 | <tbody>
199 | <tr>
200 | <td>x</td>
201 | <td></td>
202 | <td></td>
203 | </tr>
204 | <tr>
205 | <td>a</td>
206 | <td>b</td>
207 | <td></td>
208 | </tr>
209 | <tr>
210 | <td>1</td>
211 | <td>2</td>
212 | <td>3</td>
213 | </tr>
214 | </tbody>
215 | </table>
216 | -- 9.md --
217 | | a | b |
218 | | --- | --- |
219 | | Escaped pipes are \|okay\|. | Like \| this. |
220 | | Within `\|code\| is okay` too. |
221 | | _**`c\|`**_ \| complex
222 | | don't **\_reparse\_**
223 | -- 9.html --
224 | <table>
225 | <thead>
226 | <tr>
227 | <th>a</th>
228 | <th>b</th>
229 | </tr>
230 | </thead>
231 | <tbody>
232 | <tr>
233 | <td>Escaped pipes are |okay|.</td>
234 | <td>Like | this.</td>
235 | </tr>
236 | <tr>
237 | <td>Within <code>|code| is okay</code> too.</td>
238 | <td></td>
239 | </tr>
240 | <tr>
241 | <td><em><strong><code>c|</code></strong></em> | complex</td>
242 | <td></td>
243 | </tr>
244 | <tr>
245 | <td>don't <strong>_reparse_</strong></td>
246 | <td></td>
247 | </tr>
248 | </tbody>
249 | </table>
250 | -- 10.md --
251 | | a |
252 | --- |
253 | -- 10.html --
254 | <table>
255 | <thead>
256 | <tr>
257 | <th>a</th>
258 | </tr>
259 | </thead>
260 | </table>
261 | -- 11.md --
262 | | a | b |
263 | | --- | --- |
264 | | \\ | `\\` |
265 | | \\\\ | `\\\\` |
266 | | \_ | `\_` |
267 | | \| | `\|` |
268 | | \a | `\a` |
269 | 
270 | \\ `\\`
271 | 
272 | \\\\ `\\\\`
273 | 
274 | \_ `\_`
275 | 
276 | \| `\|`
277 | 
278 | \a `\a`
279 | -- 11.html --
280 | <table>
281 | <thead>
282 | <tr>
283 | <th>a</th>
284 | <th>b</th>
285 | </tr>
286 | </thead>
287 | <tbody>
288 | <tr>
289 | <td>\</td>
290 | <td><code>\\</code></td>
291 | </tr>
292 | <tr>
293 | <td>\\</td>
294 | <td><code>\\\\</code></td>
295 | </tr>
296 | <tr>
297 | <td>_</td>
298 | <td><code>\_</code></td>
299 | </tr>
300 | <tr>
301 | <td>|</td>
302 | <td><code>|</code></td>
303 | </tr>
304 | <tr>
305 | <td>\a</td>
306 | <td><code>\a</code></td>
307 | </tr>
308 | </tbody>
309 | </table>
310 | <p>\ <code>\\</code></p>
311 | <p>\\ <code>\\\\</code></p>
312 | <p>_ <code>\_</code></p>
313 | <p>| <code>\|</code></p>
314 | <p>\a <code>\a</code></p>
315 | -- 12.md --
316 | | a |
317 | | --- |
318 | | <strong>hello</strong> |
319 | | ok <br> sure |
320 | -- 12.html --
321 | <table>
322 | <thead>
323 | <tr>
324 | <th>a</th>
325 | </tr>
326 | </thead>
327 | <tbody>
328 | <tr>
329 | <td><strong>hello</strong></td>
330 | </tr>
331 | <tr>
332 | <td>ok <br> sure</td>
333 | </tr>
334 | </tbody>
335 | </table>
336 | -- 13.md --
337 | Here's a link to [Freedom Planet 2][].
338 | 
339 | | Here's a link to [Freedom Planet 2][] in a table header. |
340 | | --- |
341 | | Here's a link to [Freedom Planet 2][] in a table row. |
342 | 
343 | [Freedom Planet 2]: http://www.freedomplanet2.com/
344 | -- 13.html --
345 | <p>Here's a link to <a href="http://www.freedomplanet2.com/">Freedom Planet 2</a>.</p>
346 | <table>
347 | <thead>
348 | <tr>
349 | <th>Here's a link to <a href="http://www.freedomplanet2.com/">Freedom Planet 2</a> in a table header.</th>
350 | </tr>
351 | </thead>
352 | <tbody>
353 | <tr>
354 | <td>Here's a link to <a href="http://www.freedomplanet2.com/">Freedom Planet 2</a> in a table row.</td>
355 | </tr>
356 | </tbody>
357 | </table>
358 | -- 14.md --
359 | | a | b | c |
360 | | --- | --- | --- |
361 | | d || e |
362 | -- 14.html --
363 | <table>
364 | <thead>
365 | <tr>
366 | <th>a</th>
367 | <th>b</th>
368 | <th>c</th>
369 | </tr>
370 | </thead>
371 | <tbody>
372 | <tr>
373 | <td>d</td>
374 | <td></td>
375 | <td>e</td>
376 | </tr>
377 | </tbody>
378 | </table>
379 | -- 15.md --
380 | | a | b |
381 | | --- | --- |
382 | |***(a)***|
383 | -- 15.html --
384 | <table>
385 | <thead>
386 | <tr>
387 | <th>a</th>
388 | <th>b</th>
389 | </tr>
390 | </thead>
391 | <tbody>
392 | <tr>
393 | <td><em><strong>(a)</strong></em></td>
394 | <td></td>
395 | </tr>
396 | </tbody>
397 | </table>
398 | -- 16.md --
399 | 123
400 | 456
401 | | a | b |
402 | | ---| --- |
403 | d | e
404 | -- 16.html --
405 | <p>123
406 | 456</p>
407 | <table>
408 | <thead>
409 | <tr>
410 | <th>a</th>
411 | <th>b</th>
412 | </tr>
413 | </thead>
414 | <tbody>
415 | <tr>
416 | <td>d</td>
417 | <td>e</td>
418 | </tr>
419 | </tbody>
420 | </table>
421 | -- 17.md --
422 | A proper ~strikethrough~.
423 | -- 17.html --
424 | <p>A proper <del>strikethrough</del>.</p>
425 | -- 18.md --
426 | These are ~not strikethroughs.
427 | 
428 | No, they are not~
429 | 
430 | This ~is ~ legit~ isn't ~ legit.
431 | 
432 | This is not ~~~~~one~~~~~ huge strikethrough.
433 | 
434 | ~one~ ~~two~~ ~~~three~~~
435 | 
436 | No ~mismatch~~
437 | -- 18.html --
438 | <p>These are ~not strikethroughs.</p>
439 | <p>No, they are not~</p>
440 | <p>This <del>is ~ legit</del> isn't ~ legit.</p>
441 | <p>This is not ~~~~~one~~~~~ huge strikethrough.</p>
442 | <p><del>one</del> <del>two</del> ~~~three~~~</p>
443 | <p>No ~mismatch~~</p>
444 | 


--------------------------------------------------------------------------------
/parse.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strings"
  9 | )
 10 | 
// A blockBuilder incrementally constructs one [Block]
// while the parser consumes the input a line at a time.
type blockBuilder interface {
	// extend is called by addLine for each new input line while the block is open.
	// It returns the (possibly trimmed) line to hand to the next inner open block
	// and reports whether the block continues on this line.
	extend(p *parser, s line) (line, bool)

	// build is called by closeBlock when the block is closed
	// and returns the finished Block.
	build(*parser) Block
}
 15 | 
// An openBlock is one entry on the parser's stack of blocks
// that are still under construction.
type openBlock struct {
	builder blockBuilder // extends the block line by line and finally builds it
	inner   []Block      // completed child blocks collected so far
	pos     Position     // line range covered so far (set in addBlock, extended in addLine)
}
 21 | 
 22 | func (p *parser) last() Block {
 23 | 	ob := &p.stack[len(p.stack)-1]
 24 | 	return ob.inner[len(ob.inner)-1]
 25 | }
 26 | 
 27 | func (p *parser) deleteLast() {
 28 | 	ob := &p.stack[len(p.stack)-1]
 29 | 	ob.inner = ob.inner[:len(ob.inner)-1]
 30 | }
 31 | 
// A rootBuilder is the [blockBuilder] for the top-level [Document].
// parse pushes it as the first entry on the parser stack.
type rootBuilder struct{}

// build returns the Document made from the root block's position,
// its completed child blocks, and the link definitions collected by the parser.
func (b *rootBuilder) build(p *parser) Block {
	return &Document{p.pos(), p.blocks(), p.links}
}
 37 | 
// A Parser is a Markdown parser.
// The exported fields in the struct can be filled in before calling
// [Parser.Parse] in order to customize the details of the parsing process.
// A Parser is safe for concurrent use by multiple goroutines.
type Parser struct {
	// HeadingID determines whether the parser accepts
	// the {#hdr} syntax for an HTML id="hdr" attribute on headings.
	// For example, if HeadingID is true then the Markdown
	//    ## Overview {#overview}
	// will render as the HTML
	//    <h2 id="overview">Overview</h2>
	HeadingID bool

	// Strikethrough determines whether the parser accepts
	// ~abc~ and ~~abc~~ as strikethrough syntax, producing
	// <del>abc</del> in HTML.
	Strikethrough bool

	// TaskList determines whether the parser accepts
	// “task list items” as defined in GitHub Flavored Markdown.
	// When a list item begins with the plain text [ ] or [x]
	// that turns into an unchecked or checked check box.
	TaskList bool

	// TODO: document.
	// NOTE(review): presumably these control automatic linking of bare
	// URLs in text, and whether scheme-less links are assumed to be HTTP;
	// confirm against the inline parser.
	AutoLinkText       bool
	AutoLinkAssumeHTTP bool

	// TODO: document.
	// NOTE(review): presumably enables the GitHub Flavored Markdown
	// table extension represented by [Table]; confirm against the paragraph parser.
	Table bool

	// TODO: document.
	// NOTE(review): presumably enables :name: emoji shortcodes; confirm.
	Emoji bool

	// TODO: document.
	// NOTE(review): presumably enable typographic replacement of
	// ellipses, dashes, and quotation marks respectively; confirm.
	SmartDot   bool
	SmartDash  bool
	SmartQuote bool

	// TODO: document.
	// NOTE(review): presumably enables footnote syntax
	// (see the startFootnote starter); confirm.
	Footnote bool
}
 80 | 
// A parser holds the state of a single call to [Parser.Parse].
// A fresh parser is created for every parse.
type parser struct {
	*Parser

	corner bool // noticed corner case to ignore in cross-implementation testing

	root      *Document        // finished document; set by closeBlock when the root block closes
	links     map[string]*Link // link definitions, keyed by label (see defineLink)
	lineno    int              // number of the line being processed (first line is 1)
	stack     []openBlock      // blocks under construction, outermost (root) first
	lineDepth int              // index in stack of the block the current line has reached
	lineInfo

	// texts to apply inline processing to
	texts []textRaw

	footnotes map[string]*Footnote

	// inline parsing
	s       string
	emitted int // s[:emitted] has been emitted into list
	list    []Inline

	backticks backtickParser

	// fixups are run by parse, in order, after inline processing of texts.
	fixups []func()
}
107 | 
// addFixup registers f to be called by parse after all block structure
// has been built and all recorded texts have had inline processing applied.
// Fixups run in the order they were added.
func (p *parser) addFixup(f func()) {
	p.fixups = append(p.fixups, f)
}
111 | 
// A lineInfo caches negative facts about the current line,
// recording that a given terminator does not appear on it.
// NOTE(review): presumably consulted by the HTML scanning code
// to avoid rescanning the line; confirm against the users of these fields.
type lineInfo struct {
	noDeclEnd     bool // no > on line
	noCommentEnd  bool // no --> on line
	noProcInstEnd bool // no ?> on line
	noCDATAEnd    bool // no ]]> on line
}
118 | 
// A textRaw pairs a Text block with the raw Markdown source
// that parse will later run through inline processing to fill in Text.Inline.
type textRaw struct {
	*Text
	raw string
}
123 | 
124 | func (p *parser) newText(pos Position, text string) *Text {
125 | 	b := &Text{Position: pos}
126 | 	p.texts = append(p.texts, textRaw{b, text})
127 | 	return b
128 | }
129 | 
130 | func (p *parser) blocks() []Block {
131 | 	b := &p.stack[len(p.stack)-1]
132 | 	return b.inner
133 | }
134 | 
135 | func (p *parser) pos() Position {
136 | 	b := &p.stack[len(p.stack)-1]
137 | 	return b.pos
138 | }
139 | 
// Parse parses the Markdown text and returns the resulting [Document].
// The options set in p control which syntax extensions are recognized.
func (p *Parser) Parse(text string) *Document {
	d, _ := p.parse(text)
	return d
}
144 | 
// parse is the implementation of [Parser.Parse].
// In addition to the document, it reports whether the input hit a known
// corner case that should be ignored in cross-implementation testing.
func (p *Parser) parse(text string) (d *Document, corner bool) {
	var ps parser
	ps.Parser = p
	if strings.Contains(text, "\x00") {
		text = strings.ReplaceAll(text, "\x00", "\uFFFD")
		ps.corner = true // goldmark does not replace NUL
	}

	// With lineDepth == -1, addBlock's trimStack(0) leaves the empty stack
	// alone and the root builder becomes stack[0].
	ps.lineDepth = -1
	ps.addBlock(&rootBuilder{})

	// Split the input into lines, accepting \n, \r, and \r\n as line endings,
	// and feed each line to the block parser.
	for text != "" {
		end := 0
		for end < len(text) && text[end] != '\n' && text[end] != '\r' {
			end++
		}
		ln := text[:end]
		text = text[end:]
		// nl records which line ending was seen: 0 for none (end of input),
		// the ending byte itself for \n or \r, and the sum '\r'+'\n' for \r\n.
		nl := byte(0)
		switch {
		case len(text) >= 2 && text[0] == '\r' && text[1] == '\n':
			nl = '\r' + '\n'
			text = text[2:]
		case len(text) >= 1:
			nl = text[0]
			text = text[1:]
		}
		ps.lineno++
		ps.addLine(makeLine(ln, nl))
	}
	// Close every block still open, including the root, which sets ps.root.
	ps.trimStack(0)

	// Inline processing is done only now, after all blocks have been built.
	// t is a copy, but its embedded *Text is shared, so the assignment
	// updates the Text stored in the document.
	for _, t := range ps.texts {
		t.Inline = ps.inline(t.raw)
	}

	for _, f := range ps.fixups {
		f()
	}

	// Remove *Empty placeholder blocks from the document tree.
	// TODO move into its own function
	var fixBlock func(Block)

	// fixBlocks fixes each block in blocks and returns the slice with all
	// *Empty blocks removed, reusing the backing array of blocks.
	fixBlocks := func(blocks []Block) []Block {
		keep := blocks[:0]
		for _, b := range blocks {
			fixBlock(b)
			if _, ok := b.(*Empty); ok {
				continue
			}
			keep = append(keep, b)
		}
		return keep
	}

	// fixBlock recursively applies fixBlocks to the children of the
	// container blocks: Document, Quote, List (via its Items), and Item.
	fixBlock = func(x Block) {
		switch x := x.(type) {
		case *Document:
			x.Blocks = fixBlocks(x.Blocks)
		case *Quote:
			x.Blocks = fixBlocks(x.Blocks)
		case *List:
			for _, item := range x.Items {
				fixBlock(item)
			}
		case *Item:
			x.Blocks = fixBlocks(x.Blocks)
		}
	}

	fixBlock(ps.root)

	return ps.root, ps.corner
}
218 | 
219 | func (p *parser) curB() blockBuilder {
220 | 	if p.lineDepth < len(p.stack) {
221 | 		return p.stack[p.lineDepth].builder
222 | 	}
223 | 	return nil
224 | }
225 | 
226 | func (p *parser) nextB() blockBuilder {
227 | 	if p.lineDepth+1 < len(p.stack) {
228 | 		return p.stack[p.lineDepth+1].builder
229 | 	}
230 | 	return nil
231 | }
232 | func (p *parser) trimStack(depth int) {
233 | 	if len(p.stack) < depth {
234 | 		// unreachable
235 | 		panic("trimStack")
236 | 	}
237 | 	for len(p.stack) > depth {
238 | 		p.closeBlock()
239 | 	}
240 | }
241 | 
242 | func (p *parser) addBlock(c blockBuilder) {
243 | 	p.trimStack(p.lineDepth + 1)
244 | 	p.stack = append(p.stack, openBlock{})
245 | 	ob := &p.stack[len(p.stack)-1]
246 | 	ob.builder = c
247 | 	ob.pos.StartLine = p.lineno
248 | 	ob.pos.EndLine = p.lineno
249 | }
250 | 
251 | func (p *parser) doneBlock(b Block) {
252 | 	p.trimStack(p.lineDepth + 1)
253 | 	ob := &p.stack[len(p.stack)-1]
254 | 	ob.inner = append(ob.inner, b)
255 | }
256 | 
257 | func (p *parser) para() *paraBuilder {
258 | 	if b, ok := p.stack[len(p.stack)-1].builder.(*paraBuilder); ok {
259 | 		return b
260 | 	}
261 | 	return nil
262 | }
263 | 
264 | func (p *parser) closeBlock() Block {
265 | 	b := &p.stack[len(p.stack)-1]
266 | 	if b.builder == nil {
267 | 		println("closeBlock", len(p.stack)-1)
268 | 	}
269 | 	blk := b.builder.build(p)
270 | 	p.stack = p.stack[:len(p.stack)-1]
271 | 	if len(p.stack) > 0 {
272 | 		b := &p.stack[len(p.stack)-1]
273 | 		b.inner = append(b.inner, blk)
274 | 		// _ = b
275 | 	} else {
276 | 		p.root = blk.(*Document)
277 | 	}
278 | 	return blk
279 | }
280 | 
// link returns the link definition recorded for label,
// or nil if no definition has been recorded.
func (p *parser) link(label string) *Link {
	return p.links[label]
}
284 | 
// defineLink records link as the definition for label,
// replacing any definition already recorded for that label.
// The links map is allocated on first use.
func (p *parser) defineLink(label string, link *Link) {
	if p.links == nil {
		p.links = make(map[string]*Link)
	}
	p.links[label] = link
}
291 | 
// addLine processes one input line.
// It first offers the line to each open block below the root, outermost
// first, stopping at the first block that does not continue.
// If what remains of the line is blank, the blocks that did not continue
// are closed and the line is done.
// Otherwise the block starters are tried repeatedly on the remaining text,
// each success opening one more level of nesting, and whatever is left
// when no starter matches is handed to startParagraph.
func (p *parser) addLine(s line) {
	// Process continued prefixes.
	p.lineDepth = 0
	for ; p.lineDepth+1 < len(p.stack); p.lineDepth++ {
		old := s
		var ok bool
		s, ok = p.stack[p.lineDepth+1].builder.extend(p, s)
		// Note: s != old is efficient only because s.text is either the same string (same pointer, len)
		// as old.text or has a different length or is empty; either way there is no actual data comparison.
		// TODO: the original note ended with the unfinished sentence
		// "Sometimes s.text = "" and there is still"; complete it.
		//
		// The block's end line advances only when the block continued or
		// consumed part of the line, and the line was not blank to begin with.
		if (ok || s != old) && !old.isBlank() {
			p.stack[p.lineDepth+1].pos.EndLine = p.lineno
		}
		if !ok {
			break
		}
	}

	if s.isBlank() {
		// Close the blocks that did not continue on this line.
		p.trimStack(p.lineDepth + 1)
		return
	}

	// Process new prefixes, if any.
Prefixes:
	// Start new block inside p.stack[depth].
	for _, fn := range starters {
		if l, ok := fn(p, s); ok {
			s = l
			if s.isBlank() {
				// The starter consumed the rest of the line.
				return
			}
			// Descend into the block just matched and try all starters again.
			p.lineDepth++
			goto Prefixes
		}
	}

	startParagraph(p, s)
}
331 | 
// extend exists only to satisfy [blockBuilder].
// addLine starts extending at stack[1], so the root builder at stack[0]
// is never asked to extend.
func (c *rootBuilder) extend(p *parser, s line) (line, bool) {
	// unreachable
	panic("root extend")
}
336 | 
// A starter checks whether the line begins a new block of a particular kind.
// If so, it makes the necessary changes to the parser state and returns
// the text that remains to be processed along with true.
// Otherwise it returns false.
type starter func(*parser, line) (line, bool)

// starters lists the block starters in the order addLine tries them.
// The first one that matches wins, so the order is significant.
var starters = []starter{
	startIndentedCodeBlock,
	startFencedCodeBlock,
	startBlockQuote,
	startATXHeading,
	startSetextHeading,
	startThematicBreak,
	startListItem,
	startHTMLBlock,
	startFootnote,
}
350 | 


--------------------------------------------------------------------------------
/table.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2023 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strings"
  9 | 	"unicode/utf8"
 10 | )
 11 | 
// A Table is a [Block] representing a [table], a GitHub-flavored Markdown extension.
//
// Tables built by the parser have exactly one Align entry per Header column,
// and every row has exactly as many cells as the header.
//
// [table]: https://github.github.com/gfm/#tables-extension-
type Table struct {
	Position
	Header []*Text   // header row (slice of columns)
	Align  []string  // alignment for columns: "left", "center", "right"; "" for unset
	Rows   [][]*Text // data rows (slices of columns, not necessarily all same width)
}

// Block is a no-op method.
// NOTE(review): presumably it marks *Table as implementing the Block interface; confirm.
func (*Table) Block() {}
 23 | 
 24 | func (t *Table) printHTML(p *printer) {
 25 | 	p.html("<table>\n")
 26 | 	p.html("<thead>\n")
 27 | 	p.html("<tr>\n")
 28 | 	for i, hdr := range t.Header {
 29 | 		p.html("<th")
 30 | 		if t.Align[i] != "" {
 31 | 			p.html(` align="`, t.Align[i], `"`)
 32 | 		}
 33 | 		p.html(">")
 34 | 		hdr.printHTML(p)
 35 | 		p.html("</th>\n")
 36 | 	}
 37 | 	p.html("</tr>\n")
 38 | 	p.html("</thead>\n")
 39 | 	if len(t.Rows) > 0 {
 40 | 		p.html("<tbody>\n")
 41 | 		for _, row := range t.Rows {
 42 | 			p.html("<tr>\n")
 43 | 			for i, cell := range row {
 44 | 				p.html("<td")
 45 | 				if i < len(t.Align) && t.Align[i] != "" {
 46 | 					p.html(` align="`, t.Align[i], `"`)
 47 | 				}
 48 | 				p.html(">")
 49 | 				cell.printHTML(p)
 50 | 				p.html("</td>\n")
 51 | 			}
 52 | 			p.html("</tr>\n")
 53 | 		}
 54 | 		p.html("</tbody>\n")
 55 | 	}
 56 | 	p.html("</table>\n")
 57 | }
 58 | 
 59 | func (t *Table) printMarkdown(p *printer) {
 60 | 	// TODO: double-check this
 61 | 	// inline all Text values in Header and Rows to
 62 | 	// get final, rendered widths
 63 | 	var (
 64 | 		hdr       = make([]string, len(t.Header))
 65 | 		rows      = make([][]string, 0, len(t.Rows))
 66 | 		maxWidths = make([]int, len(t.Header))
 67 | 
 68 | 		xb = &printer{}
 69 | 		xs string
 70 | 	)
 71 | 
 72 | 	toString := func(txt *Text) string {
 73 | 		xb.buf.Reset()
 74 | 		txt.printMarkdown(xb)
 75 | 		return strings.TrimSpace(xb.buf.String())
 76 | 	}
 77 | 
 78 | 	for i, txt := range t.Header {
 79 | 		xs = toString(txt)
 80 | 		hdr[i] = xs
 81 | 		maxWidths[i] = utf8.RuneCountInString(xs)
 82 | 	}
 83 | 
 84 | 	for _, row := range t.Rows {
 85 | 		xrow := make([]string, len(hdr))
 86 | 		for j := range t.Header {
 87 | 			xs = toString(row[j])
 88 | 			xrow[j] = xs
 89 | 			if n := utf8.RuneCountInString(xs); n > maxWidths[j] {
 90 | 				maxWidths[j] = n
 91 | 			}
 92 | 		}
 93 | 		rows = append(rows, xrow)
 94 | 	}
 95 | 
 96 | 	p.maybeQuoteNL('|')
 97 | 	for i, cell := range hdr {
 98 | 		p.WriteString("| ")
 99 | 		pad(p, cell, t.Align[i], maxWidths[i])
100 | 		p.WriteString(" ")
101 | 	}
102 | 	p.WriteString("|")
103 | 
104 | 	p.nl()
105 | 	for i, a := range t.Align {
106 | 		w := maxWidths[i]
107 | 		p.WriteString("| ")
108 | 		switch a {
109 | 		case "left":
110 | 			p.WriteString(":")
111 | 			repeat(p, '-', w-1)
112 | 		case "center":
113 | 			p.WriteString(":")
114 | 			repeat(p, '-', w-2)
115 | 			p.WriteString(":")
116 | 		case "right":
117 | 			repeat(p, '-', w-1)
118 | 			p.WriteString(":")
119 | 		default:
120 | 			repeat(p, '-', w)
121 | 		}
122 | 		p.WriteString(" ")
123 | 	}
124 | 	p.WriteString("|")
125 | 
126 | 	for _, row := range rows {
127 | 		p.nl()
128 | 		for i := range t.Header {
129 | 			p.WriteString("| ")
130 | 			pad(p, row[i], t.Align[i], maxWidths[i])
131 | 			p.WriteString(" ")
132 | 		}
133 | 		p.WriteString("|")
134 | 	}
135 | }
136 | 
137 | // repeat prints c n times to p.
138 | func repeat(p *printer, c byte, n int) {
139 | 	for i := 0; i < n; i++ {
140 | 		p.WriteByte(c)
141 | 	}
142 | }
143 | 
144 | // pad prints text to p aligned according to align,
145 | // aiming for a width of w runes.
146 | // It can happen that multiple runes appear as a single “character”,
147 | // which will break the alignment, but this is the best we can do for now.
148 | func pad(p *printer, text, align string, w int) {
149 | 	n := w - utf8.RuneCountInString(text)
150 | 	switch align {
151 | 	default:
152 | 		p.WriteString(text)
153 | 		repeat(p, ' ', n)
154 | 	case "right":
155 | 		repeat(p, ' ', n)
156 | 		p.WriteString(text)
157 | 	case "center":
158 | 		repeat(p, ' ', n/2)
159 | 		p.WriteString(text)
160 | 		repeat(p, ' ', n-n/2)
161 | 	}
162 | }
163 | 
// A tableTrimmed is a table row with the outer pipes (if any) removed.
// It is a separate type to avoid accidentally trimming the outer pipes multiple times,
// which would instead discard outer empty cells.
type tableTrimmed string

// isTableSpace reports whether c is a space as far as tables are concerned.
func isTableSpace(c byte) bool {
	return c == ' ' || c == '\t' || c == '\v' || c == '\f'
}

// tableTrimSpace returns s with table space prefixes and suffixes removed.
func tableTrimSpace(s string) string {
	i := 0
	for i < len(s) && isTableSpace(s[i]) {
		i++
	}
	j := len(s)
	for j > i && isTableSpace(s[j-1]) {
		j--
	}
	return s[i:j]
}

// tableTrimOuter trims the outer | |, if any, from the row.
// Surrounding table space is removed first; at most one pipe
// is removed from each end.
func tableTrimOuter(row string) tableTrimmed {
	row = tableTrimSpace(row)
	if len(row) > 0 && row[0] == '|' {
		row = row[1:]
	}
	if len(row) > 0 && row[len(row)-1] == '|' {
		row = row[:len(row)-1]
	}
	return tableTrimmed(row)
}

// isTableStart reports whether the pair of lines hdr1, delim1
// are a valid table start.
//
// The delimiter line, once its outer pipes are removed, must be a
// |-separated list of one or more cells, each of the form :?-+:?
// optionally surrounded by table space, and the number of cells
// must equal the number of cells in the header line.
func isTableStart(hdr1, delim1 string) bool {
	// Scan potential delimiter string, counting columns.
	// This happens on every line of text,
	// so make it relatively quick - nothing expensive.
	col := 0
	delim := tableTrimOuter(delim1)
	i := 0
	for {
		// Each cell is optional space, optional ':', one or more '-',
		// optional ':', optional space.
		for i < len(delim) && isTableSpace(delim[i]) {
			i++
		}
		if i < len(delim) && delim[i] == ':' {
			i++
		}
		if i >= len(delim) || delim[i] != '-' {
			// Not a delimiter cell. This includes an empty delimiter line
			// and an empty cell following a separator.
			return false
		}
		for i < len(delim) && delim[i] == '-' {
			i++
		}
		if i < len(delim) && delim[i] == ':' {
			i++
		}
		for i < len(delim) && isTableSpace(delim[i]) {
			i++
		}
		col++
		if i >= len(delim) {
			break
		}
		// Cells must be separated by a pipe. Accepting anything else
		// (such as two dash groups separated only by a space) would make
		// this count disagree with the pipe-based splitting done when
		// the table is built.
		if delim[i] != '|' {
			return false
		}
		i++
	}

	if tableTrimSpace(hdr1) == "|" {
		// https://github.com/github/cmark-gfm/pull/127 and
		// https://github.com/github/cmark-gfm/pull/128
		// fixed a buffer overread by rejecting | by itself as a table line.
		// That seems to violate the “spec”, but we will play along.
		return false
	}

	return col == tableCount(tableTrimOuter(hdr1))
}

// tableCount returns the number of columns in the row.
// A pipe immediately preceded by a backslash does not separate columns.
func tableCount(row tableTrimmed) int {
	col := 1
	prev := byte(0)
	for i := 0; i < len(row); i++ {
		c := row[i]
		if c == '|' && prev != '\\' {
			col++
		}
		prev = c
	}
	return col
}
260 | 
// A tableBuilder is a [blockBuilder] for a [Table].
// It only collects the raw lines of the table;
// they are split into cells when the table is built.
type tableBuilder struct {
	hdr   tableTrimmed   // header line
	delim tableTrimmed   // delimiter line
	rows  []tableTrimmed // data lines
}

// start starts the builder with the given header and delimiter lines.
// The outer pipes of both lines are trimmed.
func (b *tableBuilder) start(hdr, delim string) {
	b.hdr = tableTrimOuter(hdr)
	b.delim = tableTrimOuter(delim)
}

// addRow adds a new row to the table.
// The outer pipes of the row are trimmed.
func (b *tableBuilder) addRow(row string) {
	b.rows = append(b.rows, tableTrimOuter(row))
}
278 | 
// build returns the [Table] for this tableBuilder.
// The header line determines the number of columns;
// every data row is parsed to exactly that many cells.
func (b *tableBuilder) build(p *parser) Block {
	pos := p.pos()
	pos.StartLine-- // builder does not count header
	// The table spans the header line, the delimiter line, and one line per data row.
	pos.EndLine = pos.StartLine + 1 + len(b.rows)
	t := &Table{
		Position: pos,
	}
	width := tableCount(b.hdr)
	t.Header = b.parseRow(p, b.hdr, pos.StartLine, width)
	t.Align = b.parseAlign(b.delim, width)
	t.Rows = make([][]*Text, len(b.rows))
	for i, row := range b.rows {
		// Data row i is on the line after the header and delimiter lines, plus i.
		t.Rows[i] = b.parseRow(p, row, pos.StartLine+2+i, width)
	}
	return t
}
296 | 
// parseRow splits row, found on the given input line, into exactly width cells.
// Cells are separated by pipes; a pipe preceded by a backslash is part of the
// cell text, and the backslash is removed from the text of that cell.
// Table space around each cell is trimmed.
// Cells beyond width are discarded and missing cells are added as empty.
// Each cell is registered with the parser for later inline processing.
func (b *tableBuilder) parseRow(p *parser, row tableTrimmed, line int, width int) []*Text {
	out := make([]*Text, 0, width)
	pos := Position{StartLine: line, EndLine: line}
	start := 0   // start of the current cell in row
	unesc := nop // unescaper to apply to the current cell
	for i := 0; i < len(row); i++ {
		c := row[i]
		if c == '\\' && i+1 < len(row) && row[i+1] == '|' {
			// Escaped pipe: skip over it and remember that
			// the cell needs unescaping.
			unesc = tableUnescape
			i++
			continue
		}
		if c == '|' {
			out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:i]), " \t\v\f"))))
			if len(out) == width {
				// Extra cells are discarded!
				return out
			}
			start = i + 1
			unesc = nop
		}
	}
	// The text after the last pipe is the final cell.
	out = append(out, p.newText(pos, unesc(strings.Trim(string(row[start:]), " \t\v\f"))))
	for len(out) < width {
		// Missing cells are considered empty.
		out = append(out, p.newText(pos, ""))
	}
	return out
}
327 | 
// nop returns text unchanged.
// It is the unescaper used for table cells that contain no escaped pipes.
func nop(text string) string {
	return text
}

// tableUnescape returns text with every backslash-escaped pipe (\|)
// replaced by a plain pipe. All other text, including other backslashes,
// is left alone.
func tableUnescape(text string) string {
	return strings.ReplaceAll(text, `\|`, "|")
}
345 | 
346 | // parseAlign TODO
347 | func (b *tableBuilder) parseAlign(delim tableTrimmed, n int) []string {
348 | 	align := make([]string, 0, tableCount(delim))
349 | 	start := 0
350 | 	for i := 0; i < len(delim); i++ {
351 | 		if delim[i] == '|' {
352 | 			align = append(align, tableAlign(string(delim[start:i])))
353 | 			start = i + 1
354 | 		}
355 | 	}
356 | 	align = append(align, tableAlign(string(delim[start:])))
357 | 	return align
358 | }
359 | 
360 | // tableAlign TODO
361 | func tableAlign(cell string) string {
362 | 	cell = tableTrimSpace(cell)
363 | 	l := cell[0] == ':'
364 | 	r := cell[len(cell)-1] == ':'
365 | 	switch {
366 | 	case l && r:
367 | 		return "center"
368 | 	case l:
369 | 		return "left"
370 | 	case r:
371 | 		return "right"
372 | 	}
373 | 	return ""
374 | }
375 | 


--------------------------------------------------------------------------------
/list.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"strconv"
 10 | )
 11 | 
 12 | // TODO should Item implement Block?
 13 | // maybe make a itemBlock internal Block for use with the builders?
 14 | 
// A List is a [Block] representing a [list],
// either an unordered (bullet) list
// or an ordered (numbered) list.
//
// Lists can be [loose or tight], which controls the spacing between list items.
// In Markdown, a list is loose when there is a blank line
// between any two list items, or when any list item
// directly contains two blocks that are separated by a blank line.
// (Note that because paragraphs must be separated by blank lines,
// any multi-paragraph item necessarily creates a loose list.)
// When rendering HTML, loose list items are formatted in the usual way.
// For tight lists, a list item consisting of a single paragraph omits
// the <p>...</p> tags around the paragraph text.
//
// [list]: https://spec.commonmark.org/0.31.2/#lists
// [loose or tight]: https://spec.commonmark.org/0.31.2/#loose
type List struct {
	Position

	// Bullet is the bullet character used in the list: '-', '+', or '*'.
	// For an ordered list, Bullet is the character following the number: '.' or ')'.
	Bullet rune

	// Start is the number of the first item in an ordered list.
	// The parser leaves Start as 0 for bullet lists.
	Start int

	// Loose indicates whether the list is loose.
	// (See the [List] doc comment for details.)
	Loose bool

	// Items is the list's items.
	// TODO: Should this be []*Item or Blocks?
	Items []Block // always *Item
}

// Block is a no-op method.
// NOTE(review): presumably it marks *List as implementing the Block interface; confirm.
func (*List) Block() {}
 51 | 
 52 | // Ordered reports whether the list is ordered (numbered).
 53 | func (l *List) Ordered() bool {
 54 | 	return l.Bullet == '.' || l.Bullet == ')'
 55 | }
 56 | 
// An Item is a [Block] representing a [list item].
// Items appear only as the elements of a [List]'s Items.
//
// [list item]: https://spec.commonmark.org/0.31.2/#list-items
type Item struct {
	Position

	// Blocks is the item content.
	// In a tight list, the parser replaces each Paragraph
	// directly in the item with that paragraph's Text.
	Blocks []Block
}

// Block is a no-op method.
// NOTE(review): presumably it marks *Item as implementing the Block interface; confirm.
func (*Item) Block() {}
 68 | 
 69 | func (b *List) printHTML(p *printer) {
 70 | 	if b.Bullet == '.' || b.Bullet == ')' {
 71 | 		p.html("<ol")
 72 | 		if b.Start != 1 {
 73 | 			p.html(` start="`, strconv.Itoa(b.Start), `"`)
 74 | 		}
 75 | 		p.html(">\n")
 76 | 	} else {
 77 | 		p.html("<ul>\n")
 78 | 	}
 79 | 	for _, item := range b.Items {
 80 | 		item.printHTML(p)
 81 | 	}
 82 | 	if b.Bullet == '.' || b.Bullet == ')' {
 83 | 		p.html("</ol>\n")
 84 | 	} else {
 85 | 		p.html("</ul>\n")
 86 | 	}
 87 | }
 88 | 
 89 | func (b *Item) printHTML(p *printer) {
 90 | 	p.html("<li>")
 91 | 	if len(b.Blocks) > 0 {
 92 | 		if _, ok := b.Blocks[0].(*Text); !ok {
 93 | 			p.WriteString("\n")
 94 | 		}
 95 | 	}
 96 | 	for i, c := range b.Blocks {
 97 | 		c.printHTML(p)
 98 | 		if i+1 < len(b.Blocks) {
 99 | 			if _, ok := c.(*Text); ok {
100 | 				p.WriteString("\n")
101 | 			}
102 | 		}
103 | 	}
104 | 	p.html("</li>\n")
105 | }
106 | 
// printMarkdown prints the list as Markdown, one item after another.
// Items are separated by a newline, or by a blank line in a loose list.
// The printer's list state is saved on entry and restored on return,
// so that printing a nested list does not disturb the enclosing one.
func (b *List) printMarkdown(p *printer) {
	old := p.listOut
	defer func() {
		p.listOut = old
	}()
	// NOTE(review): bullet, num, loose, and tight are presumably fields
	// of p.listOut promoted into the printer; confirm in print.go.
	p.bullet = b.Bullet
	p.num = b.Start
	if b.Loose {
		p.loose++
	} else {
		p.tight++
	}
	p.maybeNL()
	for i, item := range b.Items {
		if i > 0 {
			p.nl()
			if b.Loose {
				p.nl()
			}
		}
		item.printMarkdown(p)
		// Number the items of an ordered list consecutively.
		p.num++
	}
}
131 | 
132 | func (b *Item) printMarkdown(p *printer) {
133 | 	var marker string
134 | 	if p.bullet == '.' || p.bullet == ')' {
135 | 		marker = fmt.Sprintf(" %d%c ", p.num, p.bullet)
136 | 	} else {
137 | 		marker = fmt.Sprintf("  %c ", p.bullet)
138 | 	}
139 | 	p.WriteString(marker)
140 | 	n := len(marker)
141 | 	if n > 4 {
142 | 		n = 4
143 | 	}
144 | 	defer p.pop(p.push("    "[:n]))
145 | 	printMarkdownBlocks(b.Blocks, p)
146 | }
147 | 
// A listBuilder is a [blockBuilder] for a [List].
type listBuilder struct {
	// List fields
	bullet rune // marker character: '-', '+', '*', or for ordered lists '.' or ')'
	start  int  // number of the first item (0 for bullet lists)

	// item is the builder for the current item.
	// It is set when the item's block is opened and
	// cleared again when the item is built.
	item *itemBuilder

	// todo, if non-nil, is the pending second half of starting a list item.
	// startListItem sets it when it recognizes a list marker; on the next
	// call to startListItem it is run to open the item's block
	// and obtain the line text that follows the marker.
	todo func() line
}
160 | 
// An itemBuilder is a [blockBuilder] for an [Item].
type itemBuilder struct {
	list *listBuilder //  list containing item
	// width is the width of the item's marker, including leading indentation
	// and the space after the marker, as counted by startListItem.
	// A continuation line must be indented by this much to stay in the item.
	width int
	// haveContent records whether any non-blank text has been seen in the item,
	// either after the marker or on a later line.
	// An item with no content ends at the first blank line.
	haveContent bool
}
167 | 
// startListItem is a [starter] for a list item.
// The first list item in a list also starts the list itself.
//
// Starting an item takes two calls. The first call recognizes the list
// marker, opens a new list if the item does not belong to the list already
// open at the next stack level, and records the rest of the work in the
// list's todo func. It returns the line unchanged along with true, which
// makes addLine descend one level (to the list) and run the starters again.
// The second call then finds the pending todo, runs it to open the item's
// block, and returns the text following the marker.
func startListItem(p *parser, s line) (_ line, _ bool) {
	// Second call: finish the item recognized by the previous call.
	if list, ok := p.curB().(*listBuilder); ok && list.todo != nil {
		s = list.todo()
		list.todo = nil
		return s, true
	}

	// t is the line after the part consumed so far;
	// n counts the width of everything consumed.
	t := s
	n := 0
	// Up to three spaces of indentation are allowed before the marker.
	for i := 0; i < 3; i++ {
		if !t.trimSpace(1, 1, false) {
			break
		}
		n++
	}
	bullet := t.peek()
	var num int
Switch:
	switch bullet {
	default:
		return
	case '-', '*', '+':
		t.trim(bullet)
		n++
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		// Ordered list marker: one to nine digits followed by '.' or ')'.
		// From here on, bullet holds that delimiter, not the first digit.
		for j := t.i; ; j++ {
			if j >= len(t.text) {
				return
			}
			c := t.text[j]
			if c == '.' || c == ')' {
				// success
				bullet = c
				j++
				n += j - t.i
				t.i = j
				break Switch
			}
			if c < '0' || '9' < c {
				return
			}
			if j-t.i >= 9 {
				// Too many digits.
				return
			}
			num = num*10 + int(c) - '0'
		}

	}
	// The marker must be followed by a space.
	// NOTE(review): the final true argument presumably lets the end
	// of the line count as that space; confirm in line.go.
	if !t.trimSpace(1, 1, true) {
		return
	}
	n++
	// Up to three more spaces after the marker are counted as part of its
	// width, but only if they are not followed by yet another space;
	// otherwise only the single space counted above belongs to the marker.
	tt := t
	m := 0
	for i := 0; i < 3 && tt.trimSpace(1, 1, false); i++ {
		m++
	}
	if !tt.trimSpace(1, 1, true) {
		n += m
		t = tt
	}

	// Pretty sure we have a list item now.

	// The item joins the list open at the next stack level
	// if there is one and it uses the same marker character.
	var list *listBuilder
	if c, ok := p.nextB().(*listBuilder); ok {
		list = c
	}
	if list == nil || list.bullet != rune(bullet) {
		// “When the first list item in a list interrupts a paragraph—that is,
		// when it starts on a line that would otherwise count as
		// paragraph continuation text—then (a) the lines Ls must
		// not begin with a blank line,
		// and (b) if the list item is ordered, the start number must be 1.”
		if list == nil && p.para() != nil && (t.isBlank() || (bullet == '.' || bullet == ')') && num != 1) {
			// Goldmark and Dingus both seem to get this wrong
			// (or the words above don't mean what we think they do).
			// when the paragraph that could be continued
			// is inside a block quote.
			// See testdata/extra.txt 117.md.
			p.corner = true
			return
		}
		list = &listBuilder{bullet: rune(bullet), start: num}
		p.addBlock(list)
	}
	b := &itemBuilder{list: list, width: n, haveContent: !t.isBlank()}
	// Defer opening the item's block until the second call (see above).
	list.todo = func() line {
		p.addBlock(b)
		list.item = b
		return t
	}

	// TODO explain s not t
	// (The todo func returns t, the text after the marker, on the second call.)
	return s, true
}
267 | 
268 | func (c *listBuilder) extend(p *parser, s line) (line, bool) {
269 | 	// TODO explain
270 | 	item := c.item
271 | 	if item == nil && s.isBlank() { // TODO how can this happen
272 | 		return s, true
273 | 	}
274 | 
275 | 	// If we can trim the indentation required by the current item,
276 | 	// do that and return true, allowing s to be passed to the
277 | 	// item builder.
278 | 	if item != nil && s.trimSpace(item.width, item.width, true) {
279 | 		return s, true
280 | 	}
281 | 	return s, false
282 | }
283 | 
284 | func (c *itemBuilder) extend(p *parser, s line) (line, bool) {
285 | 	blank := s.isBlank()
286 | 
287 | 	// If there is a blank line and no content so far,
288 | 	// the item is over. TODO explain
289 | 	if blank && !c.haveContent {
290 | 		return s, false
291 | 	}
292 | 
293 | 	// TODO explain
294 | 	if blank {
295 | 		// Goldmark does this and apparently commonmark.js too.
296 | 		// Not sure why it is necessary.
297 | 		return line{}, true
298 | 	}
299 | 
300 | 	// TODO explain
301 | 	if !blank {
302 | 		c.haveContent = true
303 | 	}
304 | 	return s, true
305 | }
306 | 
307 | func (b *itemBuilder) build(p *parser) Block {
308 | 	b.list.item = nil
309 | 	return &Item{p.pos(), p.blocks()}
310 | }
311 | 
312 | func (b *listBuilder) build(p *parser) Block {
313 | 	blocks := p.blocks()
314 | 	pos := p.pos()
315 | 
316 | 	// list can have wrong pos b/c extend dance.
317 | 	// TODO explain
318 | 	pos.EndLine = blocks[len(blocks)-1].Pos().EndLine
319 | 
320 | 	// Decide whether list is loose.
321 | 	loose := false
322 | Loose:
323 | 	for i, c := range blocks {
324 | 		c := c.(*Item)
325 | 		if i+1 < len(blocks) {
326 | 			if blocks[i+1].Pos().StartLine-c.EndLine > 1 {
327 | 				loose = true
328 | 				break Loose
329 | 			}
330 | 		}
331 | 		for j, d := range c.Blocks {
332 | 			endLine := d.Pos().EndLine
333 | 			if j+1 < len(c.Blocks) {
334 | 				if c.Blocks[j+1].Pos().StartLine-endLine > 1 {
335 | 					loose = true
336 | 					break Loose
337 | 				}
338 | 			}
339 | 		}
340 | 	}
341 | 
342 | 	if !loose {
343 | 		// TODO: rethink whether this is correct.
344 | 		// Perhaps the blocks should still be Paragraph
345 | 		// and we just skip over the <p> during formatting?
346 | 		// Then Text might not need to be a Block.
347 | 		for _, c := range blocks {
348 | 			c := c.(*Item)
349 | 			for i, d := range c.Blocks {
350 | 				if p, ok := d.(*Paragraph); ok {
351 | 					c.Blocks[i] = p.Text
352 | 				}
353 | 			}
354 | 		}
355 | 	}
356 | 
357 | 	x := &List{
358 | 		pos,
359 | 		b.bullet,
360 | 		b.start,
361 | 		loose,
362 | 		p.blocks(),
363 | 	}
364 | 	listCorner(p, x)
365 | 	if p.TaskList {
366 | 		p.addFixup(func() {
367 | 			parseTaskList(p, x)
368 | 		})
369 | 	}
370 | 	return x
371 | }
372 | 
373 | // listCorner checks whether list contains any corner cases
374 | // that other implementations mishandle, and if so sets p.corner.
375 | func listCorner(p *parser, list *List) {
376 | 	for _, item := range list.Items {
377 | 		item := item.(*Item)
378 | 		if len(item.Blocks) == 0 {
379 | 			// Goldmark mishandles what follows; see testdata/extra.txt 111.md.
380 | 			p.corner = true
381 | 			return
382 | 		}
383 | 		switch item.Blocks[0].(type) {
384 | 		case *List, *ThematicBreak, *CodeBlock:
385 | 			// Goldmark mishandles a list with various block items inside it.
386 | 			p.corner = true
387 | 			return
388 | 		}
389 | 	}
390 | }
391 | 
392 | // GitHub task list extension
393 | 
// A Task is an [Inline] for a [task list item marker] (a checkbox),
// a GitHub-flavored Markdown extension.
//
// [task list item marker]: https://github.github.com/gfm/#task-list-items-extension-
type Task struct {
	// Checked reports whether the checkbox is ticked,
	// that is, whether the marker was written as [x] or [X] rather than [ ].
	Checked bool
}

// Inline has an empty body; presumably it only marks *Task as an [Inline].
func (*Task) Inline() {}
403 | 
404 | func (x *Task) printHTML(p *printer) {
405 | 	p.html("<input ")
406 | 	if x.Checked {
407 | 		p.html(`checked="" `)
408 | 	}
409 | 	p.html(`disabled="" type="checkbox"> `)
410 | }
411 | 
412 | func (x *Task) printMarkdown(p *printer) {
413 | 	if x.Checked {
414 | 		p.text(`[x] `)
415 | 	} else {
416 | 		p.text(`[ ] `)
417 | 	}
418 | }
419 | 
// printText writes the task marker using its Markdown form.
func (x *Task) printText(p *printer) {
	// Unreachable: printText is only used to render the
	// alt text of an image, which can only contain inlines,
	// and while Task is an inline, it only appears inside
	// lists, and a list cannot appear in an alt text.
	// Even so, maybe someone will make malformed syntax trees.
	x.printMarkdown(p)
}
428 | 
429 | // taskList checks whether any items in list begin with task list markers.
430 | // If so, it replaces the markers with [Task]s.
431 | func parseTaskList(p *parser, list *List) {
432 | 	for _, item := range list.Items {
433 | 		item := item.(*Item)
434 | 		if len(item.Blocks) == 0 {
435 | 			continue
436 | 		}
437 | 		var text *Text
438 | 		switch b := item.Blocks[0].(type) {
439 | 		default:
440 | 			continue
441 | 		case *Paragraph:
442 | 			text = b.Text
443 | 		case *Text:
444 | 			text = b
445 | 		}
446 | 		if len(text.Inline) < 1 {
447 | 			// unreachable with standard parser
448 | 			continue
449 | 		}
450 | 		pl, ok := text.Inline[0].(*Plain)
451 | 		if !ok {
452 | 			continue
453 | 		}
454 | 		s := pl.Text
455 | 		if len(s) < 4 || s[0] != '[' || s[2] != ']' || (s[1] != ' ' && s[1] != 'x' && s[1] != 'X') {
456 | 			continue
457 | 		}
458 | 		if s[3] != ' ' && s[3] != '\t' {
459 | 			p.corner = true // goldmark does not require the space
460 | 			continue
461 | 		}
462 | 		text.Inline = append([]Inline{&Task{Checked: s[1] == 'x' || s[1] == 'X'},
463 | 			&Plain{Text: s[len("[x] "):]}}, text.Inline[1:]...)
464 | 	}
465 | }
466 | 


--------------------------------------------------------------------------------
/testdata/autoext.txt:
--------------------------------------------------------------------------------
  1 | -- parser.json --
  2 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true}
  3 | -- gfm622.md --
  4 | www.commonmark.org
  5 | -- gfm622.html --
  6 | <p><a href="http://www.commonmark.org">www.commonmark.org</a></p>
  7 | -- gfm623.md --
  8 | Visit www.commonmark.org/help for more information.
  9 | -- gfm623.html --
 10 | <p>Visit <a href="http://www.commonmark.org/help">www.commonmark.org/help</a> for more information.</p>
 11 | -- gfm624.md --
 12 | Visit www.commonmark.org.
 13 | 
 14 | Visit www.commonmark.org/a.b.
 15 | -- gfm624.html --
 16 | <p>Visit <a href="http://www.commonmark.org">www.commonmark.org</a>.</p>
 17 | <p>Visit <a href="http://www.commonmark.org/a.b">www.commonmark.org/a.b</a>.</p>
 18 | -- gfm625.md --
 19 | www.google.com/search?q=Markup+(business)
 20 | 
 21 | www.google.com/search?q=Markup+(business)))
 22 | 
 23 | (www.google.com/search?q=Markup+(business))
 24 | 
 25 | (www.google.com/search?q=Markup+(business)
 26 | -- gfm625.html --
 27 | <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
 28 | <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
 29 | <p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)</p>
 30 | <p>(<a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a></p>
 31 | -- gfm626.md --
 32 | www.google.com/search?q=(business))+ok
 33 | -- gfm626.html --
 34 | <p><a href="http://www.google.com/search?q=(business))+ok">www.google.com/search?q=(business))+ok</a></p>
 35 | -- gfm627.md --
 36 | www.google.com/search?q=commonmark&hl=en
 37 | 
 38 | www.google.com/search?q=commonmark&hl;
 39 | -- gfm627.html --
 40 | <p><a href="http://www.google.com/search?q=commonmark&amp;hl=en">www.google.com/search?q=commonmark&amp;hl=en</a></p>
 41 | <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&amp;hl;</p>
 42 | -- gfm628.md --
 43 | www.commonmark.org/he<lp
 44 | -- gfm628.html --
 45 | <p><a href="http://www.commonmark.org/he">www.commonmark.org/he</a>&lt;lp</p>
 46 | -- gfm629.md --
 47 | http://commonmark.org
 48 | 
 49 | (Visit http://encrypted.google.com/search?q=Markup+(business))
 50 | -- gfm629.html --
 51 | <p><a href="http://commonmark.org">http://commonmark.org</a></p>
 52 | <p>(Visit <a href="http://encrypted.google.com/search?q=Markup+(business)">http://encrypted.google.com/search?q=Markup+(business)</a>)</p>
 53 | -- gfm630.md --
 54 | foo@bar.baz
 55 | -- gfm630.html --
 56 | <p><a href="mailto:foo@bar.baz">foo@bar.baz</a></p>
 57 | -- gfm631.md --
 58 | hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.
 59 | -- gfm631.html --
 60 | <p>hello@mail+xyz.example isn't valid, but <a href="mailto:hello+xyz@mail.example">hello+xyz@mail.example</a> is.</p>
 61 | -- gfm632.md --
 62 | a.b-c_d@a.b
 63 | 
 64 | a.b-c_d@a.b.
 65 | 
 66 | a.b-c_d@a.b-
 67 | 
 68 | a.b-c_d@a.b_
 69 | -- gfm632.html --
 70 | <p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a></p>
 71 | <p><a href="mailto:a.b-c_d@a.b">a.b-c_d@a.b</a>.</p>
 72 | <p>a.b-c_d@a.b-</p>
 73 | <p>a.b-c_d@a.b_</p>
 74 | -- gfm633.md --
 75 | mailto:foo@bar.baz
 76 | 
 77 | mailto:a.b-c_d@a.b
 78 | 
 79 | mailto:a.b-c_d@a.b.
 80 | 
 81 | mailto:a.b-c_d@a.b/
 82 | 
 83 | mailto:a.b-c_d@a.b-
 84 | 
 85 | mailto:a.b-c_d@a.b_
 86 | 
 87 | xmpp:foo@bar.baz
 88 | 
 89 | xmpp:foo@bar.baz.
 90 | -- gfm633.html --
 91 | <p><a href="mailto:foo@bar.baz">mailto:foo@bar.baz</a></p>
 92 | <p><a href="mailto:a.b-c_d@a.b">mailto:a.b-c_d@a.b</a></p>
 93 | <p><a href="mailto:a.b-c_d@a.b">mailto:a.b-c_d@a.b</a>.</p>
 94 | <p><a href="mailto:a.b-c_d@a.b">mailto:a.b-c_d@a.b</a>/</p>
 95 | <p>mailto:a.b-c_d@a.b-</p>
 96 | <p>mailto:a.b-c_d@a.b_</p>
 97 | <p><a href="xmpp:foo@bar.baz">xmpp:foo@bar.baz</a></p>
 98 | <p><a href="xmpp:foo@bar.baz">xmpp:foo@bar.baz</a>.</p>
 99 | -- gfm634.md --
100 | xmpp:foo@bar.baz/txt
101 | 
102 | xmpp:foo@bar.baz/txt@bin
103 | 
104 | xmpp:foo@bar.baz/txt@bin.com
105 | -- gfm634.html --
106 | <p><a href="xmpp:foo@bar.baz/txt">xmpp:foo@bar.baz/txt</a></p>
107 | <p><a href="xmpp:foo@bar.baz/txt@bin">xmpp:foo@bar.baz/txt@bin</a></p>
108 | <p><a href="xmpp:foo@bar.baz/txt@bin.com">xmpp:foo@bar.baz/txt@bin.com</a></p>
109 | -- gfm635.md --
110 | xmpp:foo@bar.baz/txt/bin
111 | -- gfm635.html --
112 | <p><a href="xmpp:foo@bar.baz/txt">xmpp:foo@bar.baz/txt</a>/bin</p>
113 | -- 1.md --
114 | xhttp://go.dev y z
115 | αhttp://go.dev y z
116 | -- 1.html --
117 | <p>xhttp://go.dev y z
118 | α<a href="http://go.dev">http://go.dev</a> y z</p>
119 | -- 1a.md --
120 | xhttps://go.dev y z
121 | αhttps://go.dev y z
122 | -- 1a.html --
123 | <p>xhttps://go.dev y z
124 | α<a href="https://go.dev">https://go.dev</a> y z</p>
125 | -- 2.md --
126 | cannot follow ascii letter
127 | xhttp://go.dev y z
128 | x0http://go.dev
129 | αhttp://go.dev
130 | -- 2.html --
131 | <p>cannot follow ascii letter
132 | xhttp://go.dev y z
133 | x0<a href="http://go.dev">http://go.dev</a>
134 | α<a href="http://go.dev">http://go.dev</a></p>
135 | -- 3.md --
136 | deviations - github would include the suffixes in the URLs
137 | www.go.dev@def.ghi is my email
138 | www.go.dev!wtf
139 | -- 3.html --
140 | <p>deviations - github would include the suffixes in the URLs
141 | <a href="http://www.go.dev">www.go.dev</a>@def.ghi is my email
142 | <a href="http://www.go.dev">www.go.dev</a>!wtf</p>
143 | -- 4.md --
144 | trimming
145 | www.google.com/search?q=Markup+(business)))
146 | -- 4.html --
147 | <p>trimming
148 | <a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>))</p>
149 | -- 5.md --
150 | www.google.com/search?q=Markup+(business))).
151 | -- 5.html --
152 | <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>)).</p>
153 | -- 6.md --
154 | www.google.com/search?q=Markup+(business).
155 | -- 6.html --
156 | <p><a href="http://www.google.com/search?q=Markup+(business)">www.google.com/search?q=Markup+(business)</a>.</p>
157 | -- 7.md --
158 | www.google.com/search?q=Markup+)()((business)
159 | -- 7.html --
160 | <p><a href="http://www.google.com/search?q=Markup+)()((business)">www.google.com/search?q=Markup+)()((business)</a></p>
161 | -- 8.md --
162 | www.google.com/search?q=commonmark&hl;
163 | -- 8.html --
164 | <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&amp;hl;</p>
165 | -- 9.md --
166 | www.google.com/search?q=commonmark&hl;)
167 | -- 9.html --
168 | <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>&amp;hl;)</p>
169 | -- 10.md --
170 | www.google.com/search?q=(commonmark&hl;)
171 | -- 10.html --
172 | <p><a href="http://www.google.com/search?q=(commonmark&amp;hl;)">www.google.com/search?q=(commonmark&amp;hl;)</a></p>
173 | -- 11.md --
174 | www.google.com/search?q=commonmark)&hl;
175 | -- 11.html --
176 | <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>)&amp;hl;</p>
177 | -- 12.md --
178 | www.google.com/search?q=commonmark).&hl;
179 | -- 12.html --
180 | <p><a href="http://www.google.com/search?q=commonmark">www.google.com/search?q=commonmark</a>).&amp;hl;</p>
181 | -- 13.md --
182 | www.google.com/search?q=commonmark).&hl
183 | -- 13.html --
184 | <p><a href="http://www.google.com/search?q=commonmark).&amp;hl">www.google.com/search?q=commonmark).&amp;hl</a></p>
185 | -- 14.md --
186 | www.google.com/search?q=commonmark).&hl
187 | -- 14.html --
188 | <p><a href="http://www.google.com/search?q=commonmark).&amp;hl">www.google.com/search?q=commonmark).&amp;hl</a></p>
189 | -- 15.md --
190 | www.goo-gle.com/search
191 | -- 15.html --
192 | <p><a href="http://www.goo-gle.com/search">www.goo-gle.com/search</a></p>
193 | -- 16.md --
194 | www.goo_gle.com/search
195 | -- 16.html --
196 | <p>www.goo_gle.com/search</p>
197 | -- 17.md --
198 | www.foo_bar.google.com/search
199 | -- 17.html --
200 | <p><a href="http://www.foo_bar.google.com/search">www.foo_bar.google.com/search</a></p>
201 | -- 18.md --
202 | www./search
203 | -- 18.html --
204 | <p><a href="http://www./search">www./search</a></p>
205 | -- 19.md --
206 | www.google.com.foo_bar/search
207 | -- 19.html --
208 | <p>www.google.com.foo_bar/search</p>
209 | -- 20.md --
210 | www.search
211 | -- 20.html --
212 | <p><a href="http://www.search">www.search</a></p>
213 | -- 21.md --
214 | www.
215 | -- 21.html --
216 | <p>www.</p>
217 | -- 21a.md --
218 | www.!search
219 | -- 21a.html --
220 | <p><a href="http://www.">www.</a>!search</p>
221 | -- 22.md --
222 | www.sea_rch
223 | -- 22.html --
224 | <p>www.sea_rch</p>
225 | -- 23.md --
226 | http://!search
227 | -- 23.html --
228 | <p>http://!search</p>
229 | -- 24.md --
230 | http://!search
231 | -- 24.html --
232 | <p>http://!search</p>
233 | -- 25.md --
234 | http://search
235 | -- 25.html --
236 | <p><a href="http://search">http://search</a></p>
237 | -- 26.md --
238 | https://search
239 | -- 26.html --
240 | <p><a href="https://search">https://search</a></p>
241 | -- 27.md --
242 | http://sea_rch
243 | -- 27.html --
244 | <p>http://sea_rch</p>
245 | -- 28.md --
246 | https://sea_rch
247 | -- 28.html --
248 | <p>https://sea_rch</p>
249 | -- 29.md --
250 | http://sea_rch.x
251 | -- 29.html --
252 | <p>http://sea_rch.x</p>
253 | -- 30.md --
254 | https://sea_rch.x
255 | -- 30.html --
256 | <p>https://sea_rch.x</p>
257 | -- 31.md --
258 | http://sea_rch.x.y
259 | -- 31.html --
260 | <p><a href="http://sea_rch.x.y">http://sea_rch.x.y</a></p>
261 | -- 32.md --
262 | http://sea_rch.x.y.http://www.google.com
263 | -- 32.html --
264 | <p><a href="http://sea_rch.x.y.http://www.google.com">http://sea_rch.x.y.http://www.google.com</a></p>
265 | -- 33.md --
266 | http://sea_rch.http://www.google.com
267 | -- 33.html --
268 | <p>http://sea_rch.<a href="http://www.google.com">http://www.google.com</a></p>
269 | -- 34.md --
270 | _abc_@ghi.def is my email
271 | -- 34.html --
272 | <p><em>abc</em>@ghi.def is my email</p>
273 | -- 35.md --
274 | _abc@ghi_.def is my email
275 | -- 35.html --
276 | <p><em>abc@ghi</em>.def is my email</p>
277 | -- 36.md --
278 | `hello`abc@def.ghi is my email
279 | -- 36.html --
280 | <p><code>hello</code><a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
281 | -- 37.md --
282 | `hello` abc@def.ghi is my email
283 | -- 37.html --
284 | <p><code>hello</code> <a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
285 | -- 38.md --
286 | *hello*abc@def.ghi is my email
287 | -- 38.html --
288 | <p><em>hello</em><a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
289 | -- 39.md --
290 | [link](link)abc@def.ghi is my email
291 | -- 39.html --
292 | <p><a href="link">link</a><a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
293 | -- 40.md --
294 | \!abc@def.ghi is my email
295 | -- 40.html --
296 | <p>!<a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
297 | -- 41.md --
298 | $abc@def.ghi is my email
299 | -- 41.html --
300 | <p>$<a href="mailto:abc@def.ghi">abc@def.ghi</a> is my email</p>
301 | -- 42.md --
302 | www.go.dev@def.ghi is my email
303 | -- 42.html --
304 | <p><a href="http://www.go.dev">www.go.dev</a>@def.ghi is my email</p>
305 | -- 43.md --
306 | abc@www.go.dev is my email
307 | -- 43.html --
308 | <p><a href="mailto:abc@www.go.dev">abc@www.go.dev</a> is my email</p>
309 | -- 44.md --
310 | αabc@def.ghi
311 | -- 44.html --
312 | <p>α<a href="mailto:abc@def.ghi">abc@def.ghi</a></p>
313 | -- 45.md --
314 | https://web.site:8080/~matloob
315 | -- 45.html --
316 | <p><a href="https://web.site:8080/~matloob">https://web.site:8080/~matloob</a></p>
317 | -- parser.json --
318 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true, "Strikethrough": true}
319 | -- 46.md --
320 | https://web.site:8080/~matloob
321 | -- 46.html --
322 | <p><a href="https://web.site:8080/~matloob">https://web.site:8080/~matloob</a></p>
323 | -- parser.json --
324 | {"AutoLinkText": true, "AutoLinkAssumeHTTP": true}
325 | -- 47.md --
326 | https://web.site:8080/*matlo_ob
327 | -- 47.html --
328 | <p><a href="https://web.site:8080/*matlo_ob">https://web.site:8080/*matlo_ob</a></p>
329 | -- parser.json --
330 | {"AutoLinkText": true, "Strikethrough": true}
331 | -- 48.md --
332 | *user@dom.org*
333 | -- 48.html --
334 | <p><em><a href="mailto:user@dom.org">user@dom.org</a></em></p>
335 | -- 49.md --
336 | **user@dom.org**
337 | -- 49.html --
338 | <p><strong><a href="mailto:user@dom.org">user@dom.org</a></strong></p>
339 | -- 50.md --
340 | ~~user@dom.org~~
341 | -- 50.html --
342 | <p><del><a href="mailto:user@dom.org">user@dom.org</a></del></p>
343 | -- 51.md --
344 | www.google.com/search?q=cmark&-hl;
345 | -- 51.html --
346 | <p><a href="https://www.google.com/search?q=cmark&amp;-hl">www.google.com/search?q=cmark&amp;-hl</a>;</p>
347 | -- 52.md --
348 | foo@.bar
349 | -- 52.html --
350 | <p><a href="mailto:foo@.bar">foo@.bar</a></p>
351 | -- 53.md --
352 | foo@..bar
353 | -- 53.html --
354 | <p>foo@..bar</p>
355 | -- 54.md --
356 | mailto:none
357 | mailto:none#
358 | -- 54.html --
359 | <p>mailto:none
360 | mailto:none#</p>
361 | -- 55.md --
362 | xmpp:none
363 | xmpp:none#
364 | xmpp:foo@..bar
365 | -- 55.html --
366 | <p>xmpp:none
367 | xmpp:none#
368 | xmpp:foo@..bar</p>
369 | 


--------------------------------------------------------------------------------
/md_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"encoding/json"
 10 | 	"flag"
 11 | 	"fmt"
 12 | 	"go/token"
 13 | 	"io"
 14 | 	"net/url"
 15 | 	"os"
 16 | 	"path/filepath"
 17 | 	"reflect"
 18 | 	"strings"
 19 | 	"testing"
 20 | 
 21 | 	"github.com/yuin/goldmark"
 22 | 	gext "github.com/yuin/goldmark/extension"
 23 | 	gparser "github.com/yuin/goldmark/parser"
 24 | 	ghtml "github.com/yuin/goldmark/renderer/html"
 25 | 	"golang.org/x/tools/txtar"
 26 | )
 27 | 
// goldmarkFlag is the -goldmark test flag. It is off by default;
// when set, TestToHTML also runs each case through goldmark.
var goldmarkFlag = flag.Bool("goldmark", false, "run goldmark tests")
 29 | 
// roundTripFailures lists, by full subtest name, the TestToHTML cases
// whose HTML is known to change after the document is passed through
// Format and parsed again. TestToHTML tolerates a mismatch for these
// names, and fails with "no longer failing" when a listed case starts
// to round-trip, so entries must be removed once they are fixed.
// The trailing comment on each entry names the cause.
var roundTripFailures = map[string]bool{
	"TestToHTML/extra/13":  true, // indentation of tag
	"TestToHTML/extra/75":  true, // weird list
	"TestToHTML/extra/76":  true, // weird list
	"TestToHTML/extra/115": true, // weird list

	"TestToHTML/gfm_ext/9":  true, // table
	"TestToHTML/gfm_ext/11": true, // table

	"TestToHTML/spec0.29/19":  true, // thematic break
	"TestToHTML/spec0.29/40":  true, // indentation of heading
	"TestToHTML/spec0.29/51":  true, // newline in heading
	"TestToHTML/spec0.29/52":  true, // newline in heading
	"TestToHTML/spec0.29/57":  true, // setext heading
	"TestToHTML/spec0.29/63":  true, // setext heading
	"TestToHTML/spec0.29/65":  true, // newline in heading
	"TestToHTML/spec0.29/171": true, // link ref def
	"TestToHTML/spec0.29/208": true, // weird list
	"TestToHTML/spec0.29/227": true, // weird list
	"TestToHTML/spec0.29/241": true, // weird list
	"TestToHTML/spec0.29/282": true, // weird list
	"TestToHTML/spec0.29/283": true, // weird list
	"TestToHTML/spec0.29/312": true, // escape plain
	"TestToHTML/spec0.29/323": true, // escape plain
	"TestToHTML/spec0.29/324": true, // escape plain
	"TestToHTML/spec0.29/325": true, // escape plain
	"TestToHTML/spec0.29/326": true, // escape plain
	"TestToHTML/spec0.29/327": true, // escape plain
	"TestToHTML/spec0.29/331": true, // backtick spaces
	"TestToHTML/spec0.29/349": true, // backticks
	"TestToHTML/spec0.29/502": true, // escape quotes

	"TestToHTML/spec0.30/26":  true, // escape plain
	"TestToHTML/spec0.30/37":  true, // escape plain
	"TestToHTML/spec0.30/38":  true, // escape plain
	"TestToHTML/spec0.30/39":  true, // escape plain
	"TestToHTML/spec0.30/40":  true, // escape plain
	"TestToHTML/spec0.30/41":  true, // escape plain
	"TestToHTML/spec0.30/49":  true, // thematic break
	"TestToHTML/spec0.30/70":  true, // indentation of heading
	"TestToHTML/spec0.30/81":  true, // newline in heading
	"TestToHTML/spec0.30/82":  true, // newline in heading
	"TestToHTML/spec0.30/87":  true, // setext heading
	"TestToHTML/spec0.30/93":  true, // setext heading
	"TestToHTML/spec0.30/95":  true, // newline in heading
	"TestToHTML/spec0.30/202": true, // link ref def
	"TestToHTML/spec0.30/238": true, // weird list
	"TestToHTML/spec0.30/257": true, // weird list
	"TestToHTML/spec0.30/271": true, // weird list
	"TestToHTML/spec0.30/312": true, // weird list
	"TestToHTML/spec0.30/313": true, // weird list
	"TestToHTML/spec0.30/331": true, // backtick spaces
	"TestToHTML/spec0.30/349": true, // backticks
	"TestToHTML/spec0.30/505": true, // escape quotes

	"TestToHTML/spec0.31.2/26":  true, // escape plain
	"TestToHTML/spec0.31.2/37":  true, // escape plain
	"TestToHTML/spec0.31.2/38":  true, // escape plain
	"TestToHTML/spec0.31.2/39":  true, // escape plain
	"TestToHTML/spec0.31.2/40":  true, // escape plain
	"TestToHTML/spec0.31.2/41":  true, // escape plain
	"TestToHTML/spec0.31.2/49":  true, // thematic break
	"TestToHTML/spec0.31.2/70":  true, // indentation of heading
	"TestToHTML/spec0.31.2/81":  true, // newline in heading
	"TestToHTML/spec0.31.2/82":  true, // newline in heading
	"TestToHTML/spec0.31.2/87":  true, // setext heading
	"TestToHTML/spec0.31.2/93":  true, // setext heading
	"TestToHTML/spec0.31.2/95":  true, // newline in heading
	"TestToHTML/spec0.31.2/202": true, // link ref def
	"TestToHTML/spec0.31.2/238": true, // weird list
	"TestToHTML/spec0.31.2/257": true, // weird list
	"TestToHTML/spec0.31.2/271": true, // weird list
	"TestToHTML/spec0.31.2/312": true, // weird list
	"TestToHTML/spec0.31.2/313": true, // weird list
	"TestToHTML/spec0.31.2/331": true, // backtick spaces
	"TestToHTML/spec0.31.2/349": true, // backticks
	"TestToHTML/spec0.31.2/506": true, // escape quotes

	"TestToHTML/table/gfm200": true, // table
	"TestToHTML/table/2":      true, // table
}
111 | 
// TestToHTML runs the Markdown-to-HTML cases stored in testdata/*.txt.
//
// Each file is a txtar archive of consecutive (name.md, name.html) pairs.
// A file named parser.json may appear between pairs; it replaces the
// Parser configuration used for the cases that follow it.
// Files ending in _fmt.txt are skipped.
//
// For every pair the test checks that the parsed document renders to the
// expected HTML, that no unexported types leak into the syntax tree, and
// that formatting the document back to Markdown and reparsing it yields
// the same HTML (except for the cases listed in roundTripFailures).
// With the -goldmark flag set, each case is also converted with goldmark
// and compared against the same expected HTML.
func TestToHTML(t *testing.T) {
	files, err := filepath.Glob("testdata/*.txt")
	if err != nil {
		t.Fatal(err)
	}
	for _, file := range files {
		if strings.HasSuffix(file, "_fmt.txt") {
			continue
		}
		t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) {
			a, err := txtar.ParseFile(file)
			if err != nil {
				t.Fatal(err)
			}

			// p starts as the zero Parser and is replaced by each parser.json.
			var p Parser
			var ncase, npass int
			// The loop needs at least two files left to form a pair;
			// a parser.json entry consumes only one.
			for i := 0; i+2 <= len(a.Files); {
				if a.Files[i].Name == "parser.json" {
					p = parseParser(t, a.Files[i].Data)
					i++
					continue
				}
				ncase++
				md := a.Files[i]
				html := a.Files[i+1]
				i += 2
				name := strings.TrimSuffix(md.Name, ".md")
				if name != strings.TrimSuffix(html.Name, ".html") {
					t.Fatalf("mismatched file pair: %s and %s", md.Name, html.Name)
				}

				t.Run(name, func(t *testing.T) {
					// The input is decoded and the output encoded so that the
					// comparison happens in the escaped form stored in testdata.
					doc := p.Parse(decode(string(md.Data)))
					h := encode(ToHTML(doc))
					if h != string(html.Data) {
						q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
						t.Fatalf("input %q\nparse:\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, dump(doc), h, html.Data, q, q)
					}

					// Make sure unexported types like emphPlain don't leak into result.
					if x, ok := findUnexported(reflect.ValueOf(doc)); ok {
						t.Fatalf("input %q\nparse:\n%s\nfound parsed value of unexported type %s", md.Data, dump(doc), x.Type())
					}

					// Make sure Format preserves the HTML.
					md1 := Format(doc)
					doc1 := p.Parse(md1)
					h1 := encode(ToHTML(doc1))
					if h1 != string(html.Data) && !roundTripFailures[t.Name()] {
						q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
						t.Fatalf("input %q\nreformat %q\n%s\n%s\nhave %q\nwant %q\ndingus: (https://spec.commonmark.org/dingus/?text=%s)\ngithub: (https://github.com/rsc/tmp/issues/new?body=%s)", md.Data, md1, dump(doc), dump(doc1), h1, html.Data, q, q)
					}
					// A listed failure that now round-trips must be removed from the list.
					if h1 == string(html.Data) && roundTripFailures[t.Name()] {
						t.Fatalf("no longer failing")
					}

					npass++
				})

				if !*goldmarkFlag {
					continue
				}
				t.Run("goldmark/"+name, func(t *testing.T) {
					in := decode(string(md.Data))
					// Inputs the parser flags as corner cases are skipped
					// rather than compared against goldmark.
					_, corner := p.parse(in)
					if corner {
						t.Skip("known corner case")
					}
					gm := goldmarkParser(&p)
					var buf bytes.Buffer
					if err := gm.Convert([]byte(in), &buf); err != nil {
						t.Fatal(err)
					}
					// Normalize goldmark's output: ensure a trailing newline
					// and compare with " />" rewritten to ">" on both sides.
					if buf.Len() > 0 && buf.Bytes()[buf.Len()-1] != '\n' {
						buf.WriteByte('\n')
					}
					want := decode(string(html.Data))
					want = strings.ReplaceAll(want, " />", ">")
					out := buf.String()
					out = strings.ReplaceAll(out, " />", ">")
					q := strings.ReplaceAll(url.QueryEscape(decode(string(md.Data))), "+", "%20")
					if out != want {
						t.Fatalf("\n    - input: ``%q``\n    - output: ``%q``\n    - golden: ``%q``\n    - [dingus](https://spec.commonmark.org/dingus/?text=%s)\n    - [github](https://github.com/rsc/tmp/issues/new?body=%s)", in, out, want, q, q)
					}
					npass++

				})
			}
			// NOTE(review): npass is incremented by both the plain and the
			// goldmark subtest, so with -goldmark it can exceed ncase.
			t.Logf("%d/%d pass", npass, ncase)
		})
	}
}
205 | 
206 | func goldmarkParser(p *Parser) goldmark.Markdown {
207 | 	opts := []goldmark.Option{
208 | 		goldmark.WithRendererOptions(ghtml.WithUnsafe()),
209 | 	}
210 | 	if p.HeadingID {
211 | 		opts = append(opts, goldmark.WithParserOptions(gparser.WithHeadingAttribute()))
212 | 	}
213 | 	if p.Strikethrough {
214 | 		opts = append(opts, goldmark.WithExtensions(gext.Strikethrough))
215 | 	}
216 | 	if p.TaskList {
217 | 		opts = append(opts, goldmark.WithExtensions(gext.TaskList))
218 | 	}
219 | 	if p.AutoLinkText {
220 | 		opts = append(opts, goldmark.WithExtensions(gext.Linkify))
221 | 	}
222 | 	if p.Table {
223 | 		opts = append(opts, goldmark.WithExtensions(gext.Table))
224 | 	}
225 | 	return goldmark.New(opts...)
226 | }
227 | 
// decode expands the escape notation used in the testdata files:
// "^J" before a newline is dropped, "^M" becomes a carriage return,
// "^D" followed by a newline is removed entirely, and "^@" becomes
// a NUL byte.
func decode(s string) string {
	// The substitutions are applied one after another, in this order.
	substitutions := [...][2]string{
		{"^J\n", "\n"},
		{"^M", "\r"},
		{"^D\n", ""},
		{"^@", "\x00"},
	}
	for _, sub := range substitutions {
		s = strings.ReplaceAll(s, sub[0], sub[1])
	}
	return s
}
235 | 
// encode rewrites s into the escape notation used in the testdata files:
// carriage returns become "^M", a space or tab at the end of a line is
// followed by "^J", NUL bytes become "^@", and a non-empty result that
// does not end in a newline gets "^D" plus a newline appended.
func encode(s string) string {
	// The substitutions are applied one after another, in this order;
	// "\r\n" must be handled before a lone "\r".
	substitutions := [...][2]string{
		{"\r\n", "^M\n"},
		{"\r", "^M^D\n"},
		{" \n", " ^J\n"},
		{"\t\n", "\t^J\n"},
		{"\x00", "^@"},
	}
	for _, sub := range substitutions {
		s = strings.ReplaceAll(s, sub[0], sub[1])
	}
	if len(s) > 0 && s[len(s)-1] != '\n' {
		s += "^D\n"
	}
	return s
}
247 | 
248 | func parseParser(t *testing.T, data []byte) Parser {
249 | 	d := json.NewDecoder(bytes.NewReader(data))
250 | 	d.DisallowUnknownFields()
251 | 	var p Parser
252 | 	err := d.Decode(&p)
253 | 	if err != nil {
254 | 		t.Fatalf("reading parser.json: %v", err)
255 | 	}
256 | 	err = d.Decode(new(json.RawMessage))
257 | 	if err != io.EOF {
258 | 		t.Fatalf("junk on end of parser.json")
259 | 	}
260 | 	return p
261 | }
262 | 
// TestFormat checks the Markdown formatter in two ways.
//
// First, every testdata/*_fmt.txt txtar archive is read as a sequence of
// Markdown documents, each optionally followed by a file named "want".
// Formatting a document must produce the "want" text, or the document
// itself when no "want" follows. A parser.json file replaces the Parser
// configuration for the cases after it.
//
// Second, every testdata/*.md file must format to itself; on a mismatch
// the formatted output is written next to the input with a ".have" suffix.
func TestFormat(t *testing.T) {
	files, err := filepath.Glob(filepath.Join("testdata", "*_fmt.txt"))
	if err != nil {
		t.Fatal(err)
	}
	for _, file := range files {
		t.Run(strings.TrimSuffix(filepath.Base(file), ".txt"), func(t *testing.T) {
			a, err := txtar.ParseFile(file)
			if err != nil {
				t.Fatal(err)
			}
			// p starts as the zero Parser and is replaced by each parser.json.
			var p Parser
			for i := 0; i < len(a.Files); {
				if a.Files[i].Name == "parser.json" {
					p = parseParser(t, a.Files[i].Data)
					i++
					continue
				}
				// Each test case is a single markdown document that should render either as itself,
				// or if followed by a file named "want", then by that file.
				name := a.Files[i].Name
				in := a.Files[i].Data
				wantb := in
				i++
				if i < len(a.Files) && a.Files[i].Name == "want" {
					wantb = a.Files[i].Data
					i++
				}
				t.Run(name, func(t *testing.T) {
					doc := p.Parse(decode(string(in)))
					want := decode(string(wantb))
					docWant := p.Parse(want)
					// Sanity-check the test data itself: input and want must
					// describe the same document, judged by their HTML.
					if ToHTML(doc) != ToHTML(docWant) {
						t.Errorf("bad testdata: input and want are different markdown documents:\ninput:\n%s\n\nwant:\n%s", dump(doc), dump(docWant))
					}
					h := Format(doc)
					// NOTE(review): h is encoded here while want was decoded above,
					// so the two only compare equal when the text contains nothing
					// that encode/decode rewrite — confirm this is intended.
					h = encode(h)
					if h != want {
						t.Errorf("input %q\nparse: \n%s\nhave %q\nwant %q", in, dump(doc), h, want)
					}
				})
			}
		})
	}

	// Files ending in ".md" should render as themselves.
	files, err = filepath.Glob(filepath.Join("testdata", "*.md"))
	if err != nil {
		t.Fatal(err)
	}
	for _, file := range files {
		t.Run(strings.TrimSuffix(filepath.Base(file), ".md"), func(t *testing.T) {
			data, err := os.ReadFile(file)
			if err != nil {
				t.Fatal(err)
			}
			w := string(data)
			// These files are parsed with the zero Parser (no extensions enabled).
			var p Parser
			doc := p.Parse(w)
			h := Format(doc)
			if h != w {
				t.Errorf("have:\n%s\nwant:\n%s", h, w)
				// Save the actual output beside the input to make diffing easy.
				outfile := file + ".have"
				t.Logf("writing have to %s", outfile)
				if err := os.WriteFile(outfile, []byte(h), 0666); err != nil {
					t.Fatal(err)
				}
			}
		})
	}
}
334 | 
335 | func TestInline(t *testing.T) {
336 | 	// Test that these don't crash,
337 | 	// and also "cover" the bodies.
338 | 	new(HardBreak).Inline()
339 | 	new(SoftBreak).Inline()
340 | 	new(HTMLTag).Inline()
341 | 	new(Plain).Inline()
342 | 	new(Code).Inline()
343 | 	new(Strong).Inline()
344 | 	new(Del).Inline()
345 | 	new(Emph).Inline()
346 | 	new(Emoji).Inline()
347 | 	new(AutoLink).Inline()
348 | 	new(Link).Inline()
349 | 	new(Image).Inline()
350 | 	new(Task).Inline()
351 | }
352 | 
// findUnexported walks v looking for a value whose type is a named,
// unexported type, or a struct that declares an unexported field.
// It follows non-nil interfaces and pointers, struct fields,
// and the elements of slices and arrays.
//
// It returns the first offending value and true,
// or v itself and false when nothing was found.
func findUnexported(v reflect.Value) (reflect.Value, bool) {
	typ := v.Type()
	if typ.PkgPath() != "" && !token.IsExported(typ.Name()) {
		return v, true
	}

	switch v.Kind() {
	case reflect.Interface, reflect.Pointer:
		// Nothing to follow behind a nil interface or pointer.
		if v.IsNil() {
			break
		}
		if found, ok := findUnexported(v.Elem()); ok {
			return found, true
		}

	case reflect.Struct:
		for i, n := 0, typ.NumField(); i < n; i++ {
			// An unexported field makes the struct itself the culprit.
			if !typ.Field(i).IsExported() {
				return v, true
			}
			if found, ok := findUnexported(v.Field(i)); ok {
				return found, true
			}
		}

	case reflect.Slice, reflect.Array:
		for i, n := 0, v.Len(); i < n; i++ {
			if found, ok := findUnexported(v.Index(i)); ok {
				return found, true
			}
		}
	}
	return v, false
}
382 | 
383 | var (
384 | 	blockType   = reflect.TypeOf(new(Block)).Elem()
385 | 	blocksType  = reflect.TypeOf(new([]Block)).Elem()
386 | 	inlinesType = reflect.TypeOf(new(Inlines)).Elem()
387 | )
388 | 
389 | func printb(buf *bytes.Buffer, b Block, prefix string) {
390 | 	fmt.Fprintf(buf, "(%T", b)
391 | 	v := reflect.ValueOf(b)
392 | 	v = reflect.Indirect(v)
393 | 	if v.Kind() != reflect.Struct {
394 | 		fmt.Fprintf(buf, " %v", b)
395 | 	}
396 | 	t := v.Type()
397 | 	for i := 0; i < t.NumField(); i++ {
398 | 		tf := t.Field(i)
399 | 		if !tf.IsExported() {
400 | 			continue
401 | 		}
402 | 		if tf.Type == inlinesType {
403 | 			printis(buf, v.Field(i).Interface().(Inlines))
404 | 		} else if tf.Type.Kind() == reflect.Slice && tf.Type.Elem().Kind() == reflect.String {
405 | 			fmt.Fprintf(buf, " %s:%q", tf.Name, v.Field(i))
406 | 		} else if tf.Type != blocksType && !tf.Type.Implements(blockType) && tf.Type.Kind() != reflect.Slice {
407 | 			fmt.Fprintf(buf, " %s:%v", tf.Name, v.Field(i))
408 | 		}
409 | 	}
410 | 
411 | 	prefix += "\t"
412 | 	for i := 0; i < t.NumField(); i++ {
413 | 		tf := t.Field(i)
414 | 		if !tf.IsExported() {
415 | 			continue
416 | 		}
417 | 		if tf.Type.Implements(blockType) {
418 | 			fmt.Fprintf(buf, "\n%s", prefix)
419 | 			printb(buf, v.Field(i).Interface().(Block), prefix)
420 | 		} else if tf.Type == blocksType {
421 | 			vf := v.Field(i)
422 | 			for i := 0; i < vf.Len(); i++ {
423 | 				fmt.Fprintf(buf, "\n%s", prefix)
424 | 				printb(buf, vf.Index(i).Interface().(Block), prefix)
425 | 			}
426 | 		} else if tf.Type.Kind() == reflect.Slice && tf.Type != inlinesType && tf.Type.Elem().Kind() != reflect.String {
427 | 			fmt.Fprintf(buf, "\n%s%s:", prefix, t.Field(i).Name)
428 | 			printslice(buf, v.Field(i), prefix)
429 | 		}
430 | 	}
431 | 	fmt.Fprintf(buf, ")")
432 | }
433 | 
434 | func printslice(buf *bytes.Buffer, v reflect.Value, prefix string) {
435 | 	if v.Type().Elem().Kind() == reflect.Slice {
436 | 		for i := 0; i < v.Len(); i++ {
437 | 			fmt.Fprintf(buf, "\n%s#%d:", prefix, i)
438 | 			printslice(buf, v.Index(i), prefix+"\t")
439 | 		}
440 | 		return
441 | 	}
442 | 	for i := 0; i < v.Len(); i++ {
443 | 		fmt.Fprintf(buf, " ")
444 | 		printb(buf, v.Index(i).Interface().(Block), prefix+"\t")
445 | 	}
446 | }
447 | 
448 | func printi(buf *bytes.Buffer, in Inline) {
449 | 	fmt.Fprintf(buf, "%T(", in)
450 | 	v := reflect.ValueOf(in).Elem()
451 | 	label := v.FieldByName("Label")
452 | 	if label.IsValid() {
453 | 		fmt.Fprintf(buf, "%q", label)
454 | 	}
455 | 	text := v.FieldByName("Text")
456 | 	if text.IsValid() {
457 | 		fmt.Fprintf(buf, "%q", text)
458 | 	}
459 | 	inner := v.FieldByName("Inner")
460 | 	if inner.IsValid() {
461 | 		printis(buf, inner.Interface().(Inlines))
462 | 	}
463 | 	buf.WriteString(")")
464 | }
465 | 
466 | func printis(buf *bytes.Buffer, ins []Inline) {
467 | 	for _, in := range ins {
468 | 		buf.WriteByte(' ')
469 | 		printi(buf, in)
470 | 	}
471 | }
472 | 
473 | func dump(b Block) string {
474 | 	var buf bytes.Buffer
475 | 	printb(&buf, b, "")
476 | 	return buf.String()
477 | }
478 | 


--------------------------------------------------------------------------------
/html.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2021 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package markdown
  6 | 
  7 | import (
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"unicode"
 11 | )
 12 | 
// An HTMLBlock is a [Block] representing an [HTML block].
//
// The block's content is kept as the original lines of text;
// printHTML writes each line followed by a newline.
//
// [HTML block]: https://spec.commonmark.org/0.31.2/#html-blocks
type HTMLBlock struct {
	// Position is embedded; htmlBuilder.build fills it from the parser's pos method.
	Position
	// TODO should these be 'Text string'?
	Text []string // lines, without trailing newlines
}
 21 | 
// Block is an empty marker method; it has no behavior of its own.
func (*HTMLBlock) Block() {}
 23 | 
 24 | func (b *HTMLBlock) printHTML(p *printer) {
 25 | 	for _, s := range b.Text {
 26 | 		p.html(s)
 27 | 		p.html("\n")
 28 | 	}
 29 | }
 30 | 
 31 | func (b *HTMLBlock) printMarkdown(p *printer) {
 32 | 	p.maybeNL()
 33 | 	for i, line := range b.Text {
 34 | 		if i > 0 {
 35 | 			p.nl()
 36 | 		}
 37 | 		p.WriteString(line)
 38 | 		p.noTrim()
 39 | 	}
 40 | }
 41 | 
// An htmlBuilder is a [blockBuilder] for an [HTMLBlock].
// If endBlank is true, the block ends immediately before the first blank line.
// If endFunc is non-nil, the block ends immediately after the first line
// for which endFunc returns true.
type htmlBuilder struct {
	endBlank bool              // stop (without consuming) at the first blank line
	endFunc  func(string) bool // if non-nil, reports whether a line just added is the last one
	text     []string          // accumulated text, one entry per line
}
 51 | 
 52 | func (c *htmlBuilder) extend(p *parser, s line) (line, bool) {
 53 | 	if c.endBlank && s.isBlank() {
 54 | 		return s, false
 55 | 	}
 56 | 	t := s.string()
 57 | 	c.text = append(c.text, t)
 58 | 	if c.endFunc != nil && c.endFunc(t) {
 59 | 		return line{}, false
 60 | 	}
 61 | 	return line{}, true
 62 | }
 63 | 
 64 | func (c *htmlBuilder) build(p *parser) Block {
 65 | 	return &HTMLBlock{
 66 | 		p.pos(),
 67 | 		c.text,
 68 | 	}
 69 | }
 70 | 
// An HTMLTag is an [Inline] representing a [raw HTML tag].
//
// The inline parsers in this file produce an HTMLTag for open tags,
// closing tags, comments, processing instructions, declarations,
// and CDATA sections.
//
// [raw HTML tag]: https://spec.commonmark.org/0.31.2/#raw-html
type HTMLTag struct {
	// Text is the tag exactly as it appeared in the input,
	// from the opening < through the closing >.
	Text string // TODO rename to HTML?
}
 77 | 
// Inline is an empty marker method; it has no behavior of its own.
func (*HTMLTag) Inline() {}
 79 | 
// printHTML passes the tag's text to p.html unchanged.
func (x *HTMLTag) printHTML(p *printer) {
	p.html(x.Text)
}
 83 | 
 84 | func (x *HTMLTag) printMarkdown(p *printer) {
 85 | 	// TODO are there newlines? probably not
 86 | 	for i, line := range strings.Split(x.Text, "\n") {
 87 | 		if i > 0 {
 88 | 			p.nl()
 89 | 		}
 90 | 		p.WriteString(line)
 91 | 		p.noTrim()
 92 | 	}
 93 | }
 94 | 
// printText prints nothing: an HTMLTag contributes no text.
func (x *HTMLTag) printText(p *printer) {}
 96 | 
 97 | // startHTMLBlock is a [starter] for an [HTMLBlock].
 98 | //
 99 | // See https://spec.commonmark.org/0.31.2/#html-blocks.
100 | func startHTMLBlock(p *parser, s line) (line, bool) {
101 | 	// Early out: block must start with a <.
102 | 	tt := s
103 | 	tt.trimSpace(0, 3, false) // TODO figure out trimSpace final argument
104 | 	if tt.peek() != '<' {
105 | 		return s, false
106 | 	}
107 | 	t := tt.string()
108 | 
109 | 	// Check all 7 block types.
110 | 	if startHTMLBlock1(p, s, t) ||
111 | 		startHTMLBlock2345(p, s, t) ||
112 | 		startHTMLBlock6(p, s, t) ||
113 | 		startHTMLBlock7(p, s, t) {
114 | 		return line{}, true
115 | 	}
116 | 
117 | 	return s, false
118 | }
119 | 
// forceLower is the single bit in which an ASCII upper-case letter
// differs from its lower-case form.
// It is only meaningful for letters: OR-ing it into other bytes
// can produce unrelated characters (for example '@' becomes '`').
const forceLower = 0x20 // ASCII letter | forceLower == ASCII lower-case
121 | 
122 | // startHTMLBlock1 handles HTML block type 1:
123 | // line starting with <pre, <script, <style, or <textarea
124 | // up through a (not necessarily matching) closing </pre> </script> </style> or </textarea>.
125 | //
126 | // s is the entire line, for saving if starting a block.
127 | // t is the line as a string, with leading spaces removed; it starts with <.
128 | func startHTMLBlock1(p *parser, s line, t string) bool {
129 | 	if len(t) < 2 {
130 | 		return false
131 | 	}
132 | 	if c := t[1] | forceLower; c != 'p' && c != 's' && c != 't' { // early out; check first letter
133 | 		return false
134 | 	}
135 | 	i := 2
136 | 	for i < len(t) && (t[i] != ' ' && t[i] != '\t' && t[i] != '>') {
137 | 		i++
138 | 	}
139 | 	if !isBlock1Tag(t[1:i]) {
140 | 		return false
141 | 	}
142 | 	b := &htmlBuilder{endFunc: endBlock1}
143 | 	p.addBlock(b)
144 | 	b.text = append(b.text, s.string())
145 | 	if endBlock1(t) {
146 | 		p.closeBlock()
147 | 	}
148 | 	return true
149 | }
150 | 
151 | // endBlock1 reports whether the string contains
152 | // </pre>, </script>, </style>, or </textarea>,
153 | // using ASCII case-insensitive matching.
154 | func endBlock1(s string) bool {
155 | 	start := -1
156 | 	for i := 0; i < len(s); i++ {
157 | 		if s[i] == '<' && i+1 < len(s) && s[i+1] == '/' {
158 | 			start = i + 2
159 | 		}
160 | 		if s[i] == '>' && start >= 0 {
161 | 			if isBlock1Tag(s[start:i]) {
162 | 				return true
163 | 			}
164 | 			start = -1
165 | 		}
166 | 	}
167 | 	return false
168 | }
169 | 
170 | // isBlock1Tag reports whether tag is a tag that can open or close
171 | // HTML block type 1.
172 | func isBlock1Tag(tag string) bool {
173 | 	return lowerEq(tag, "pre") || lowerEq(tag, "script") || lowerEq(tag, "style") || lowerEq(tag, "textarea")
174 | }
175 | 
176 | // lowerEq reports whether strings.ToLower(s) == lower
177 | // assuming lower is entirely ASCII lower-case letters.
178 | func lowerEq(s, lower string) bool {
179 | 	if len(s) != len(lower) {
180 | 		return false
181 | 	}
182 | 	lower = lower[:len(s)]
183 | 	for i := 0; i < len(s); i++ {
184 | 		if s[i]|forceLower != lower[i] {
185 | 			return false
186 | 		}
187 | 	}
188 | 	return true
189 | }
190 | 
191 | // startHTMLBlock2345 handles HTML blocks types 2, 3, 4, and 5,
192 | // the ones that start and end a specific string constant.
193 | //
194 | // s is the entire line, for saving if starting a block.
195 | // t is the line as a string, with leading spaces removed; it starts with <.
196 | func startHTMLBlock2345(p *parser, s line, t string) bool {
197 | 	var end string
198 | 	switch {
199 | 	default:
200 | 		return false
201 | 
202 | 	// type 2: <!-- .. -->, or <!--> or <!---> because of simplistic parsing.
203 | 	case strings.HasPrefix(t, "<!--"): // type 2
204 | 		end = "-->"
205 | 
206 | 	// type 3: <? ... ?>, or <?> because of simplistic parsing.
207 | 	case strings.HasPrefix(t, "<?"): // type 3
208 | 		end = "?>"
209 | 
210 | 	// type 4: <![CDATA[ ... ]]>
211 | 	case strings.HasPrefix(t, "<![CDATA["):
212 | 		end = "]]>"
213 | 
214 | 	// type 5: <!TEXT .. >
215 | 	// The spec says nothing about requiring a leading upper-case letter,
216 | 	// only that it should be an ASCII letter, but cmark-gfm, Goldmark,
217 | 	// and the Dingus all require upper-case, so we do too.
218 | 	// Presumably this is because the actual goal is to recognize the few
219 | 	// XML definitions that can appear, and they are all upper-case.
220 | 	// The result is that <!X> is an HTMLBlock but <!x> is an HTMLTag.
221 | 	// That's inconsistent, but Markdown is full of them, so we prioritize
222 | 	// consistency with all the existing implementations.
223 | 	case strings.HasPrefix(t, "<!") && len(t) >= 3 && 'A' <= t[2] && t[2] <= 'Z':
224 | 		end = ">"
225 | 	}
226 | 
227 | 	b := &htmlBuilder{endFunc: func(s string) bool { return strings.Contains(s, end) }}
228 | 	p.addBlock(b)
229 | 	b.text = append(b.text, s.string())
230 | 	if b.endFunc(t) {
231 | 		// If terminator appears on the starting line, we're done.
232 | 		p.closeBlock()
233 | 	}
234 | 	return true
235 | }
236 | 
237 | // startHTMLBlock6 handles HTML block type 6,
238 | // which starts with the start of a recognized tag
239 | // and ends at a blank line.
240 | //
241 | // s is the entire line, for saving if starting a block.
242 | // t is the line as a string, with leading spaces removed; it starts with <.
243 | func startHTMLBlock6(p *parser, s line, t string) bool {
244 | 	// Skip over < or </.
245 | 	start := 1
246 | 	if len(t) > 1 && t[1] == '/' {
247 | 		start = 2
248 | 	}
249 | 
250 | 	// Scan ASCII alphanumeric tag name;
251 | 	// must be followed by space, tab, >, />, or end of line.
252 | 	end := start
253 | 	for end < len(t) && end < 16 && isLetterDigit(t[end]) {
254 | 		end++
255 | 	}
256 | 	if end < len(t) {
257 | 		switch t[end] {
258 | 		default:
259 | 			return false
260 | 		case ' ', '\t', '>':
261 | 			// ok
262 | 		case '/':
263 | 			if end+1 >= len(t) || t[end+1] != '>' {
264 | 				return false
265 | 			}
266 | 		}
267 | 	}
268 | 
269 | 	// Check whether tag is a recognized name.
270 | 	tag := t[start:end]
271 | 	if tag == "" {
272 | 		return false
273 | 	}
274 | 	c := tag[0] | forceLower
275 | 	for _, name := range htmlTags {
276 | 		if name[0] == c && len(name) == len(tag) && lowerEq(tag, name) {
277 | 			if end < len(t) && t[end] == '\t' {
278 | 				// Goldmark recognizes space but not tab.
279 | 				// testdata/extra.txt 143.md
280 | 				p.corner = true
281 | 			}
282 | 			b := &htmlBuilder{endBlank: true}
283 | 			p.addBlock(b)
284 | 			b.text = append(b.text, s.string())
285 | 			return true
286 | 		}
287 | 	}
288 | 	return false
289 | }
290 | 
291 | // startHTMLBlock7 handles HTML block type 7,
292 | // which starts with a complete tag on a line by itself
293 | // and ends at a blank line.
294 | //
295 | // s is the entire line, for saving if starting a block.
296 | // t is the line as a string, with leading spaces removed; it starts with <.
297 | func startHTMLBlock7(p *parser, s line, t string) bool {
298 | 	// Type 7 blocks cannot interrupt a paragraph,
299 | 	// so that rewrapping a paragraph with inline tags
300 | 	// cannot change them into starting an HTML block.
301 | 	if p.para() != nil {
302 | 		return false
303 | 	}
304 | 
305 | 	if _, end, ok := parseHTMLOpenTag(p, t, 0); ok && skipSpace(t, end) == len(t) {
306 | 		if end != len(t) {
307 | 			// Goldmark disallows trailing space
308 | 			p.corner = true
309 | 		}
310 | 		b := &htmlBuilder{endBlank: true}
311 | 		p.addBlock(b)
312 | 		b.text = append(b.text, s.string())
313 | 		return true
314 | 	}
315 | 	if _, end, ok := parseHTMLClosingTag(p, t, 0); ok && skipSpace(t, end) == len(t) {
316 | 		b := &htmlBuilder{endBlank: true}
317 | 		p.addBlock(b)
318 | 		b.text = append(b.text, s.string())
319 | 		return true
320 | 	}
321 | 	return false
322 | }
323 | 
324 | // parseHTMLTag is an [inlineParser] for an [HTMLTag].
325 | // The caller has has checked that s[start] is '<'.
326 | func parseHTMLTag(p *parser, s string, start int) (x Inline, end int, ok bool) {
327 | 	// “An HTML tag consists of an open tag, a closing tag, an HTML comment,
328 | 	// a processing instruction, a declaration, or a CDATA section.”
329 | 	if len(s)-start < 3 || s[start] != '<' {
330 | 		return
331 | 	}
332 | 	switch s[start+1] {
333 | 	default:
334 | 		return parseHTMLOpenTag(p, s, start)
335 | 	case '/':
336 | 		return parseHTMLClosingTag(p, s, start)
337 | 	case '!':
338 | 		switch s[start+2] {
339 | 		case '-':
340 | 			return parseHTMLComment(p, s, start)
341 | 		case '[':
342 | 			return parseHTMLCDATA(p, s, start)
343 | 		default:
344 | 			return parseHTMLDecl(p, s, start)
345 | 		}
346 | 	case '?':
347 | 		return parseHTMLProcInst(p, s, start)
348 | 	}
349 | }
350 | 
351 | // parseHTMLOpenTag is an [inlineParser] for an HTML open tag.
352 | // The caller has has checked that s[start] is '<'.
353 | func parseHTMLOpenTag(p *parser, s string, i int) (x Inline, end int, ok bool) {
354 | 	// “An open tag consists of a < character, a tag name, zero or more attributes,
355 | 	// optional spaces, tabs, and up to one line ending, an optional / character, and a > character.”
356 | 
357 | 	// < character
358 | 	if i >= len(s) || s[i] != '<' {
359 | 		// unreachable unless called wrong
360 | 		return
361 | 	}
362 | 
363 | 	// tag name
364 | 	name, j, ok1 := parseTagName(s, i+1)
365 | 	if !ok1 {
366 | 		return
367 | 	}
368 | 	switch name {
369 | 	case "pre", "script", "style", "textarea":
370 | 		// Goldmark treats these as starting a new HTMLBlock
371 | 		// and ending the paragraph they appear in.
372 | 		p.corner = true
373 | 	}
374 | 
375 | 	// zero or more attributes
376 | 	for {
377 | 		if j >= len(s) || s[j] != ' ' && s[j] != '\t' && s[j] != '\n' && s[j] != '/' && s[j] != '>' {
378 | 			return
379 | 		}
380 | 		_, k, ok := parseAttr(p, s, skipSpace(s, j))
381 | 		if !ok {
382 | 			break
383 | 		}
384 | 		j = k
385 | 	}
386 | 
387 | 	// optional spaces, tabs, and up to one line ending
388 | 	k := skipSpace(s, j)
389 | 	if k != j {
390 | 		// Goldmark mishandles spaces before >.
391 | 		p.corner = true
392 | 	}
393 | 	j = k
394 | 
395 | 	// an optional / character
396 | 	if j < len(s) && s[j] == '/' {
397 | 		j++
398 | 	}
399 | 
400 | 	// and a > character.
401 | 	if j >= len(s) || s[j] != '>' {
402 | 		return
403 | 	}
404 | 
405 | 	return &HTMLTag{s[i : j+1]}, j + 1, true
406 | }
407 | 
408 | // parseHTMLClosingTag is an [inlineParser] for an HTML closing tag.
409 | // The caller has has checked that s[start:] begins with "</".
410 | func parseHTMLClosingTag(p *parser, s string, i int) (x Inline, end int, ok bool) {
411 | 	// “A closing tag consists of the string </, a tag name,
412 | 	// optional spaces, tabs, and up to one line ending, and the character >.”
413 | 	if i+2 >= len(s) || s[i] != '<' || s[i+1] != '/' {
414 | 		return
415 | 	}
416 | 	if skipSpace(s, i+2) != i+2 {
417 | 		// Goldmark allows spaces here but the spec and the Dingus do not.
418 | 		p.corner = true
419 | 	}
420 | 
421 | 	if _, j, ok := parseTagName(s, i+2); ok {
422 | 		j = skipSpace(s, j)
423 | 		if j < len(s) && s[j] == '>' {
424 | 			return &HTMLTag{s[i : j+1]}, j + 1, true
425 | 		}
426 | 	}
427 | 	return
428 | }
429 | 
430 | // parseTagName parses a leading tag name from s[start:],
431 | // returning the tag and the end location.
432 | func parseTagName(s string, start int) (tag string, end int, ok bool) {
433 | 	// “A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).”
434 | 	if start >= len(s) || !isLetter(s[start]) {
435 | 		return
436 | 	}
437 | 	end = start + 1
438 | 	for end < len(s) && isLDH(s[end]) {
439 | 		end++
440 | 	}
441 | 	return s[start:end], end, true
442 | }
443 | 
444 | // parseAttr parses a leading attr (or attr=value) from s[start:],
445 | // returning the entire attribute (including the =value) and the end location.
446 | func parseAttr(p *parser, s string, start int) (attr string, end int, ok bool) {
447 | 	// “An attribute consists of spaces, tabs, and up to one line ending,
448 | 	// an attribute name, and an optional attribute value specification.”
449 | 	_, end, ok = parseAttrName(s, start)
450 | 	if !ok {
451 | 		return
452 | 	}
453 | 	if endVal, ok := parseAttrValueSpec(p, s, end); ok {
454 | 		end = endVal
455 | 	}
456 | 	return s[start:end], end, true
457 | }
458 | 
459 | // parseAttrName parses a leading attribute name from s[start:],
460 | // returning the name and the end location.
461 | func parseAttrName(s string, start int) (name string, end int, ok bool) {
462 | 	// “An attribute name consists of an ASCII letter, _, or :,
463 | 	// followed by zero or more ASCII letters, digits, _, ., :, or -.”
464 | 	if start+1 >= len(s) || (!isLetter(s[start]) && s[start] != '_' && s[start] != ':') {
465 | 		return
466 | 	}
467 | 	end = start + 1
468 | 	for end < len(s) && (isLDH(s[end]) || s[end] == '_' || s[end] == '.' || s[end] == ':') {
469 | 		end++
470 | 	}
471 | 	return s[start:end], end, true
472 | }
473 | 
474 | // parseAttrValueSpec parses a leading attribute value specification
475 | // from s[start:], returning the end location.
476 | func parseAttrValueSpec(p *parser, s string, start int) (end int, ok bool) {
477 | 	// “An attribute value specification consists of
478 | 	// optional spaces, tabs, and up to one line ending,
479 | 	// a = character,
480 | 	// optional spaces, tabs, and up to one line ending,
481 | 	// and an attribute value.”
482 | 	end = skipSpace(s, start)
483 | 	if end >= len(s) || s[end] != '=' {
484 | 		return
485 | 	}
486 | 	end = skipSpace(s, end+1)
487 | 
488 | 	// “An attribute value consists of
489 | 	// an unquoted attribute value,
490 | 	// a single-quoted attribute value,
491 | 	// or a double-quoted attribute value.”
492 | 	// TODO: No escaping???
493 | 	if end < len(s) && (s[end] == '\'' || s[end] == '"') {
494 | 		// “A single-quoted attribute value consists of ',
495 | 		// zero or more characters not including ', and a final '.”
496 | 		// “A double-quoted attribute value consists of ",
497 | 		// zero or more characters not including ", and a final ".”
498 | 		i := strings.IndexByte(s[end+1:], s[end])
499 | 		if i < 0 {
500 | 			return
501 | 		}
502 | 		return end + 1 + i + 1, true
503 | 	}
504 | 
505 | 	// “An unquoted attribute value is a nonempty string of characters
506 | 	// not including spaces, tabs, line endings, ", ', =, <, >, or `.”
507 | 	isAttrVal := func(c byte) bool {
508 | 		return c != ' ' && c != '\t' && c != '\n' &&
509 | 			c != '"' && c != '\'' &&
510 | 			c != '=' && c != '<' && c != '>' && c != '`'
511 | 	}
512 | 	i := end
513 | 	for i < len(s) && isAttrVal(s[i]) {
514 | 		i++
515 | 	}
516 | 	if i == end {
517 | 		return
518 | 	}
519 | 	return i, true
520 | }
521 | 
522 | // parseHTMLComment is an [inlineParser] for an HTML comment.
523 | // The caller has has checked that s[start:] begins with "<!-".
524 | func parseHTMLComment(p *parser, s string, start int) (x Inline, end int, ok bool) {
525 | 	// “An HTML comment consists of <!-- + text + -->,
526 | 	// where text does not start with > or ->,
527 | 	// does not end with -, and does not contain --.”
528 | 	if strings.HasPrefix(s[start:], "<!-->") {
529 | 		end = start + len("<!-->")
530 | 		return &HTMLTag{s[start:end]}, end, true
531 | 	}
532 | 	if strings.HasPrefix(s[start:], "<!--->") {
533 | 		end = start + len("<!--->")
534 | 		return &HTMLTag{s[start:end]}, end, true
535 | 	}
536 | 	if x, end, ok := parseHTMLMarker(p, s, start, "<!--", "-->"); ok {
537 | 		return x, end, ok
538 | 	}
539 | 	return
540 | }
541 | 
// parseHTMLCDATA is an [inlineParser] for an HTML CDATA section.
// The caller has checked that s[i:] begins with "<![".
// The work is delegated to parseHTMLMarker, which verifies the
// full "<![CDATA[" prefix and looks for the "]]>" terminator.
func parseHTMLCDATA(p *parser, s string, i int) (x Inline, end int, ok bool) {
	// “A CDATA section consists of the string <![CDATA[,
	// a string of characters not including the string ]]>, and the string ]]>.”
	return parseHTMLMarker(p, s, i, "<![CDATA[", "]]>")
}
549 | 
550 | // parseHTMLDecl is an [inlineParser] for an HTML declaration section.
551 | // The caller has has checked that s[start:] begins with "<!".
552 | func parseHTMLDecl(p *parser, s string, i int) (x Inline, end int, ok bool) {
553 | 	// “A declaration consists of the string <!, an ASCII letter,
554 | 	// zero or more characters not including the character >, and the character >.”
555 | 	if i+2 < len(s) && isLetter(s[i+2]) {
556 | 		if 'a' <= s[i+2] && s[i+2] <= 'z' {
557 | 			p.corner = true // goldmark requires uppercase
558 | 		}
559 | 		return parseHTMLMarker(p, s, i, "<!", ">")
560 | 	}
561 | 	return
562 | }
563 | 
// parseHTMLProcInst is an [inlineParser] for an HTML processing instruction.
// The caller has checked that s[i:] begins with "<?".
// The work is delegated to parseHTMLMarker, which looks for the "?>" terminator.
func parseHTMLProcInst(p *parser, s string, i int) (x Inline, end int, ok bool) {
	// “A processing instruction consists of the string <?,
	// a string of characters not including the string ?>, and the string ?>.”
	return parseHTMLMarker(p, s, i, "<?", "?>")
}
571 | 
572 | // parseHTMLMarker is a generalized parser for the
573 | // various prefix/suffix-denote HTML markers.
574 | // If s[start:] starts with prefix and is followed eventually by suffix,
575 | // then parseHTMLMarker returns an HTMLTag for that section of s
576 | // along with start, end, ok to implement the result of an [inlineParser].
577 | func parseHTMLMarker(p *parser, s string, start int, prefix, suffix string) (x Inline, end int, ok bool) {
578 | 	if strings.HasPrefix(s[start:], prefix) {
579 | 		// To avoid quadratic behavior looking at <!-- <!-- <!-- <!-- ...
580 | 		// we record when a search for a terminator has failed on this line
581 | 		// and don't bother to search again.
582 | 		switch suffix[0] {
583 | 		case ']':
584 | 			if p.noCDATAEnd {
585 | 				return
586 | 			}
587 | 		case '>':
588 | 			if p.noDeclEnd {
589 | 				return
590 | 			}
591 | 		case '-':
592 | 			if p.noCommentEnd {
593 | 				return
594 | 			}
595 | 		case '?':
596 | 			if p.noProcInstEnd {
597 | 				return
598 | 			}
599 | 		}
600 | 
601 | 		if i := strings.Index(s[start+len(prefix):], suffix); i >= 0 {
602 | 			end = start + len(prefix) + i + len(suffix)
603 | 			return &HTMLTag{s[start:end]}, end, true
604 | 		}
605 | 
606 | 		p.noDeclEnd = true // no > on line
607 | 		switch suffix[0] {
608 | 		case ']':
609 | 			p.noCDATAEnd = true // no ]]> on line
610 | 		case '-':
611 | 			p.noCommentEnd = true // no --> on line
612 | 		case '?':
613 | 			p.noProcInstEnd = true // no ?> on line
614 | 		}
615 | 	}
616 | 	return
617 | }
618 | 
619 | // parseHTMLEntity is an [inlineParser] for an HTML entity reference,
620 | // such as &quot;, &#123;, or &#x12AB;.
621 | func parseHTMLEntity(_ *parser, s string, start int) (x Inline, end int, ok bool) {
622 | 	i := start
623 | 	if i+1 < len(s) && s[i+1] == '#' {
624 | 		i += 2
625 | 		var r int
626 | 		if i < len(s) && (s[i] == 'x' || s[i] == 'X') {
627 | 			// hex
628 | 			i++
629 | 			j := i
630 | 			for j < len(s) && isHexDigit(s[j]) {
631 | 				j++
632 | 			}
633 | 			if j-i < 1 || j-i > 6 || j >= len(s) || s[j] != ';' {
634 | 				return
635 | 			}
636 | 			r64, _ := strconv.ParseInt(s[i:j], 16, 0)
637 | 			r = int(r64)
638 | 			end = j + 1
639 | 		} else {
640 | 			// decimal
641 | 			j := i
642 | 			for j < len(s) && isDigit(s[j]) {
643 | 				j++
644 | 			}
645 | 			if j-i < 1 || j-i > 7 || j >= len(s) || s[j] != ';' {
646 | 				return
647 | 			}
648 | 			r, _ = strconv.Atoi(s[i:j])
649 | 			end = j + 1
650 | 		}
651 | 		if r > unicode.MaxRune || r == 0 {
652 | 			// Invalid code points and U+0000 are replaced by U+FFFD.
653 | 			r = unicode.ReplacementChar
654 | 		}
655 | 		return &Plain{string(rune(r))}, end, true
656 | 	}
657 | 
658 | 	// Max name in list is 32 bytes. Try for 64 for good measure.
659 | 	for j := i + 1; j < len(s) && j-i < 64; j++ {
660 | 		if s[j] == '&' { // Stop possible quadratic search on &&&&&&&.
661 | 			break
662 | 		}
663 | 		if s[j] == ';' {
664 | 			if r, ok := htmlEntity[s[i:j+1]]; ok {
665 | 				return &Plain{r}, j + 1, true
666 | 			}
667 | 			break
668 | 		}
669 | 	}
670 | 
671 | 	return
672 | }
673 | 


--------------------------------------------------------------------------------