├── .gitignore ├── Makefile ├── parser ├── nodetype.go ├── element.go ├── nodetext.go ├── nodeinfo.go ├── nodenormal.go ├── nodevoidelement.go ├── node.go └── ast.go ├── go.mod ├── tests └── components │ └── greeting.t.html ├── log.txt ├── logi └── logi.go ├── token ├── token.go ├── htmltoken_test.go └── htmltoken.go ├── main.go ├── .air.toml ├── stur └── stur.go ├── lexer ├── lexer_test.go └── lexer.go └── go.sum /.gitignore: -------------------------------------------------------------------------------- 1 | tmp -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | combine: 2 | find . -type f -name '*.go' -print0 | sort -z | xargs -0 cat > combined.go 3 | -------------------------------------------------------------------------------- /parser/nodetype.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type NodeType string 4 | 5 | const ( 6 | Root NodeType = "Root" 7 | Void NodeType = "Void" 8 | Normal NodeType = "Normal" 9 | ) -------------------------------------------------------------------------------- /parser/element.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type Element interface { 4 | GetTagName() string 5 | GetAttiribute(name string) (Attribute, bool) 6 | GetAttributes() []Attribute 7 | } -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/phillip-england/gtml 2 | 3 | go 1.23.3 4 | 5 | require ( 6 | github.com/PuerkitoBio/goquery v1.10.3 // indirect 7 | github.com/andybalholm/cascadia v1.3.3 // indirect 8 | golang.org/x/net v0.39.0 // indirect 9 | ) 10 | -------------------------------------------------------------------------------- /parser/nodetext.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type NodeText struct { 4 | Info *NodeInfo 5 | } 6 | 7 | func (n *NodeText) GetInfo() *NodeInfo { 8 | return n.Info 9 | } 10 | 11 | func NewNodeText(s string, t NodeType) *NodeText { 12 | info := NewNodeInfo(s, t) 13 | return &NodeText{ 14 | Info: info, 15 | } 16 | } -------------------------------------------------------------------------------- /parser/nodeinfo.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type NodeInfo struct { 4 | Value string 5 | Children []Node 6 | Type NodeType 7 | TextContent string 8 | } 9 | 10 | func NewNodeInfo(val string, t NodeType) *NodeInfo { 11 | return &NodeInfo{ 12 | Value: val, 13 | Children: make([]Node, 0), 14 | Type: t, 15 | } 16 | } -------------------------------------------------------------------------------- /parser/nodenormal.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type NodeNormal struct { 4 | Info *NodeInfo 5 | } 6 | 7 | func (n *NodeNormal) GetInfo() *NodeInfo { 8 | return n.Info 9 | } 10 | 11 | func NewNodeNormal(s string, t NodeType) Node { 12 | info := NewNodeInfo(s, t) 13 | return &NodeNormal{ 14 | Info: info, 15 | } 16 | } -------------------------------------------------------------------------------- /parser/nodevoidelement.go: -------------------------------------------------------------------------------- 1 | package parser 2 | 3 | type NodeVoid struct { 4 | Info *NodeInfo 5 | } 6 | 7 | func (n *NodeVoid) GetInfo() *NodeInfo { 8 | return n.Info 9 | } 10 | 11 | func NewNodeVoid(s string, t NodeType) Node { 12 | info := NewNodeInfo(s, t) 13 | return &NodeVoid{ 14 | Info: info, 15 | } 16 | } -------------------------------------------------------------------------------- /tests/components/greeting.t.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
%s age%
5 | 6 |%s friend.Name%
9 |you all can drink together
11 | ::? 12 |you all cannot drink together
13 |I am %s age% years old. How old are you?
19 |I am %s age% years old. How old are you?
28 | I am %s age% years old. How old are you? 29 | { } 30 | false 31 | -------------------------------------------------------------------------------- /logi/logi.go: -------------------------------------------------------------------------------- 1 | package logi 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | ) 7 | 8 | func Log(lines ...any) error { 9 | f, err := os.OpenFile("log.txt", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) 10 | if err != nil { 11 | return err 12 | } 13 | defer f.Close() 14 | 15 | for _, line := range lines { 16 | _, err := f.WriteString(fmt.Sprint(line) + "\n") 17 | if err != nil { 18 | return err 19 | } 20 | } 21 | return nil 22 | } 23 | 24 | func Clear() error { 25 | return os.WriteFile("log.txt", []byte{}, 0644) 26 | } 27 | -------------------------------------------------------------------------------- /token/token.go: -------------------------------------------------------------------------------- 1 | package token 2 | 3 | import "github.com/phillip-england/gtml/logi" 4 | 5 | type Token interface { 6 | GetLexeme() string 7 | GetType() HtmlTokenType 8 | GetLine() int 9 | GetColumn() int 10 | } 11 | 12 | func LogTokens(toks []Token) { 13 | logi.Log(Construct(toks)) 14 | for _, tok := range toks { 15 | logi.Log(tok.GetType(), tok.GetLexeme()) 16 | } 17 | } 18 | 19 | func Construct(toks[]Token) string { 20 | out := "" 21 | for _, tok := range toks { 22 | out += tok.GetLexeme() 23 | } 24 | return out 25 | } 26 | 27 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/phillip-england/gtml/logi" 7 | "github.com/phillip-england/gtml/parser" 8 | "github.com/phillip-england/gtml/token" 9 | ) 10 | 11 | func main() { 12 | 13 | logi.Clear() 14 | 15 | toks, err := token.TokenizeHtml([]rune(` 16 | 17 | 18 | 19 | 20 |I am %s age% years old. How old are you?
23 |Hello, World!
\n") 10 | l := NewLexer(input) 11 | // does stepping work as expected? 12 | l.Step() 13 | if l.Char() != "p" { 14 | t.Errorf(`expected lexer to be positioned at "p" but instead it is positioned at "%s"`, l.Char()) 15 | } 16 | // what if we step back? 17 | l.StepBack() 18 | if l.Char() != "<" { 19 | t.Errorf(`expected lexer to be positioned at "<" but instead it is positioned at "%s"`, l.Char()) 20 | } 21 | // what characters have been spent? (should be none) 22 | if len(l.Spent) != 0 { 23 | t.Errorf(`expected lexer to have 0 runes spent, but instead it had %d runes spent`, len(l.Spent)) 24 | } 25 | // but if we step again we should have a len of 1 26 | l.Step() 27 | if len(l.Spent) != 1 { 28 | t.Errorf(`expected lexer to have 1 rune spent, but instead it had %d runes spent`, len(l.Spent)) 29 | } 30 | // and the spent string should match 31 | if l.SpentString() != "<" { 32 | t.Errorf(`expected the lexers SpendString to equal "<" but it was "%s"`, l.SpentString()) 33 | } 34 | // and if we step wildly 35 | l.Step() 36 | l.Step() 37 | l.Step() 38 | l.StepBack() 39 | l.StepBack() 40 | l.Step() 41 | if l.SpentString() != "" { 42 | t.Errorf(`expected the lexers SpendString to equal "
" but it was "%s"`, l.SpentString()) 43 | } 44 | // and if we collect from mark (the marked pos should be 0) 45 | // we should get
H 46 | if string(l.CollectFromMark()) != "
H" { 47 | t.Errorf(`expected CollectFromMark to output "
H" but it output %s instead`, string(l.CollectFromMark())) 48 | } 49 | // now lets mark and step 50 | l.Mark() 51 | l.Step() 52 | if string(l.CollectFromMark()) != "He" { 53 | t.Errorf(`expected CollectFromMark to output "He" but instead it output %s`, string(l.CollectFromMark())) 54 | } 55 | // if we step and collect again we should get Hel 56 | l.Step() 57 | if string(l.CollectFromMark()) != "Hel" { 58 | t.Errorf(`expected CollectFromMark to output "Hel" but instead it output %s`, string(l.CollectFromMark())) 59 | } 60 | // jump to the "<" in "
" 61 | l.WalkUntil('<') 62 | if string(l.CollectFromMark()) != "Hello, World!<" { 63 | t.Errorf(`expected CollectFromMark to output "Hello, World<" but instead it output %s`, string(l.CollectFromMark())) 64 | } 65 | // we should still be on the first line 66 | if l.Line != 1 { 67 | t.Errorf(`expected to be on the first line, but we are actually on line %d`, l.Line) 68 | } 69 | // but after jumping to the next "<" we should be on the second line 70 | l.WalkUntil('<') 71 | if l.Line != 2 { 72 | t.Errorf(`expected to be on second line but we are actually on line: %d`, l.Line) 73 | } 74 | // and when we walk to the end the SpentString should equal the input 75 | l.WalkToEnd() 76 | if l.SpentString() != string(input) { 77 | t.Errorf(`expected the SpentString to be equal to the %s, but it was equal to %s`, string(input), l.SpentString()) 78 | } 79 | // we should be on line two column twenty 80 | if l.Line != 2 && l.Column != 20 { 81 | t.Errorf(`expected to be at line 2 and column 20 but found ourselves at line: %d and column: %d`, l.Line, l.Column) 82 | } 83 | // the current char should be ">" 84 | if l.Char() != ">" { 85 | t.Errorf(`expected the lexers char to be ">" but it was: %s`, l.Char()) 86 | } 87 | // go to the start 88 | l.WalkBackToStart() 89 | // the position should be 0 90 | if l.Pos != 0 { 91 | t.Errorf(`walked to start and expected 0 position but we got %d`, l.Pos) 92 | } 93 | // count all the "<" runes 94 | for { 95 | if l.Terminated { 96 | break 97 | } 98 | if l.Char() == "<" { 99 | l.Count() 100 | } 101 | l.Step() 102 | } 103 | // we should have 3 of them 104 | if l.GetCount('<') != 3 { 105 | t.Errorf(`counted all "<" runes and expected 3 but we have %d`, l.GetCount('<')) 106 | } 107 | // what if we set the position to 100000? 108 | l.Pos = 100000000 109 | // lets try to step 110 | l.Step() 111 | // our position should be reset back to the len of our source - 1 112 | if l.Pos != len(l.Source) -1 { 113 | t.Errorf(`expected our position to be 39 but it is %d`, l.Pos) 114 | } 115 | // we should be on the last char ">" 116 | if l.Char() != ">" { 117 | t.Errorf(`explicitly set position past len of input and then requested current char, expect the last char ">" but instead got: %s`, l.Char()) 118 | } 119 | // after we step back, our Position should get patched and equal 38 120 | l.StepBack() 121 | if l.Pos != 38 { 122 | t.Errorf(`expected Position to be 38 but it is %d`, l.Pos) 123 | } 124 | if l.Char() != "'" { 125 | t.Errorf(`expected Char to be ' but it is %s`, l.Char()) 126 | } 127 | // and if we set ourselves back to -100000 128 | l.Pos = -1000000000 129 | l.Step() 130 | l.StepBack() 131 | // we should be on the first char again 132 | if l.Char() != "<" { 133 | t.Errorf(`expected char to be "<" but it is %s`, l.Char()) 134 | } 135 | // and if we go to the end and mark 136 | l.WalkToEnd() 137 | l.Mark() 138 | // then walk back to the start 139 | l.WalkBackToStart() 140 | // our flushed buffer should equal the input 141 | if len(l.FlushFromMark()) != len(input) { 142 | t.Errorf(`FlushFromMark should be equal to the len of the input but its not input len is %d and FlushFromMark len is %d`, len(input), len(l.FlushFromMark())) 143 | } 144 | } -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= 2 | github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= 3 | github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= 4 | github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= 5 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 6 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 8 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 9 | golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= 10 | golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 11 | golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= 12 | golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= 13 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 14 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 15 | golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 16 | golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 17 | golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= 18 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 19 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 20 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 21 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 22 | golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= 23 | golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= 24 | golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 25 | golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= 26 | golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= 27 | golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= 28 | golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= 29 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 30 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 31 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 32 | golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 33 | golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 34 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 35 | golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 36 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 37 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 38 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 39 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 40 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 41 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 42 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 43 | golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 44 | golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 45 | golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 46 | golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 47 | golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= 48 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 49 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 50 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 51 | golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= 52 | golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= 53 | golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= 54 | golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= 55 | golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= 56 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 57 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 58 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 59 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 60 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 61 | golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= 62 | golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 63 | golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= 64 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= 65 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 66 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 67 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 68 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 69 | golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= 70 | golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= 71 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 72 | -------------------------------------------------------------------------------- /lexer/lexer.go: -------------------------------------------------------------------------------- 1 | package lexer 2 | 3 | // Lexer represents a simple rune-based lexer for walking over text input. 4 | type Lexer struct { 5 | Pos int // Current position in the source. 6 | Source []rune // Source runes being lexed. 7 | Current rune // Current rune being analyzed. 8 | Terminated bool // Whether lexer has finished walking through the source. 9 | Buffer []rune // Optional buffer to collect runes manually. 10 | MarkedPos int // Marked position for later collection. 11 | State string 12 | CharCounter map[rune]int // Track rune counts. 13 | Spent []rune 14 | Line int 15 | Column int 16 | } 17 | 18 | // NewLexer creates and initializes a new Lexer from the given rune slice. 19 | func NewLexer(runes []rune) *Lexer { 20 | l := &Lexer{ 21 | Pos: 0, 22 | Source: runes, 23 | MarkedPos: 0, 24 | CharCounter: make(map[rune]int), 25 | Spent: []rune{}, 26 | } 27 | if len(runes) == 0 { 28 | l.Current = 0 29 | l.Terminated = true 30 | } else { 31 | l.Current = l.Source[0] 32 | } 33 | return l 34 | } 35 | 36 | // Step advances the lexer to the next rune. 37 | func (l *Lexer) Step() { 38 | if l.Pos < 0 { 39 | l.Pos = 0 40 | } 41 | if l.Terminated { 42 | l.Pos = len(l.Source)-1 43 | return 44 | } 45 | if l.Pos+1 >= len(l.Source) { 46 | l.Terminated = true 47 | l.Pos = len(l.Source)-1 48 | return 49 | } 50 | l.Pos++ 51 | l.Current = l.Source[l.Pos] 52 | l.updateSpent() 53 | l.updateLineAndColumn() 54 | } 55 | 56 | 57 | // StepBack moves the lexer one rune backward. 58 | func (l *Lexer) StepBack() { 59 | if l.Pos <= 0 { 60 | l.Pos = 0 61 | return 62 | } 63 | if l.Pos > len(l.Source)-1 { 64 | l.Pos = len(l.Source)-1 65 | } 66 | l.Pos-- 67 | l.Current = l.Source[l.Pos] 68 | l.Terminated = false 69 | l.updateSpent() 70 | l.updateLineAndColumn() 71 | } 72 | 73 | 74 | // Mark saves the current position. 75 | func (l *Lexer) Mark() { 76 | l.MarkedPos = l.Pos 77 | } 78 | 79 | // JumpToMark repositions the lexer back to the last marked position. 80 | func (l *Lexer) JumpToMark() { 81 | if l.MarkedPos >= 0 && l.MarkedPos < len(l.Source) { 82 | l.Pos = l.MarkedPos 83 | l.Current = l.Source[l.Pos] 84 | l.Terminated = false 85 | l.updateSpent() 86 | l.updateLineAndColumn() 87 | } 88 | } 89 | 90 | 91 | // CollectFromMark returns all runes from MarkedPos up to current Pos. 92 | func (l *Lexer) CollectFromMark() []rune { 93 | if l.MarkedPos < 0 || l.Pos >= len(l.Source) { 94 | return nil 95 | } 96 | if l.Pos+1 > l.MarkedPos { 97 | return l.Source[l.MarkedPos : l.Pos+1] 98 | } else { 99 | return l.Source[l.Pos : l.MarkedPos+1] 100 | } 101 | } 102 | 103 | // Push adds the current rune to the buffer. 104 | func (l *Lexer) Push() { 105 | l.Buffer = append(l.Buffer, l.Current) 106 | } 107 | 108 | // Flush clears and returns the buffer. 109 | func (l *Lexer) Flush() []rune { 110 | out := l.Buffer 111 | l.Buffer = []rune{} 112 | return out 113 | } 114 | 115 | // FlushFromMark collects runes from mark and clears the buffer. 116 | func (l *Lexer) FlushFromMark() []rune { 117 | collected := l.CollectFromMark() 118 | l.Buffer = []rune{} 119 | return collected 120 | } 121 | 122 | // WalkToEnd steps until termination. 123 | func (l *Lexer) WalkToEnd() { 124 | for !l.Terminated { 125 | l.Step() 126 | } 127 | } 128 | 129 | // WalkUntil stops when target rune is found. 130 | func (l *Lexer) WalkUntil(target rune) bool { 131 | for !l.Terminated { 132 | l.Step() 133 | if l.Current == target { 134 | return true 135 | } 136 | } 137 | return false 138 | } 139 | 140 | // WalkBackUntil steps back until target rune is found. 141 | func (l *Lexer) WalkBackUntil(target rune) bool { 142 | for l.Pos > 0 { 143 | if l.Current == target { 144 | return true 145 | } 146 | l.StepBack() 147 | } 148 | return false 149 | } 150 | 151 | // Char returns the current rune as string. 152 | func (l *Lexer) Char() string { 153 | return string(l.Current) 154 | } 155 | 156 | // CharIs checks if current rune matches. 157 | func (l *Lexer) CharIs(char string) bool { 158 | return l.Char() == char 159 | } 160 | 161 | // Peek looks ahead or behind without moving. 162 | func (l *Lexer) Peek(offset int) rune { 163 | targetPos := l.Pos + offset 164 | if targetPos < 0 || targetPos >= len(l.Source) { 165 | return 0 166 | } 167 | return l.Source[targetPos] 168 | } 169 | 170 | // SkipWhiteSpace steps through space, tab, newline. 171 | func (l *Lexer) SkipWhiteSpace() { 172 | for !l.Terminated && (l.Current == ' ' || l.Current == '\t' || l.Current == '\n') { 173 | l.Step() 174 | } 175 | } 176 | 177 | // WalkUntilSkipQuotes skips quoted targets. 178 | func (l *Lexer) WalkUntilSkipQuotes(target rune) bool { 179 | inSingleQuote := false 180 | inDoubleQuote := false 181 | 182 | for !l.Terminated { 183 | if !inSingleQuote && !inDoubleQuote && l.Current == target { 184 | return true 185 | } 186 | if l.Current == '\'' && !inDoubleQuote { 187 | inSingleQuote = !inSingleQuote 188 | } else if l.Current == '"' && !inSingleQuote { 189 | inDoubleQuote = !inDoubleQuote 190 | } 191 | l.Step() 192 | } 193 | return false 194 | } 195 | 196 | // Count tracks current rune. 197 | func (l *Lexer) Count() { 198 | l.CharCounter[l.Current]++ 199 | } 200 | 201 | // GetCount returns count of a rune. 202 | func (l *Lexer) GetCount(r rune) int { 203 | return l.CharCounter[r] 204 | } 205 | 206 | // ResetCount clears all rune counts. 207 | func (l *Lexer) ResetCount() { 208 | l.CharCounter = make(map[rune]int) 209 | } 210 | 211 | // IsEscaped checks if current rune is escaped. 212 | func (l *Lexer) IsEscaped() bool { 213 | return l.Pos > 0 && l.Source[l.Pos-1] == '\\' 214 | } 215 | 216 | 217 | // updateSpent recalculates the runes from start to current Pos. 218 | func (l *Lexer) updateSpent() { 219 | if l.Pos == len(l.Source)-1 { 220 | l.Spent = l.Source 221 | return 222 | } 223 | if l.Pos >= 0 && l.Pos < len(l.Source) { 224 | l.Spent = l.Source[0 : l.Pos] 225 | } else if l.Pos >= len(l.Source) { 226 | l.Spent = l.Source 227 | } else { 228 | l.Spent = []rune{} 229 | } 230 | } 231 | 232 | func (l *Lexer) updateLineAndColumn() { 233 | l.Line = 1 234 | l.Column = 1 235 | for i := 0; i < l.Pos; i++ { 236 | if l.Source[i] == '\n' { 237 | l.Line++ 238 | l.Column = 1 239 | } else { 240 | l.Column++ 241 | } 242 | } 243 | } 244 | 245 | // SpentString returns the spent runes as a string. 246 | func (l *Lexer) SpentString() string { 247 | return string(l.Spent) 248 | } 249 | 250 | // StepBackToStart resets the lexer to the start of the source. 251 | func (l *Lexer) WalkBackToStart() { 252 | l.Pos = 0 253 | l.Terminated = len(l.Source) == 0 254 | if !l.Terminated { 255 | l.Current = l.Source[0] 256 | } 257 | l.updateSpent() 258 | l.updateLineAndColumn() 259 | } 260 | 261 | 262 | -------------------------------------------------------------------------------- /token/htmltoken.go: -------------------------------------------------------------------------------- 1 | package token 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | 7 | "github.com/phillip-england/gtml/lexer" 8 | "github.com/phillip-england/gtml/stur" 9 | ) 10 | 11 | type HtmlTokenType string 12 | 13 | const ( 14 | // EmptySpace HtmlTokenType = "EmptySpace" 15 | HtmlOpen HtmlTokenType = "HtmlOpen" 16 | HtmlClose HtmlTokenType = "HtmlClose" 17 | HtmlVoid HtmlTokenType = "HtmlVoid" 18 | Text HtmlTokenType = "Text" 19 | ) 20 | 21 | type HtmlToken struct { 22 | Lexeme string 23 | Type HtmlTokenType 24 | Line int 25 | Column int 26 | } 27 | 28 | // GetLexeme returns the string content of the token. 29 | func (tok HtmlToken) GetLexeme() string { 30 | return tok.Lexeme 31 | } 32 | 33 | // GetType returns the type of the token as a string. 34 | func (tok HtmlToken) GetType() HtmlTokenType { 35 | return tok.Type 36 | } 37 | 38 | func (tok HtmlToken) GetLine() int { 39 | return tok.Line 40 | } 41 | 42 | func (tok HtmlToken) GetColumn() int { 43 | return tok.Column 44 | } 45 | 46 | // TokenizeHtml tokenizes a slice of runes representing HTML input 47 | // into a list of tokens through two passes: raw token extraction 48 | // and structural classification (e.g., identifying void elements). 49 | func TokenizeHtml(input []rune) ([]Token, error) { 50 | err := validateTokenInput(input) 51 | toks, err := firstPass(input) 52 | if err != nil { 53 | return toks, err 54 | } 55 | toks, err = secondPass(toks) 56 | if err != nil { 57 | return toks, err 58 | } 59 | return toks, nil 60 | } 61 | 62 | func validateTokenInput(input []rune) error { 63 | return nil 64 | } 65 | 66 | // secondPass processes tokens from the first pass and determines if 67 | // HtmlOpen tokens are actually HtmlVoid (self-closing) by checking 68 | // for corresponding HtmlClose tags later in the sequence. 69 | func secondPass(toks []Token) ([]Token, error) { 70 | out := []Token{} 71 | for i, tok := range toks { 72 | if tok.GetType() != HtmlOpen { 73 | out = append(out, tok) 74 | continue 75 | } 76 | closingTag, _, err := GetClosingTag(tok, i, toks) 77 | if err != nil { 78 | return out, err 79 | } 80 | if closingTag == nil { 81 | out = append(out, HtmlToken{ 82 | Lexeme: tok.GetLexeme(), 83 | Type: HtmlVoid, 84 | Line: tok.GetLine(), 85 | Column: tok.GetColumn(), 86 | }) 87 | } else { 88 | out = append(out, tok) 89 | } 90 | } 91 | 92 | return out, nil 93 | } 94 | 95 | // GetClosingTag searches for the matching HtmlClose token corresponding 96 | // to the given HtmlOpen token, respecting nesting depth. 97 | // Returns nil if no matching closing tag is found. 98 | func GetClosingTag(tok Token, i int, toks []Token) (Token, int, error) { 99 | if tok.GetType() == HtmlVoid { 100 | return nil, -1, nil 101 | } 102 | if tok.GetType() != HtmlOpen { 103 | return tok, -1, fmt.Errorf(`attempted to extract the closing tag from an invalid token: %s`, tok.GetLexeme()) 104 | } 105 | name := GetTagName(tok) 106 | found := 1 107 | for i1, tok1 := range toks { 108 | if i1 <= i { 109 | continue 110 | } 111 | if tok1.GetType() != HtmlOpen && tok1.GetType() != HtmlClose { 112 | continue 113 | } 114 | name1 := GetTagName(tok1) 115 | if name != name1 { 116 | continue 117 | } 118 | if tok1.GetType() == HtmlOpen { 119 | found += 1 120 | } 121 | if tok1.GetType() == HtmlClose { 122 | found -= 1 123 | } 124 | if found == 0 && tok1.GetType() == HtmlClose { 125 | return tok1, i1, nil 126 | } 127 | } 128 | // failed to find a matching closing tag 129 | return nil, -1, nil 130 | } 131 | 132 | func IsSelfContained(toks []Token) (bool, error) { 133 | innerToks, err := ShedOuterHtml(toks) 134 | if err != nil { 135 | return false, err 136 | } 137 | if len(innerToks) == len(toks) { 138 | return false, nil 139 | } 140 | return true, nil 141 | } 142 | 143 | // Extracts the full element from a slice of tokens 144 | // if a full element cannot be derived from the token set 145 | // an empty string will be returned 146 | func ExtractFullElement(tok Token, i int, toks []Token) (string, error) { 147 | if len(toks) == 0 { 148 | return "", nil 149 | } 150 | firstTok := toks[0] 151 | _, endI, err := GetClosingTag(firstTok, i, toks) 152 | if err != nil { 153 | return "", err 154 | } 155 | if endI > len(toks)-1 { 156 | return "", fmt.Errorf(`GetClosingTag resulted in an endI index which is out of bounds`) 157 | } 158 | subSlice := toks[0 : endI+1] 159 | out := Construct(subSlice) 160 | return out, nil 161 | } 162 | 163 | func ShedOuterHtml(toks []Token) ([]Token, error) { 164 | out := []Token{} 165 | if len(toks) == 0 { 166 | return toks, nil 167 | } 168 | firstTok := toks[0] 169 | if firstTok.GetType() == HtmlClose { 170 | return out, fmt.Errorf("you cannot shed the outerhtml of an html closing tag: %s", firstTok.GetType()) 171 | } 172 | _, closeTagIndex, err := GetClosingTag(toks[0], 0, toks) 173 | if err != nil { 174 | return out, err 175 | } 176 | if closeTagIndex == len(toks)-1 { 177 | toks = toks[1:] 178 | toks = toks[:len(toks)-1] 179 | return toks, nil 180 | } else { 181 | return toks, nil 182 | } 183 | } 184 | 185 | // GetTagName extracts the tag name from an HtmlOpen or HtmlClose token's lexeme. 186 | // Strips angle brackets, slashes, and attributes, returning just the tag name. 187 | func GetTagName(tok Token) string { 188 | if HtmlTokenType(tok.GetType()) != HtmlOpen && HtmlTokenType(tok.GetType()) != HtmlClose && HtmlTokenType(tok.GetType()) != HtmlVoid { 189 | return "" 190 | } 191 | s := tok.GetLexeme() 192 | s = strings.Replace(s, "<", "", 1) 193 | s = stur.ReplaceLast(s, '>', "") 194 | s = stur.ReplaceLast(s, '/', "") 195 | parts := strings.Split(s, " ") 196 | for _, part := range parts { 197 | sq := stur.Squeeze(part) 198 | if sq != "" { 199 | return part 200 | } 201 | } 202 | return "" 203 | } 204 | 205 | // firstPass performs an initial walk over the input runes and splits the input 206 | // into basic tokens: HtmlOpen, HtmlClose, Text, and EmptySpace. 207 | // It uses the lexer to handle quote-skipping and whitespace correctly. 208 | func firstPass(input []rune) ([]Token, error) { 209 | toks := []Token{} 210 | l := lexer.NewLexer(input) 211 | for { 212 | if l.Terminated { 213 | break 214 | } 215 | // avoid capturing the leading '<' as text 216 | if l.Pos != 0 { 217 | l.Mark() 218 | l.WalkUntilSkipQuotes('<') 219 | l.StepBack() 220 | buf := string(l.FlushFromMark()) 221 | if len(stur.Squeeze(buf)) == 0 { 222 | // OPTING OUT OF COLLECTING EMPTY SPACE 223 | // toks = append(toks, HtmlToken{ 224 | // Lexeme: buf, 225 | // Type: EmptySpace, 226 | // Line: l.Line, 227 | // Column: l.Column, 228 | // }) 229 | } else { 230 | toks = append(toks, HtmlToken{ 231 | Lexeme: buf, 232 | Type: Text, 233 | Line: l.Line, 234 | Column: l.Column - len(buf), 235 | }) 236 | } 237 | l.Step() 238 | } 239 | if l.CharIs("<") { 240 | l.Mark() 241 | found := l.WalkUntilSkipQuotes('>') 242 | if !found { 243 | return toks, fmt.Errorf(`SYNTAX ERROR: failed to close html element: %s`, string(input)) 244 | } 245 | buf := string(l.FlushFromMark()) 246 | sq := stur.Squeeze(buf) 247 | if len(sq) > 2 && sq[1] == '/' { 248 | toks = append(toks, HtmlToken{ 249 | Lexeme: buf, 250 | Type: HtmlClose, 251 | Line: l.Line, 252 | Column: l.Column - len(buf), 253 | }) 254 | } else { 255 | toks = append(toks, HtmlToken{ 256 | Lexeme: buf, 257 | Type: HtmlOpen, 258 | Line: l.Line, 259 | Column: l.Column - len(buf), 260 | }) 261 | } 262 | l.Step() 263 | continue 264 | } 265 | l.Step() 266 | } 267 | return toks, nil 268 | } 269 | --------------------------------------------------------------------------------