├── .gitignore ├── lua └── libraryUtil.lua ├── main.go ├── readme.md ├── static ├── favicon.png ├── gopher-front.svg └── style.css ├── templates.go ├── templates ├── article.html ├── base.html ├── error.html └── source.html └── wikitext ├── debug.go ├── pegTokenizer.pegjs ├── rules_test.go ├── tokens.go ├── tokens_test.go ├── url.go ├── wikitext.go ├── wikitext.peg ├── wikitext.peg.go └── wikitext_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *.bleve 2 | -------------------------------------------------------------------------------- /lua/libraryUtil.lua: -------------------------------------------------------------------------------- 1 | local libraryUtil = {} 2 | 3 | function libraryUtil.checkType( name, argIdx, arg, expectType, nilOk ) 4 | if arg == nil and nilOk then 5 | return 6 | end 7 | if type( arg ) ~= expectType then 8 | local msg = string.format( "bad argument #%d to '%s' (%s expected, got %s)", 9 | argIdx, name, expectType, type( arg ) 10 | ) 11 | error( msg, 3 ) 12 | end 13 | end 14 | 15 | function libraryUtil.checkTypeForIndex( index, value, expectType ) 16 | if type( value ) ~= expectType then 17 | local msg = string.format( "value for index '%s' must be %s, %s given", 18 | index, expectType, type( value ) 19 | ) 20 | error( msg, 3 ) 21 | end 22 | end 23 | 24 | function libraryUtil.makeCheckSelfFunction( libraryName, varName, selfObj, selfObjDesc ) 25 | return function ( self, method ) 26 | if self ~= selfObj then 27 | error( string.format( 28 | "%s: invalid %s. Did you call %s with a dot instead of a colon, i.e. " .. 
29 | "%s.%s() instead of %s:%s()?", 30 | libraryName, selfObjDesc, method, varName, method, varName, method 31 | ), 3 ) 32 | end 33 | end 34 | end 35 | 36 | return libraryUtil 37 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "compress/bzip2" 6 | "encoding/xml" 7 | "flag" 8 | "fmt" 9 | "html/template" 10 | "io" 11 | "log" 12 | "net/http" 13 | _ "net/http/pprof" 14 | "os" 15 | "path" 16 | "path/filepath" 17 | "strconv" 18 | "strings" 19 | "sync" 20 | 21 | "github.com/blevesearch/bleve" 22 | "github.com/creachadair/cityhash" 23 | pbzip2 "github.com/d4l3k/go-pbzip2" 24 | "github.com/d4l3k/wikigopher/wikitext" 25 | "github.com/pkg/errors" 26 | ) 27 | 28 | var ( 29 | indexFile = flag.String("index", "enwiki-latest-pages-articles-multistream-index.txt.bz2", "the index file to load") 30 | articlesFile = flag.String("articles", "enwiki-latest-pages-articles-multistream.xml.bz2", "the article dump file to load") 31 | search = flag.Bool("search", false, "whether or not to build a search index") 32 | searchIndexFile = flag.String("searchIndex", "index.bleve", "the search index file") 33 | httpAddr = flag.String("http", ":8080", "the address to bind HTTP to") 34 | ) 35 | 36 | var tmpls = map[string]*template.Template{} 37 | 38 | func loadTemplates() error { 39 | files, err := filepath.Glob("templates/*") 40 | if err != nil { 41 | return err 42 | } 43 | for _, file := range files { 44 | name := filepath.Base(file) 45 | tmpls[name], err = template.ParseFiles("templates/base.html", file) 46 | if err != nil { 47 | return err 48 | } 49 | } 50 | return nil 51 | } 52 | 53 | func executeTemplate(wr io.Writer, name string, data interface{}) error { 54 | return tmpls[name].ExecuteTemplate(wr, "base", data) 55 | } 56 | 57 | type indexEntry struct { 58 | id, seek int 59 | } 60 | 61 | var mu = struct { 62 | 
sync.Mutex 63 | 64 | offsets map[uint64]indexEntry 65 | offsetSize map[int]int 66 | }{ 67 | offsets: map[uint64]indexEntry{}, 68 | offsetSize: map[int]int{}, 69 | } 70 | var index bleve.Index 71 | 72 | func loadIndex() error { 73 | mapping := bleve.NewIndexMapping() 74 | os.RemoveAll(*searchIndexFile) 75 | var err error 76 | index, err = bleve.New(*searchIndexFile, mapping) 77 | if err != nil { 78 | return err 79 | } 80 | f, err := os.Open(*indexFile) 81 | if err != nil { 82 | return err 83 | } 84 | defer f.Close() 85 | r, err := pbzip2.NewReader(f) 86 | if err != nil { 87 | return err 88 | } 89 | defer r.Close() 90 | 91 | scanner := bufio.NewScanner(r) 92 | 93 | log.Printf("Reading index file...") 94 | i := 0 95 | for scanner.Scan() { 96 | parts := strings.Split(scanner.Text(), ":") 97 | if len(parts) < 3 { 98 | return errors.Errorf("expected at least 3 parts, got: %#v", parts) 99 | } 100 | seek, err := strconv.Atoi(parts[0]) 101 | if err != nil { 102 | return err 103 | } 104 | id, err := strconv.Atoi(parts[1]) 105 | if err != nil { 106 | return err 107 | } 108 | title := strings.Join(parts[2:], ":") 109 | entry := indexEntry{ 110 | id: id, 111 | seek: seek, 112 | } 113 | titleHash := cityhash.Hash64([]byte(title)) 114 | 115 | mu.Lock() 116 | mu.offsets[titleHash] = entry 117 | mu.offsetSize[entry.seek]++ 118 | mu.Unlock() 119 | 120 | i++ 121 | if i%100000 == 0 { 122 | log.Printf("read %d entries", i) 123 | } 124 | } 125 | if err := scanner.Err(); err != nil { 126 | return err 127 | } 128 | log.Printf("Done reading!") 129 | 130 | if !*search { 131 | return nil 132 | } 133 | 134 | /* 135 | log.Printf("Indexing titles...") 136 | i = 0 137 | batch := index.NewBatch() 138 | 139 | mu.Lock() 140 | defer mu.Unlock() 141 | 142 | for key, entry := range mu.offsets { 143 | mu.Unlock() 144 | 145 | if err := batch.Index(key, entry); err != nil { 146 | mu.Lock() 147 | return err 148 | } 149 | i++ 150 | if i%100000 == 0 { 151 | if err := index.Batch(batch); err != nil { 152 | 
mu.Lock() 153 | return err 154 | } 155 | batch.Reset() 156 | log.Printf("indexed %d entries", i) 157 | } 158 | 159 | mu.Lock() 160 | } 161 | 162 | log.Printf("Done indexing!") 163 | */ 164 | 165 | return nil 166 | } 167 | 168 | /* 169 | Example: 170 | 171 | AccessibleComputing 172 | 0 173 | 10 174 | 175 | 176 | 834079434 177 | 767284433 178 | 2018-04-03T20:38:02Z 179 | 180 | امیر اعوانی 181 | 8214454 182 | 183 | 184 | wikitext 185 | text/x-wiki 186 | #REDIRECT [[Computer accessibility]] 187 | 188 | {{Redirect category shell}} 189 | {{R from move}} 190 | {{R from CamelCase}} 191 | {{R unprintworthy}} 192 | qdiw0cwardl0qpkyeutu3pd77fwym8y 193 | 194 | 195 | */ 196 | 197 | type redirect struct { 198 | Title string `xml:"title,attr"` 199 | } 200 | 201 | type page struct { 202 | XMLName xml.Name `xml:"page"` 203 | Title string `xml:"title"` 204 | NS int `xml:"ns"` 205 | ID int `xml:"id"` 206 | Redirect []redirect `xml:"redirect"` 207 | RevisionID string `xml:"revision>id"` 208 | Timestamp string `xml:"revision>timestamp"` 209 | Username string `xml:"revision>contributor>username"` 210 | UserID string `xml:"revision>contributor>id"` 211 | Model string `xml:"revision>model"` 212 | Format string `xml:"revision>format"` 213 | Text string `xml:"revision>text"` 214 | } 215 | 216 | func readArticle(meta indexEntry) (page, error) { 217 | f, err := os.Open(*articlesFile) 218 | if err != nil { 219 | return page{}, err 220 | } 221 | defer f.Close() 222 | 223 | mu.Lock() 224 | maxTries := mu.offsetSize[meta.seek] 225 | mu.Unlock() 226 | 227 | r := bzip2.NewReader(f) 228 | 229 | if _, err := f.Seek(int64(meta.seek), 0); err != nil { 230 | return page{}, err 231 | } 232 | 233 | d := xml.NewDecoder(r) 234 | 235 | var p page 236 | for i := 0; i < maxTries; i++ { 237 | if err := d.Decode(&p); err != nil { 238 | return page{}, err 239 | } 240 | if p.ID == meta.id { 241 | return p, nil 242 | } 243 | } 244 | 245 | return page{}, errors.Errorf("failed to find page after %d tries", maxTries) 
246 | } 247 | 248 | func fetchArticle(name string) (indexEntry, error) { 249 | mu.Lock() 250 | defer mu.Unlock() 251 | 252 | articleMeta, ok := mu.offsets[cityhash.Hash64([]byte(name))] 253 | if ok { 254 | return articleMeta, nil 255 | } 256 | articleMeta, ok = mu.offsets[cityhash.Hash64([]byte(strings.Title(strings.ToLower(name))))] 257 | if ok { 258 | return articleMeta, nil 259 | } 260 | return indexEntry{}, statusErrorf(http.StatusNotFound, "article not found: %q", name) 261 | } 262 | 263 | func randomArticleHash() (uint64, error) { 264 | mu.Lock() 265 | defer mu.Unlock() 266 | 267 | for hash := range mu.offsets { 268 | return hash, nil 269 | } 270 | return 0, errors.Errorf("no articles") 271 | } 272 | 273 | func randomArticle() (page, error) { 274 | hash, err := randomArticleHash() 275 | if err != nil { 276 | return page{}, err 277 | } 278 | 279 | mu.Lock() 280 | meta := mu.offsets[hash] 281 | mu.Unlock() 282 | 283 | return readArticle(meta) 284 | } 285 | 286 | type statusError int 287 | 288 | func (s statusError) Error() string { 289 | return fmt.Sprintf("%d - %s", int(s), http.StatusText(int(s))) 290 | } 291 | 292 | func statusErrorf(code int, str string, args ...interface{}) error { 293 | return errors.Wrapf(statusError(code), str, args...) 
294 | } 295 | 296 | func errorHandler(f func(w http.ResponseWriter, r *http.Request) error) http.HandlerFunc { 297 | return func(w http.ResponseWriter, r *http.Request) { 298 | if err := f(w, r); err != nil { 299 | cause := errors.Cause(err) 300 | status := http.StatusInternalServerError 301 | if cause, ok := cause.(statusError); ok { 302 | status = int(cause) 303 | } 304 | if err := executeTemplate(w, "error.html", struct { 305 | Title, Error string 306 | }{ 307 | Title: err.Error(), 308 | Error: fmt.Sprintf("%+v", err), 309 | }); err != nil { 310 | http.Error(w, err.Error(), http.StatusInternalServerError) 311 | return 312 | } 313 | w.WriteHeader(status) 314 | } 315 | } 316 | 317 | } 318 | 319 | func handleArticle(w http.ResponseWriter, r *http.Request) error { 320 | articleName := wikitext.URLToTitle(path.Base(r.URL.Path)) 321 | 322 | if articleName == "Special:Random" { 323 | article, err := randomArticle() 324 | if err != nil { 325 | return err 326 | } 327 | http.Redirect(w, r, path.Join("/wiki/", wikitext.TitleToURL(article.Title)), http.StatusTemporaryRedirect) 328 | return nil 329 | } 330 | 331 | articleMeta, err := fetchArticle(articleName) 332 | if err != nil { 333 | return err 334 | } 335 | 336 | p, err := readArticle(articleMeta) 337 | if err != nil { 338 | return err 339 | } 340 | 341 | if p.Title != articleName { 342 | http.Redirect(w, r, path.Join("/wiki/", wikitext.TitleToURL(p.Title)), http.StatusTemporaryRedirect) 343 | return nil 344 | } 345 | 346 | body, err := wikitext.Convert( 347 | []byte(p.Text), 348 | wikitext.TemplateHandler(p.templateHandler), 349 | ) 350 | if err != nil { 351 | return err 352 | } 353 | if err := executeTemplate(w, "article.html", struct { 354 | Title string 355 | Body template.HTML 356 | }{ 357 | Title: articleName, 358 | Body: template.HTML(body), 359 | }); err != nil { 360 | return err 361 | } 362 | return nil 363 | } 364 | 365 | func handleSource(w http.ResponseWriter, r *http.Request) error { 366 | articleName := 
wikitext.URLToTitle(path.Base(r.URL.Path)) 367 | 368 | articleMeta, err := fetchArticle(articleName) 369 | if err != nil { 370 | return err 371 | } 372 | p, err := readArticle(articleMeta) 373 | if err != nil { 374 | return err 375 | } 376 | return executeTemplate(w, "source.html", p) 377 | } 378 | 379 | func handleIndex(w http.ResponseWriter, r *http.Request) error { 380 | http.Redirect(w, r, "/wiki/Main_Page", http.StatusTemporaryRedirect) 381 | return nil 382 | } 383 | 384 | func main() { 385 | if err := run(); err != nil { 386 | log.Fatalf("%+v", err) 387 | } 388 | } 389 | 390 | func run() error { 391 | flag.Parse() 392 | log.SetFlags(log.Flags() | log.Lshortfile) 393 | 394 | go func() { 395 | if err := loadIndex(); err != nil { 396 | log.Fatalf("%+v", err) 397 | } 398 | }() 399 | 400 | if err := loadTemplates(); err != nil { 401 | return err 402 | } 403 | 404 | http.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("./static")))) 405 | http.HandleFunc("/source/", errorHandler(handleSource)) 406 | http.HandleFunc("/wiki/", errorHandler(handleArticle)) 407 | http.HandleFunc("/", errorHandler(handleIndex)) 408 | 409 | log.Printf("Listening on %s...", *httpAddr) 410 | return http.ListenAndServe(*httpAddr, nil) 411 | } 412 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # wikigopher 2 | 3 | A fully self contained server that can read Wikipedia database dumps and display 4 | them. It also contains a wikitext -> html converter. 
5 | 6 | ## Install 7 | 8 | ``` 9 | $ go get -u github.com/d4l3k/wikigopher 10 | ``` 11 | 12 | ## Download Wikipedia Database Dumps 13 | 14 | You need to download the multistream article dumps 15 | 16 | * enwiki-latest-pages-articles-multistream-index.txt.bz2 17 | * enwiki-latest-pages-articles-multistream.xml.bz2 18 | 19 | from https://dumps.wikimedia.org/enwiki/latest/ 20 | 21 | You'll need to place these in the wikigopher directory or specify their location 22 | with `-index=....txt.bz2 -articles=....xml.bz2`. 23 | 24 | The multistream variants are required. The index file is a mapping between 25 | article titles and their locations in the multistream xml file. 26 | 27 | More information can be found at https://en.wikipedia.org/wiki/Wikipedia:Database_download#Where_do_I_get_it? 28 | 29 | ## License 30 | 31 | wikigopher is licensed under the MIT license. 32 | 33 | ## Attributions 34 | 35 | The gopher image used was created by Takuya Ueda (https://twitter.com/tenntenn). Licensed under the Creative Commons 3.0 Attribution license. 36 | 37 | Some CSS styles have been borrowed from MediaWiki.
38 | -------------------------------------------------------------------------------- /static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d4l3k/wikigopher/95ca9e7b979357263800dcabc8462c8ce8ef5ee6/static/favicon.png -------------------------------------------------------------------------------- /static/gopher-front.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /static/style.css: -------------------------------------------------------------------------------- 1 | html { 2 | font-size: 100%; 3 | } 4 | 5 | body { 6 | font-family: sans-serif; 7 | color: #222222; 8 | line-height: 1.6; 9 | display: flex; 10 | margin: 0; 11 | padding: 0; 12 | background-color: #f6f6f6; 13 | } 14 | 15 | nav { 16 | margin: 1.6em; 17 | } 18 | 19 | pre { 20 | white-space: pre-wrap; 21 | } 22 | 23 | h1, h2, h3, h4, h5, h6 { 24 | color: #000; 25 | background: none; 26 | font-weight: normal; 27 | margin: 0; 28 | margin-bottom: 0px; 29 | overflow: hidden; 30 | padding-top: 0.5em; 31 | padding-bottom: 0.17em; 32 | border-bottom: 1px solid #a2a9b1; 33 | } 34 | 35 | h1, h2 { 36 | font-family: 'Linux Libertine','Georgia','Times',serif; 37 | line-height: 1.3; 38 | margin-bottom: 0.25em; 39 | padding: 0; 40 | } 41 | 42 | h3, h4, h5, h6 { 43 | border-bottom: 0; 44 | font-weight: bold; 45 | } 46 | 47 | h1 { 48 | font-size: 1.8em; 49 | } 50 | 51 | h2 { 52 | margin-top: 1em; 53 | font-size: 1.5em; 54 | } 55 | 56 | h3 { 57 | font-size: 1.2em; 58 | } 59 | 60 | p { 61 | line-height: inherit; 62 | margin: 0.5em 0; 63 | } 64 | 65 | .content { 66 | margin: 2em 0; 67 | border: 1px solid #a7d7f9; 68 | border-right: none; 69 | background-color: white; 70 | padding: 1.25em 
1.5em 1.5em 1.5em; 71 | flex-grow: 10000; 72 | } 73 | 74 | .content-nav { 75 | font-size: 0.5em; 76 | font-family: sans-serif; 77 | float: right; 78 | } 79 | 80 | .body { 81 | font-size: 0.875em; 82 | position: relative; 83 | } 84 | 85 | a.external::after { 86 | content: " 🔗"; 87 | text-decoration: none; 88 | display: inline-block; 89 | margin-left: 2px; 90 | position: relative; 91 | bottom: 4px; 92 | font-size: 0.9em; 93 | } 94 | 95 | .image { 96 | border: 1px solid #c8ccd1; 97 | padding: 3px; 98 | background-color: #f8f9fa; 99 | font-size: 94%; 100 | text-align: center; 101 | overflow: hidden; 102 | width: 300px; 103 | margin: 0.5em 0 1.3em 1.4em; 104 | float: right; 105 | } 106 | 107 | a { 108 | text-decoration: none; 109 | color: #0645ad; 110 | background: none; 111 | } 112 | 113 | a:visited { 114 | color:#0b0080; 115 | } 116 | 117 | a:active { 118 | color:#faa700; 119 | } 120 | 121 | a:hover, a:focus { 122 | text-decoration: underline; 123 | } 124 | 125 | .image .caption { 126 | text-align: left; 127 | } 128 | 129 | .brand, .brand:hover, .brand:focus, .brand:active, .brand:visited { 130 | text-decoration: none; 131 | font-family: monospace; 132 | font-size: 1.5em; 133 | color: inherit; 134 | padding-bottom: 1.5em; 135 | display: block; 136 | text-align: center; 137 | } 138 | 139 | nav > * { 140 | font-size: 0.875em; 141 | } 142 | 143 | nav > a { 144 | display: block; 145 | } 146 | 147 | ref { 148 | vertical-align: super; 149 | font-size: smaller; 150 | } 151 | 152 | ref::before { 153 | content:"["; 154 | } 155 | 156 | ref::after { 157 | content:"]"; 158 | } 159 | -------------------------------------------------------------------------------- /templates.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "log" 6 | "path" 7 | "strconv" 8 | "strings" 9 | 10 | lua "github.com/Shopify/go-lua" 11 | "github.com/d4l3k/wikigopher/wikitext" 12 | "github.com/davecgh/go-spew/spew" 13 | 
"github.com/pkg/errors" 14 | ) 15 | 16 | var templateFuncs = map[string]func(attrs []wikitext.Attribute) (interface{}, error){ 17 | "ifeq": func(attrs []wikitext.Attribute) (interface{}, error) { 18 | if len(attrs) < 3 || len(attrs) > 4 { 19 | return nil, errors.Errorf("must have 3 or 4 arguments to #ifeq, got %d", len(attrs)) 20 | } 21 | 22 | var trueVal interface{} 23 | var falseVal interface{} 24 | if len(attrs) >= 3 { 25 | trueVal = attrs[2].Key 26 | } 27 | if len(attrs) == 4 { 28 | falseVal = attrs[3].Key 29 | } 30 | 31 | a := wikitext.Concat(attrs[0].Key) 32 | b := wikitext.Concat(attrs[1].Key) 33 | aVal, err := strconv.ParseFloat(a, 64) 34 | if err == nil { 35 | bVal, err := strconv.ParseFloat(b, 64) 36 | if err == nil { 37 | if aVal == bVal { 38 | return trueVal, nil 39 | } 40 | return falseVal, nil 41 | } 42 | } 43 | 44 | if a == b { 45 | return trueVal, nil 46 | } 47 | return falseVal, nil 48 | }, 49 | 50 | "if": func(attrs []wikitext.Attribute) (interface{}, error) { 51 | if len(attrs) < 2 || len(attrs) > 3 { 52 | return nil, errors.Errorf("must have 2 or 3 arguments to #if, got %d", len(attrs)) 53 | } 54 | 55 | a := strings.TrimSpace(wikitext.Concat(attrs[0].Key)) 56 | if len(a) > 0 { 57 | return attrs[1].Key, nil 58 | } 59 | if len(attrs) > 2 { 60 | return attrs[2].Key, nil 61 | } 62 | return nil, nil 63 | }, 64 | 65 | "invoke": func(attrs []wikitext.Attribute) (interface{}, error) { 66 | if len(attrs) < 1 { 67 | return nil, errors.Errorf("must have at least one attribute") 68 | } 69 | 70 | module, err := loadModule(wikitext.Concat(attrs[0])) 71 | if err != nil { 72 | return nil, err 73 | } 74 | methodName := wikitext.Concat(attrs[1]) 75 | 76 | l := lua.NewState() 77 | 78 | lua.OpenLibraries(l) 79 | /* 80 | lua.BaseOpen(l) 81 | lua.StringOpen(l) 82 | lua.MathOpen(l) 83 | lua.TableOpen(l) 84 | lua.Bit32Open(l) 85 | */ 86 | 87 | l.Global("require") 88 | l.SetGlobal("oldRequire") 89 | 90 | l.Register("require", func(l *lua.State) int { 91 | moduleName := 
lua.CheckString(l, 0) 92 | log.Printf("require called! %q", moduleName) 93 | 94 | if moduleName == "libraryUtil" { 95 | if err := lua.DoFile(l, path.Join("lua", moduleName+".lua")); err != nil { 96 | lua.Errorf(l, errors.Wrapf(err, "executing module %q", moduleName).Error()) 97 | return 0 98 | } 99 | return lua.MultipleReturns 100 | } else if strings.HasPrefix(moduleName, "Module:") { 101 | body, err := articleBody(moduleName) 102 | if err != nil { 103 | lua.Errorf(l, errors.Wrapf(err, "loading module %q", moduleName).Error()) 104 | } 105 | if err := lua.DoString(l, body); err != nil { 106 | lua.Errorf(l, errors.Wrapf(err, "executing module %q", moduleName).Error()) 107 | return 0 108 | } 109 | spew.Dump(l.ToValue(0)) 110 | spew.Dump(l.ToValue(-1)) 111 | return lua.MultipleReturns 112 | } 113 | 114 | l.Global("oldRequire") 115 | l.PushString(moduleName) 116 | l.Call(1, 1) 117 | return 1 118 | }) 119 | if err := lua.DoString(l, module); err != nil { 120 | return nil, errors.Wrapf(err, "DoString") 121 | } 122 | log.Printf("module loaded") 123 | l.Field(-1, methodName) 124 | l.PushString("args") 125 | if err := l.ProtectedCall(1, 1, 0); err != nil { 126 | return nil, errors.Wrapf(err, "calling %q", methodName) 127 | } 128 | return lua.CheckString(l, 0), nil 129 | }, 130 | } 131 | 132 | func loadModule(name string) (string, error) { 133 | name = "Module:" + name 134 | return articleBody(name) 135 | } 136 | 137 | func stripComments(code string) (string, error) { 138 | scanner := bufio.NewScanner(strings.NewReader(code)) 139 | var b strings.Builder 140 | for scanner.Scan() { 141 | line := scanner.Text() 142 | if strings.HasPrefix(line, "--") { 143 | continue 144 | } 145 | b.WriteString(line) 146 | b.WriteRune('\n') 147 | } 148 | if err := scanner.Err(); err != nil { 149 | return "", err 150 | } 151 | return b.String(), nil 152 | } 153 | 154 | func articleBody(name string) (string, error) { 155 | articleMeta, err := fetchArticle(name) 156 | if err != nil { 157 | return 
"", err 158 | } 159 | p, err := readArticle(articleMeta) 160 | if err != nil { 161 | return "", err 162 | } 163 | return p.Text, nil 164 | } 165 | 166 | func templateFuncHandler(name string, attrs []wikitext.Attribute) (interface{}, error) { 167 | f, ok := templateFuncs[name] 168 | if ok { 169 | v, err := f(attrs) 170 | if err != nil { 171 | log.Printf("Error executing func %q: %+v", name, err) 172 | return nil, err 173 | } 174 | return v, nil 175 | } 176 | return nil, errors.Errorf("unknown func: %q, args: %v", name, attrs) 177 | } 178 | 179 | func (p page) templateHandler(name string, attrs []wikitext.Attribute) (interface{}, error) { 180 | if name == "NAMESPACE" { 181 | parts := strings.Split(p.Title, ":") 182 | if len(parts) > 1 { 183 | return parts[0], nil 184 | } 185 | return nil, nil 186 | 187 | } else if name == "NUMBEROFARTICLES" { 188 | mu.Lock() 189 | defer mu.Unlock() 190 | 191 | return len(mu.offsets), nil 192 | 193 | } else if strings.HasPrefix(name, "#") { 194 | parts := strings.SplitN(name, ":", 2) 195 | if len(parts) > 1 { 196 | attrs = append([]wikitext.Attribute{ 197 | {Key: parts[1]}, 198 | }, attrs...) 
199 | } 200 | return templateFuncHandler(parts[0][1:], attrs) 201 | } 202 | 203 | /* 204 | templateBody, err := articleBody("Template:" + name) 205 | if err != nil { 206 | return nil, errors.Wrapf(err, "unknown template: %q, args: %v", name, attrs) 207 | } 208 | 209 | body, err := wikitext.Convert( 210 | []byte(templateBody), 211 | wikitext.TemplateHandler(p.templateHandler), 212 | ) 213 | if err != nil { 214 | return nil, err 215 | } 216 | doc, err := html.Parse(bytes.NewReader(body)) 217 | if err != nil { 218 | return nil, err 219 | } 220 | 221 | return doc, nil 222 | */ 223 | 224 | return nil, errors.Errorf("unknown template: %q, args: %v", name, attrs) 225 | } 226 | -------------------------------------------------------------------------------- /templates/article.html: -------------------------------------------------------------------------------- 1 | {{define "nav"}} 2 | Source 3 | {{end}} 4 | 5 | {{define "content"}} 6 | {{.Body}} 7 | {{end}} 8 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | {{ define "base" }} 2 | 3 | 4 | {{block "title" .}}{{.Title}}{{end}} - wikigopher 5 | 6 | 7 | 8 | 9 | 19 |
20 |

21 | {{block "title" .}} {{end}} 22 | 23 | {{block "nav" .}}{{end}} 24 | 25 |

26 | 27 |
28 | {{template "content" .}} 29 |
30 |
31 | 32 | 33 | {{ end }} 34 | -------------------------------------------------------------------------------- /templates/error.html: -------------------------------------------------------------------------------- 1 | {{define "title"}}Error: {{.Title}}{{end}} 2 | 3 | {{define "content"}} 4 |
{{.Error}}
5 | {{end}} 6 | -------------------------------------------------------------------------------- /templates/source.html: -------------------------------------------------------------------------------- 1 | {{define "title"}}Source: {{.Title}}{{end}} 2 | 3 | {{define "nav"}} 4 | Article 5 | {{end}} 6 | 7 | 8 | {{define "content"}} 9 |
{{.Text}}
10 | {{end}} 11 | -------------------------------------------------------------------------------- /wikitext/debug.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "log" 5 | "reflect" 6 | "runtime" 7 | ) 8 | 9 | func debugRules(compute bool) { 10 | for _, rule := range g.rules { 11 | debugExpr(rule.expr, compute) 12 | } 13 | } 14 | 15 | func debugExpr(e interface{}, compute bool) { 16 | switch e := e.(type) { 17 | case *actionExpr: 18 | oldRun := e.run 19 | name := getFunctionName(e.run) 20 | e.run = func(p *parser) (interface{}, error) { 21 | log.Printf("run %q", name) 22 | stack := p.vstack[len(p.vstack)-1] 23 | r := debugRun{ 24 | Name: name, 25 | Stack: stack, 26 | Text: string(p.cur.text), 27 | } 28 | if compute { 29 | p.vstack[len(p.vstack)-1] = shuckStack(stack) 30 | val, err := oldRun(p) 31 | if err != nil { 32 | return nil, err 33 | } 34 | p.vstack[len(p.vstack)-1] = stack 35 | r.Value = val 36 | } 37 | 38 | return r, nil 39 | } 40 | debugExpr(e.expr, compute) 41 | 42 | case *labeledExpr: 43 | debugExpr(e.expr, compute) 44 | 45 | case *expr: 46 | debugExpr(e.expr, compute) 47 | 48 | case *andExpr: 49 | debugExpr(e.expr, compute) 50 | 51 | case *notExpr: 52 | debugExpr(e.expr, compute) 53 | 54 | case *zeroOrOneExpr: 55 | debugExpr(e.expr, compute) 56 | 57 | case *zeroOrMoreExpr: 58 | debugExpr(e.expr, compute) 59 | 60 | case *oneOrMoreExpr: 61 | debugExpr(e.expr, compute) 62 | 63 | case *seqExpr: 64 | for _, e := range e.exprs { 65 | debugExpr(e, compute) 66 | } 67 | 68 | case *choiceExpr: 69 | for _, e := range e.alternatives { 70 | debugExpr(e, compute) 71 | } 72 | 73 | case *ruleRefExpr, *litMatcher, *andCodeExpr, *charClassMatcher, *anyMatcher, *notCodeExpr, *stateCodeExpr: 74 | 75 | default: 76 | log.Fatalf("debugExpr: unsupported type %T: %#v", e, e) 77 | } 78 | } 79 | 80 | // from https://stackoverflow.com/questions/7052693/how-to-get-the-name-of-a-function-in-go 81 | func 
getFunctionName(i interface{}) string { 82 | return runtime.FuncForPC(reflect.ValueOf(i).Pointer()).Name() 83 | } 84 | 85 | type debugRun struct { 86 | Name string 87 | Stack map[string]interface{} 88 | Text string 89 | Value interface{} 90 | } 91 | 92 | func shuck(v interface{}) interface{} { 93 | switch v := v.(type) { 94 | case debugRun: 95 | return v.Value 96 | 97 | case []interface{}: 98 | return shuckArr(v) 99 | 100 | default: 101 | return v 102 | } 103 | } 104 | 105 | func shuckArr(arr []interface{}) []interface{} { 106 | var out []interface{} 107 | for _, val := range arr { 108 | out = append(out, shuck(val)) 109 | } 110 | return out 111 | } 112 | 113 | func shuckStack(stack map[string]interface{}) map[string]interface{} { 114 | out := map[string]interface{}{} 115 | for k, v := range stack { 116 | out[k] = shuck(v) 117 | } 118 | return out 119 | } 120 | -------------------------------------------------------------------------------- /wikitext/pegTokenizer.pegjs: -------------------------------------------------------------------------------- 1 | /** 2 | * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several 3 | * chunks of tokens (one chunk per top-level block matched) and eventually an 4 | * end event. Tokens map to HTML tags as far as possible, with custom tokens 5 | * used where further processing on the token stream is needed. 
6 | */ 7 | { 8 | 9 | var pegIncludes = options.pegIncludes; 10 | var pegTokenizer = options.pegTokenizer; 11 | 12 | var env = pegTokenizer.env; 13 | var pipelineOpts = pegTokenizer.options; 14 | 15 | var DU = pegIncludes.DOMUtils; 16 | var Util = pegIncludes.Util; 17 | var JSUtils = pegIncludes.JSUtils; 18 | var PegTokenizer = pegIncludes.PegTokenizer; 19 | var defines = pegIncludes.defines; 20 | var constants = pegIncludes.constants; 21 | var tu = pegIncludes.tu; 22 | 23 | // define some constructor shortcuts 24 | var KV = defines.KV; 25 | var TagTk = defines.TagTk; 26 | var SelfclosingTagTk = defines.SelfclosingTagTk; 27 | var EndTagTk = defines.EndTagTk; 28 | var NlTk = defines.NlTk; 29 | var CommentTk = defines.CommentTk; 30 | var EOFTk = defines.EOFTk; 31 | var lastItem = JSUtils.lastItem; 32 | 33 | var inlineBreaks = tu.inlineBreaks; 34 | var stops = new tu.SyntaxStops(); 35 | 36 | var prevOffset = 0; 37 | 38 | // Some shorthands for legibility 39 | var startOffset = function() { 40 | return location().start.offset; 41 | }; 42 | var endOffset = function() { 43 | return location().end.offset; 44 | }; 45 | var tsrOffsets = function(flag) { 46 | return tu.tsrOffsets(location(), flag); 47 | }; 48 | 49 | /* 50 | * Emit a chunk of tokens to our consumers. Once this has been done, the 51 | * current expression can return an empty list (true). 52 | */ 53 | var emitChunk = function(tokens) { 54 | if (env.immutable) { 55 | // Tokens placed in the tokenizer's cache have been frozen to 56 | // to catch any mutations while testing, which may have led to 57 | // subtle, spooky action at a distance. 
58 | tokens = Util.unFreeze(tokens, true); 59 | } 60 | 61 | // Shift tsr of all tokens by the pipeline offset 62 | Util.shiftTokenTSR(tokens, options.pipelineOffset); 63 | env.log("trace/peg", pegTokenizer.pipelineId, "----> ", tokens); 64 | 65 | var i; 66 | var n = tokens.length; 67 | 68 | // Enforce parsing resource limits 69 | for (i = 0; i < n; i++) { 70 | tu.enforceParserResourceLimits(env, tokens[i]); 71 | } 72 | 73 | // limit the size of individual chunks 74 | var chunkLimit = 100000; 75 | if (n > chunkLimit) { 76 | i = 0; 77 | while (i < n) { 78 | options.cb(tokens.slice(i, i + chunkLimit)); 79 | i += chunkLimit; 80 | } 81 | } else { 82 | options.cb(tokens); 83 | } 84 | }; 85 | 86 | /* ------------------------------------------------------------------------ 87 | * Extension tags should be parsed with higher priority than anything else. 88 | * 89 | * The trick we use is to strip out the content inside a matching tag-pair 90 | * and not tokenize it. The content, if it needs to parsed (for example, 91 | * for , <*include*> tags), is parsed in a fresh tokenizer context 92 | * which means any error correction that needs to happen is restricted to 93 | * the scope of the extension content and doesn't spill over to the higher 94 | * level. Ex: ).)*-->/g, "") 557 | // but, as always, things around here are a little more complicated. 558 | // 559 | // We accept the same comments, but because we emit them as HTML comments 560 | // instead of deleting them, we have to encode the data to ensure that 561 | // we always emit a valid HTML5 comment. See the encodeComment helper 562 | // for further details. 563 | 564 | comment 565 | = '" .)* ('-->' / eof) { 566 | var data = DU.encodeComment(c); 567 | return [new CommentTk(data, { tsr: tsrOffsets() })]; 568 | } 569 | 570 | 571 | // Behavior switches. 
// See: https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches

// A behavior switch is a double-underscore-delimited magic word such as
// "__NOTOC__".  Known magic words (per env.conf.wiki.isMagicWord) become a
// self-closing 'behavior-switch' token; anything else is passed through
// unchanged as plain text.
behavior_switch
  = bs:$('__' behavior_text '__') {
    if (env.conf.wiki.isMagicWord(bs)) {
      return [
        new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ],
          { tsr: tsrOffsets(), src: bs, magicSrc: bs }
        ),
      ];
    } else {
      return [ bs ];
    }
  }

// Instead of defining a charset, php's doDoubleUnderscore concats a regexp of
// all the language specific aliases of the behavior switches and then does a
// match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
behavior_text = $( !'__' [^'"<~[{\n\r:;\]}|!=] )+


/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// Bare (unbracketed) links: a free-standing URL, RFC/PMID reference, or
// ISBN.  Suppressed while inside an extlink (no nesting) and requires a
// word boundary before the match.
autolink
  = ! { return stops.onStack('extlink'); }
    // this must be a word boundary, so previous character must be non-word
    ! { return /\w/.test(input[endOffset() - 1] || ''); }
    r:(
        // urllink, inlined
        target:autourl {
          var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })];
          return res;
        }
      / autoref
      / isbn) { return r; }

// Bracketed external link: [proto://target optional caption].
// Note the 'extlink' stop is pushed on entry and must be popped on BOTH
// exits (the success action and the failure alternative on the last line).
extlink "extlink"
  = ! { return stops.onStack('extlink'); } // extlink cannot be nested
    r:(
        "["
        & { return stops.push('extlink', true); }
        addr:(url_protocol urladdr / "")
        target:(extlink_preprocessor_text / "")
        & {
          // Protocol must be valid and there ought to be at least one
          // post-protocol character. So strip last char off target
          // before testing protocol.
          var flat = tu.flattenString([addr, target]);
          if (Array.isArray(flat)) {
            // There are templates present, alas.
            return flat.length > 0;
          }
          return Util.isProtocolValid(flat.slice(0, -1), env);
        }
        sp:$( space / unispace )*
        targetOff:( "" { return endOffset(); })
        content:inlineline?
        "]" {
          stops.pop('extlink');
          return [
            new SelfclosingTagTk('extlink', [
              new KV('href', tu.flattenString([addr, target])),
              new KV('mw:content', content || ''),
              new KV('spaces', sp),
            ], {
              targetOff: targetOff,
              tsr: tsrOffsets(),
              contentOffsets: [targetOff, endOffset() - 1],
            }),
          ];
        }
      / "[" & { return stops.pop('extlink'); }
    ) { return r; }

// RFC/PMID "magic links": e.g. "RFC 1234" becomes an extlink to the
// corresponding IETF/PubMed URL (see base_urls below).
autoref
  = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word
    {
      var base_urls = {
        'RFC': 'https://tools.ietf.org/html/rfc%s',
        'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract',
      };
      return [
        new SelfclosingTagTk('extlink', [
          new KV('href', tu.sprintf(base_urls[ref], identifier)),
          new KV('mw:content', tu.flattenString([ref, sp, identifier])),
          new KV('typeof', 'mw:ExtLink/' + ref),
        ],
        { stx: "magiclink", tsr: tsrOffsets() }),
      ];
    }

// ISBN magic link: digits optionally separated by spaces/nbsp/dashes,
// with an optional trailing X check digit; linked to Special:BookSources.
isbn
  = 'ISBN' sp:space_or_nbsp+ isbn:(
      [0-9]
      (s:space_or_nbsp_or_dash &[0-9] { return s; } / [0-9])+
      ((space_or_nbsp_or_dash / "") [xX] / "")
    ) isbncode:(
      end_of_word
      {
        // Convert isbn token-and-entity array to stripped string.
        return tu.flattenStringlist(isbn).filter(function(e) {
          return e.constructor === String;
        }).join('').replace(/[^\dX]/ig, '').toUpperCase();
      }
    ) &{
      // ISBNs can only be 10 or 13 digits long (with a specific format)
      return isbncode.length === 10 ||
        (isbncode.length === 13 && /^97[89]/.test(isbncode));
    } {
      return [
        new SelfclosingTagTk('extlink', [
          new KV('href', 'Special:BookSources/' + isbncode),
          new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])),
          new KV('typeof', 'mw:WikiLink/ISBN'),
        ],
        { stx: "magiclink", tsr: tsrOffsets() }),
      ];
    }


/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */

url_protocol =
    & { return Util.isProtocolValid(input.substr(endOffset()), env); }
    p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? ) { return p; }

// no punctuation, and '{<' to trigger directives
no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

// this is the general url rule
// on the PHP side, the path part matches EXT_LINK_URL_CLASS
// which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'
// the 's' and 'r' pieces below match the characters in
// EXT_LINK_URL_CLASS which aren't included in no_punctuation_char
url "url"
  = proto:url_protocol
    addr:(urladdr / "")
    path:( ( !inline_breaks
             c:no_punctuation_char
             { return c; }
           )
         / s:[.:,'] { return s; }
         / comment
         / tplarg_or_template
         / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
           r:(
               & "&" he:htmlentity { return he; }
             / [&%{]
           ) { return r; }
    )*
    // Must be at least one character after the protocol
    & { return addr.length > 0 || path.length > 0; }
    {
      return tu.flattenString([proto, addr].concat(path));
    }

// this is the somewhat-restricted rule used in autolinks
// See Parser::doMagicLinks and Parser.php::makeFreeExternalLink.
// The `path` portion matches EXT_LINK_URL_CLASS, as in the general
// url rule. As in PHP, we do some fancy fixup to yank out
// trailing punctuation, perhaps including parentheses.
// The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS
// which aren't included in no_punctuation_char
autourl
  = &{ return stops.push('autourl', { sawLParen: false }); }
    ! '//' // protocol-relative autolinks not allowed (T32269)
    r:(
      proto:url_protocol
      addr:(urladdr / "")
      path:( ( !inline_breaks
               ! "("
               c:no_punctuation_char
               { return c; }
             )
           / "(" { stops.onStack('autourl').sawLParen = true; return "("; }
           / [.:,]
           / $(['] ![']) // single quotes are ok, double quotes are bad
           / comment
           / tplarg_or_template
           / ! ( rhe:raw_htmlentity &{ return /^[<>\u00A0]$/.test(rhe); } )
             r:(
                 & "&" he:htmlentity { return he; }
               / [&%{]
             ) { return r; }
      )*
      {
        // as in Parser.php::makeFreeExternalLink, we're going to
        // yank trailing punctuation out of this match.
        var url = tu.flattenStringlist([proto, addr].concat(path));
        // only need to look at last element; HTML entities are strip-proof.
        var last = lastItem(url);
        var trim = 0;
        if (last && last.constructor === String) {
          var strip = ',;\\.:!?';
          if (!stops.onStack('autourl').sawLParen) {
            // no opening paren was seen, so a trailing ')' is punctuation too
            strip += ')';
          }
          strip = new RegExp('[' + JSUtils.escapeRegExp(strip) + ']*$');
          trim = strip.exec(last)[0].length;
          url[url.length - 1] = last.slice(0, last.length - trim);
        }
        url = tu.flattenStringlist(url);
        if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) {
          return null; // ensure we haven't stripped everything: T106945
        }
        // rewind the parser position past the punctuation we trimmed off
        peg$currPos -= trim;
        stops.pop('autourl');
        return url;
      } ) &{ return r !== null; } {return r; }
  / &{ return stops.pop('autourl'); }

// This is extracted from EXT_LINK_ADDR in Parser.php: a simplified
// expression to match an IPv6 address. The IPv4 address and "at least
// one character of a host name" portions are punted to the `path`
// component of the `autourl` and `url` productions
urladdr
  = $( "[" [0-9A-Fa-f:.]+ "]" )

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 * This is only if close has > 3 braces; otherwise we just match open
 * and close as we find them.
 */
tplarg_or_template
  = &'{{' &{
      // Refuse to recurse beyond `maxDepth` levels. Default in the PHP parser
      // is $wgMaxTemplateDepth = 40; This is to prevent crashing from
      // buggy wikitext with lots of unclosed template calls, as in
      // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094
      if (stops.onCount('templatedepth') === undefined ||
          stops.onCount('templatedepth') < env.conf.parsoid.maxDepth) {
        return true;
      } else {
        return false;
      }
    } t:tplarg_or_template_guarded { return t; }

// Disambiguates {{ vs {{{ per the precedence table above; increments the
// 'templatedepth' counter on entry and decrements it on every exit path.
tplarg_or_template_guarded
  = &{ return stops.inc('templatedepth'); }
    r:( &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return a; }
      / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [a].concat(b); }
      / a:$('{' &('{{' !'{'))? b:template { return [a].concat(b); }
      / a:broken_template { return a; }
    ) {
      stops.dec('templatedepth');
      return r;
    }
  / & { return stops.dec('templatedepth'); }

tplarg_or_template_or_bust "tplarg_or_template_or_bust"
  = r:(tplarg_or_template / .)+ { return tu.flattenIfArray(r); }

// Push the expected closer '}}' on the preproc stops stack; popTo() on
// both success and failure keeps the stack clean across backtracking.
template
  = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); })
    t:( template_preproc / &{ return stops.popTo('preproc', stopLen); } )
    { stops.popTo('preproc', stopLen); return t; }

// The PHP preprocessor maintains a single stack of "closing token we
// are currently looking for", with no backtracking. This means that
// once you see `[[ {{` you are looking only for `}}` -- if that template
// turns out to be broken you will never pop the `}}` and there is no way
// to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking
// and parses in a single pass (instead of PHP's split preprocessor/parser)
// we have to be a little more careful when we emulate this behavior.
// If we use a rule like:
//   template = "{{" tplname tplargs* "}}"?
// Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
// turns out we never find the `}}`, which involves a lot of tedious gluing
// tokens back together with fingers crossed we haven't discarded any
// significant newlines/whitespace/etc. An alternative would be a rule like:
//   broken_template = "{{" tlb
// but again, `template` is used in many different contexts; `tlb` isn't
// necessarily the right one to recursively invoke. Instead we get the
// broken template off of the PEGjs production stack by returning immediately
// after `{{`, but we leave a "broken token" on top of the preprocessor
// stops stack to indicate we're "still in" the {{ context and shouldn't
// ever inlineBreak for any closing tokens above this one. For example:
//   [[Foo{{Bar]]
// This will match as:
//   wikilink->text,template->text             --> FAILS looking for }}
//     backtracks, popping "]]" and "}}" off preproc stack
//   wikilink->text,broken_template,text       --> FAILS looking for ]]
//     backtracks, popping "]]" and "broken" off preproc stack
//   broken_wikilink,text,broken_template,text --> OK
//     with ["broken", "broken"] left on the preproc stops stack
// Note that we use stops.popTo() to make sure the preproc stack is
// cleaned up properly during backtracking, even if there were broken-FOO
// productions taken which (deliberately) left elements on the preproc stack.

// A "broken" template: we saw "{{" but no well-formed template follows.
// A 'broken' marker is pushed on the preproc stops stack and deliberately
// left there (see the long comment above) so that inline_breaks keeps
// treating us as being inside the "{{" context.
broken_template
  = &"{{" &{ return stops.push('preproc', 'broken'); }
    // for broken-template, deliberately fail to pop the preproc stops stack
    t:"{{" { return t; }

// A well-formed transclusion: {{ target | param | param ... }}.
// The degenerate "{{ }}" (braces with only whitespace) falls through to
// the final alternative and is returned as literal source text.
template_preproc
  = "{{" nl_comment_space*
    target:template_param_value
    params:(nl_comment_space* "|"
        r:( p0:("" { return endOffset(); })
            v:nl_comment_space*
            p:("" { return endOffset(); })
            &("|" / "}}")
            { return new KV('', tu.flattenIfArray(v), [p0, p0, p0, p]); } // empty argument
          / template_param
        ) { return r; }
    )*
    nl_comment_space*
    inline_breaks "}}" {
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
      var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() });
      return obj;
    } / $('{{' space_or_newline* '}}')

// Template argument {{{...}}}. Same preproc-stack protocol as `template`:
// push the expected closer, then popTo() on both success and failure so
// backtracking leaves the stack clean.
tplarg
  = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); })
    t:(tplarg_preproc / &{ return stops.popTo('preproc', stopLen); } )
    { stops.popTo('preproc', stopLen); return t; }

tplarg_preproc
  = "{{{"
    p:("" { return endOffset(); })
    target:template_param_value?
    params:(nl_comment_space* "|"
        r:( p0:("" { return endOffset(); })
            v:nl_comment_space*
            p1:("" { return endOffset(); })
            &("|" / "}}}")
            { return { tokens: v, srcOffsets: [p0, p1] }; } // empty argument
          / template_param_value
        ) { return r; }
    )*
    nl_comment_space*
    inline_breaks "}}}" {
      // Normalize each parameter into a KV carrying 4-element src offsets.
      params = params.map(function(o) {
        var s = o.srcOffsets;
        return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]);
      });
      // "{{{}}}" has no target; substitute an empty one anchored at `p`.
      if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; }
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
      var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() });
      return obj;
    }

// One template parameter: "name=value", a bare positional value, or an
// empty parameter (immediately followed by '|' or '}').
template_param
  = name:template_param_name
    val:(
        kEndPos:("" { return endOffset(); })
        optionalSpaceToken
        "="
        vStartPos:("" { return endOffset(); })
        optionalSpaceToken
        tpv:template_param_value? {
            return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] };
        }
    )? {
      if (val !== null) {
          if (val.value !== null) {
              // name=value
              return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
          } else {
              // name= (empty value)
              return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
          }
      } else {
          // positional parameter: no '=', so the text is the value
          return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]);
      }
    }
  // empty parameter
  / & [|}] {
      return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]);
  }

// Parameter name: parsed with the 'equal' stop enabled so the text stops
// at the first '='; a leading '=' yields an empty name.
template_param_name
  = & { return stops.push('equal', true); }
    tpt:(template_param_text / &'=' { return ''; })
    {
        stops.pop('equal');
        return tpt;
    }
  / & { return stops.pop('equal'); }

// Parameter value: like the name, but '=' is an ordinary character here.
template_param_value
  = & { return stops.push('equal', false); }
    tpt:template_param_text
    {
        stops.pop('equal');
        return { tokens: tpt, srcOffsets: tsrOffsets() };
    }
  / & { return stops.pop('equal'); }

template_param_text
  = & { // re-enable tables within template parameters
      stops.push('table', false);
      stops.push('extlink', false);
      stops.push('templateArg', true);
      stops.push('tableCellArg', false);
      return stops.inc('template');
    }
    il:(nested_block / newlineToken)+ {
        stops.pop('table');
        stops.pop('extlink');
        stops.pop('templateArg');
        stops.pop('tableCellArg');
        stops.dec('template');
        // il is guaranteed to be an array -- so, tu.flattenIfArray will
        // always return an array
        var r = tu.flattenIfArray(il);
        if (r.length === 1 && r[0].constructor === String) {
            r = r[0];
        }
        return r;
    }
    // failure path: undo every stop pushed/incremented by the predicate above
  / & { stops.pop('table');
        stops.pop('extlink');
        stops.pop('templateArg');
        stops.pop('tableCellArg');
        return stops.dec('template');
    }

//// Language converter block markup of language variants: -{ ... }-

// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).

lang_variant_or_tpl
  = &('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return a; }
  / a:$('-' &('{{{'+ !'{')) b:tplarg { return [a].concat(b); }
  / a:$('-' &('{{' '{{{'* !'{')) b:template { return [a].concat(b); }
  / &'-{' a:lang_variant { return a; }

// Analogous to broken_template: "-{" with no matching "}-".
broken_lang_variant
  = &{ return stops.push('preproc', 'broken'); }
    // for broken-lang-variant, deliberately fail to pop the stops stack
    r:"-{" { return r; }

lang_variant
  = stopLen:("" { return stops.push('preproc', /* -{ */ '}-'); })
    lv:(lang_variant_preproc / &{ return stops.popTo('preproc', stopLen); })
    { stops.popTo('preproc', stopLen); return lv; }
  / broken_lang_variant

lang_variant_preproc
  = lv0:("-{" { return startOffset(); })
    f:(
       &{ return env.langConverterEnabled(); }
       ff:opt_lang_variant_flags {
         // Avoid mutating cached expression results
         ff = Util.clone(ff, true);
         // if flags contains 'R', then don't treat ; or : specially inside.
         if (ff.flags) {
           ff.raw = ff.flags.has('R') || ff.flags.has('N');
         } else if (ff.variants) {
           ff.raw = true;
         }
         return ff;
       } /
       &{ return !env.langConverterEnabled(); }
       "" {
         // if language converter not enabled, don't try to parse inside.
         return { raw: true };
       }
    )
    ts:(
      &{ return f.raw; } lv:lang_variant_text { return [{ text: lv }]; }
      /
      &{ return !f.raw; } lv:lang_variant_option_list { return lv; }
    )
    inline_breaks
    lv1:("}-" { return endOffset(); }) {

      if (!env.langConverterEnabled()) {
        return [ "-{", ts[0].text.tokens, "}-" ];
      }
      var lvsrc = input.substring(lv0, lv1);
      var attribs = [];

      // Do a deep clone since we may be destructively modifying
      // (the `t[fld] = name;` below) the result of a cached expression
      ts = Util.clone(ts, true);

      ts.forEach(function(t) {
        // move token strings into KV attributes so that they are
        // properly expanded by early stages of the token pipeline
        ['text','from','to'].forEach(function(fld) {
          if (t[fld] === undefined) { return; }
          var name = 'mw:lv' + attribs.length;
          attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets));
          t[fld] = name;
        });
      });
      return [
        new SelfclosingTagTk(
          'language-variant',
          attribs,
          {
            tsr: [lv0, lv1],
            src: lvsrc,
            flags: f.flags && Array.from(f.flags).sort(),
            variants: f.variants && Array.from(f.variants).sort(),
            original: f.original,
            flagSp: f.sp,
            texts: ts,
          }),
      ];
    }

opt_lang_variant_flags
  = f:( ff:lang_variant_flags "|" { return ff; } )? {
    // Collect & separate flags and variants into a set and ordered list
    var flags = new Set();
    var variants = new Set();
    var flagList = [];
    var flagSpace = [];
    var variantList = [];
    var variantSpace = [];
    var useVariants = false;
    var internalSp = []; // internal whitespace, for round-tripping
    if (f !== null) {
      // lang_variant_flags returns arrays in reverse order.
      f.flags.reverse();
      f.sp.reverse();
      var spPtr = 0;
      f.flags.forEach(function(item) {
        if (item.flag) {
          flagSpace.push(f.sp[spPtr++]);
          flags.add(item.flag);
          flagList.push(item.flag);
          flagSpace.push(f.sp[spPtr++]);
        }
        if (item.variant) {
          variantSpace.push(f.sp[spPtr++]);
          variants.add(item.variant);
          variantList.push(item.variant);
          variantSpace.push(f.sp[spPtr++]);
        }
      });
      if (spPtr < f.sp.length) {
        // handle space after a trailing semicolon
        flagSpace.push(f.sp[spPtr]);
        variantSpace.push(f.sp[spPtr]);
      }
    }
    // Parse flags (this logic is from core/languages/ConverterRule.php
    // in the parseFlags() function)
    if (flags.size === 0 && variants.size === 0) {
      flags.add('$S');
    } else if (flags.has('R')) {
      flags = new Set(['R']); // remove other flags
    } else if (flags.has('N')) {
      flags = new Set(['N']); // remove other flags
    } else if (flags.has('-')) {
      flags = new Set(['-']); // remove other flags
    } else if (flags.has('T') && flags.size === 1) {
      flags.add('H');
    } else if (flags.has('H')) {
      // Replace A flag, and remove other flags except T and D
      var nf = new Set(['$+', 'H']);
      if (flags.has('T')) { nf.add('T'); }
      if (flags.has('D')) { nf.add('D'); }
      flags = nf;
    } else if (variants.size > 0) {
      useVariants = true;
    } else {
      if (flags.has('A')) {
        flags.add('$+');
        flags.add('$S');
      }
      if (flags.has('D')) {
        flags.delete('$S');
      }
    }
    if (useVariants) {
      return { variants: variants, original: variantList, sp: variantSpace };
    } else {
      return { flags: flags, original: flagList, sp: flagSpace };
    }
  }

lang_variant_flags
  = sp1:(space_or_newline*) f:lang_variant_flag sp2:(space_or_newline*)
    more:( ";" lang_variant_flags? )? {
      var r = more && more[1] ? more[1] : { sp: [], flags: [] };
      // Note that sp and flags are in reverse order, since we're using
      // right recursion and want to push instead of unshift.
      r.sp.push(sp2.join(''));
      r.sp.push(sp1.join(''));
      r.flags.push(f);
      return r;
    }
  / sp:(space_or_newline*) {
      return { sp: [ sp.join('') ], flags: [] };
    }

lang_variant_flag
  = f:[-+A-Z] { return { flag: f }; }
  / v:lang_variant_name { return { variant: v }; }
  / b:(!space_or_newline !nowiki [^{}|;])+ { return { bogus: b.join('') }; /* bad flag */}

lang_variant_name // language variant name, like zh, zh-cn, etc.
  = h:[a-z] t:[-a-z]+ { return h + t.join(''); }
  // Escaped otherwise-unrepresentable language names
  // Primarily for supporting html2html round trips; PHP doesn't support
  // using nowikis here (yet!)
  / nowiki_text

lang_variant_option_list
  = o:lang_variant_option rest:( ";" oo:lang_variant_option { return oo; })*
    tr:( ";" space_or_newline* )? // optional trailing semicolon
    {
      var r = [ o ].concat(rest);
      if (tr) { r.push({ semi: true, sp: tr[1].join('') }); }
      return r;
    }
  / lvtext:lang_variant_text { return [{ text: lvtext }]; }

lang_variant_option
  = sp1:(space_or_newline*) lang:lang_variant_name
    sp2:(space_or_newline*) ":"
    sp3:(space_or_newline*)
    lvtext:(lang_variant_nowiki / lang_variant_text_no_semi)
    {
      // two-way mapping: "lang : text"
      return {
        twoway: true,
        lang: lang,
        text: lvtext,
        sp: [sp1.join(''), sp2.join(''), sp3.join('')]
      };
    }
  / sp1:(space_or_newline*)
    from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
    "=>"
    sp2:(space_or_newline*) lang:lang_variant_name
    sp3:(space_or_newline*) ":"
    sp4:(space_or_newline*)
    to:(lang_variant_nowiki / lang_variant_text_no_semi)
    {
      // one-way mapping: "from => lang : to"
      return {
        oneway: true,
        from: from,
        lang: lang,
        to: to,
        sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')]
      };
    }

// html2wt support: If a language name or conversion string can't be
// represented w/o breaking wikitext, just wrap it in a <nowiki>.
// PHP doesn't support this (yet), but Parsoid does.
1239 | lang_variant_nowiki 1240 | = start:("" {return startOffset();}) 1241 | n:nowiki_text 1242 | end:("" { return endOffset();}) 1243 | space_or_newline* { 1244 | return { tokens: [ n ], srcOffsets: [start, end] }; 1245 | } 1246 | 1247 | lang_variant_text 1248 | = start:("" {return startOffset();}) 1249 | tokens:(inlineline / "|" )* 1250 | end:("" {return endOffset();}) 1251 | { return { tokens: tokens || [], srcOffsets: [start, end] }; } 1252 | 1253 | lang_variant_text_no_semi 1254 | = & { return stops.push('semicolon', true); } 1255 | lvtext:lang_variant_text 1256 | { stops.pop('semicolon'); return lvtext; } 1257 | / & { return stops.pop('semicolon'); } 1258 | 1259 | lang_variant_text_no_semi_or_arrow 1260 | = & { return stops.push('arrow', true); } 1261 | lvtext:lang_variant_text_no_semi { stops.pop('arrow'); return lvtext; } 1262 | / & { return stops.pop('arrow'); } 1263 | 1264 | wikilink_content 1265 | = ( pipe startPos:("" { return endOffset(); }) lt:link_text? { 1266 | var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]); 1267 | maybeContent.vsrc = input.substring(startPos, endOffset()); 1268 | return maybeContent; 1269 | } )* 1270 | 1271 | wikilink 1272 | = stopLen:("" { return stops.push('preproc', ']]'); }) 1273 | w:(wikilink_preproc / &{ return stops.popTo('preproc', stopLen); }) 1274 | { stops.popTo('preproc', stopLen); return w; } 1275 | / broken_wikilink 1276 | 1277 | // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the 1278 | // second bracket could start an extlink. Deliberately leave entry 1279 | // on preproc stack since we haven't seen a double-close bracket. 1280 | // (See full explanation above broken_template production.) 1281 | broken_wikilink 1282 | = &"[[" &{ return stops.push('preproc', 'broken'); } 1283 | a:("[" (extlink / "[")) { return a; } 1284 | 1285 | wikilink_preproc 1286 | = "[[" 1287 | target:wikilink_preprocessor_text? 
1288 | tpos:("" { return endOffset(); }) 1289 | lcs:wikilink_content 1290 | inline_breaks "]]" 1291 | { 1292 | var pipeTrick = (lcs.length === 1 && lcs[0].v === null); 1293 | var textTokens = []; 1294 | if (target === null || pipeTrick) { 1295 | textTokens.push("[["); 1296 | if (target) { 1297 | textTokens.push(target); 1298 | } 1299 | lcs.forEach(function(a) { 1300 | // a is a mw:maybeContent attribute 1301 | textTokens.push("|"); 1302 | if (a.v !== null) { textTokens.push(a.v); } 1303 | }); 1304 | textTokens.push("]]"); 1305 | return textTokens; 1306 | } 1307 | var obj = new SelfclosingTagTk('wikilink'); 1308 | var hrefKV = new KV('href', target); 1309 | hrefKV.vsrc = input.substring(startOffset() + 2, tpos); 1310 | // XXX: Point to object with path, revision and input information 1311 | // obj.source = input; 1312 | obj.attribs.push(hrefKV); 1313 | obj.attribs = obj.attribs.concat(lcs); 1314 | obj.dataAttribs = { 1315 | tsr: tsrOffsets(), 1316 | src: text(), 1317 | }; 1318 | return [obj]; 1319 | } 1320 | 1321 | // Tables are allowed inside image captions. 1322 | link_text 1323 | = & { 1324 | // Suppress the flag temporarily in this rule to consume the '=' here. 1325 | stops.push('equal', false); 1326 | return stops.push('linkdesc', true); 1327 | } 1328 | c:( // This group is similar to "block_line" but "list_item" 1329 | // is omitted since `doBlockLevels` happens after 1330 | // `replaceInternalLinks2`, where newlines are stripped. 1331 | (sol (heading / hr / full_table_in_link_caption)) 1332 | / urltext 1333 | / (!inline_breaks 1334 | r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return r; } 1335 | ) 1336 | )+ { 1337 | stops.pop('equal'); 1338 | stops.pop('linkdesc'); 1339 | return tu.flattenStringlist(c); 1340 | } 1341 | / & { stops.pop('equal'); return stops.pop('linkdesc'); } 1342 | 1343 | /* Generic quote rule for italic and bold, further processed in a token 1344 | * stream transformation in doQuotes. 
Relies on NlTk tokens being emitted 1345 | * for each line of text to balance quotes per line. 1346 | * 1347 | * We are not using a simple pair rule here as we need to support mis-nested 1348 | * bolds/italics and MediaWiki's special heuristics for apostrophes, which are 1349 | * all not context free. */ 1350 | quote = quotes:$("''" "'"*) { 1351 | // sequences of four or more than five quotes are assumed to start 1352 | // with some number of plain-text apostrophes. 1353 | var plainticks = 0; 1354 | var result = []; 1355 | if (quotes.length === 4) { 1356 | plainticks = 1; 1357 | } else if (quotes.length > 5) { 1358 | plainticks = quotes.length - 5; 1359 | } 1360 | if (plainticks > 0) { 1361 | result.push(quotes.substring(0, plainticks)); 1362 | } 1363 | // mw-quote token Will be consumed in token transforms 1364 | var tsr = tsrOffsets(); 1365 | tsr[0] += plainticks; 1366 | var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr }); 1367 | mwq.value = quotes.substring(plainticks); 1368 | result.push(mwq); 1369 | return result; 1370 | } 1371 | 1372 | 1373 | /*********************************************************** 1374 | * Pre and xmlish tags 1375 | ***********************************************************/ 1376 | 1377 | extension_tag = 1378 | &{ return !stops.onStack('extTag'); } 1379 | extToken:xmlish_tag 1380 | // Account for `maybeExtensionTag` returning unmatched start / end tags 1381 | &{ return extToken.name === 'extension'; } 1382 | { return extToken; } 1383 | 1384 | nowiki 1385 | = extToken:extension_tag 1386 | &{ return extToken.getAttribute('name') === 'nowiki'; } 1387 | { return extToken; } 1388 | 1389 | // Used by nowiki extension to tokenize html entities. 1390 | nowiki_content "nowiki_content" 1391 | = c:(htmlentity / .)* { return tu.flattenIfArray(c); } 1392 | 1393 | // Used by lang_variant productions to protect special language names or 1394 | // conversion strings. 
1395 | nowiki_text 1396 | = extToken:nowiki 1397 | { 1398 | var txt = Util.getExtArgInfo(extToken).dict.body.extsrc; 1399 | return Util.decodeEntities(txt); 1400 | } 1401 | 1402 | /* Generic XML-like tags 1403 | * 1404 | * These also cover extensions (including Cite), which will hook into the 1405 | * token stream for further processing. The content of extension tags is 1406 | * parsed as regular inline, but the source positions of the tag are added 1407 | * to allow reconstructing the unparsed text from the input. */ 1408 | 1409 | // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1410 | // following paragraphs. 1411 | tag_name_chars = [^\t\n\v />\0] 1412 | tag_name = $([A-Za-z] tag_name_chars*) 1413 | 1414 | xmlish_tag 1415 | = & { 1416 | // By the time we get to `doTableStuff` in the php parser, we've already 1417 | // safely encoded element attributes. See 55313f4e in core. 1418 | stops.push('table', false); 1419 | stops.push('tableCellArg', false); 1420 | return true; 1421 | } 1422 | "<" end:"/"? 1423 | name:$(tn:tag_name & { 1424 | return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed. 1425 | }) 1426 | attribs:generic_newline_attributes 1427 | space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff 1428 | selfclose:"/"? 1429 | space* // not preserved - canonicalized on RT via dirty diff 1430 | ">" { 1431 | stops.pop('table'); 1432 | stops.pop('tableCellArg'); 1433 | stops.pop('extTag'); 1434 | 1435 | var lcName = name.toLowerCase(); 1436 | 1437 | // Extension tags don't necessarily have the same semantics as html tags, 1438 | // so don't treat them as void elements. 1439 | var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName); 1440 | 1441 | // Support
1442 | if (lcName === 'br' && end) { 1443 | end = null; 1444 | } 1445 | 1446 | var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets()); 1447 | 1448 | // change up data-attribs in one scenario 1449 | // void-elts that aren't self-closed ==> useful for accurate RT-ing 1450 | if (!selfclose && isVoidElt) { 1451 | res.dataAttribs.selfClose = undefined; 1452 | res.dataAttribs.noClose = true; 1453 | } 1454 | 1455 | return maybeExtensionTag(res); 1456 | } 1457 | / "<" "/"? tag_name & { return stops.pop('extTag'); } 1458 | / & { stops.pop('table'); return stops.pop('tableCellArg'); } 1459 | 1460 | /* 1461 | * A variant of xmlish_tag, but also checks if the tag name is a block-level 1462 | * tag as defined in 1463 | * http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1464 | * following paragraphs. 1465 | */ 1466 | block_tag 1467 | = & { 1468 | // By the time we get to `doTableStuff` in the php parser, we've already 1469 | // safely encoded element attributes. See 55313f4e in core. 1470 | stops.push('table', false); 1471 | stops.push('tableCellArg', false); 1472 | return true; 1473 | } 1474 | "<" end:"/"? 1475 | name:$(tn:tag_name & { 1476 | return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed. 1477 | }) 1478 | attribs:generic_newline_attributes 1479 | space_or_newline* 1480 | selfclose:"/"? 1481 | ">" { 1482 | stops.pop('table'); 1483 | stops.pop('tableCellArg'); 1484 | stops.pop('extTag'); 1485 | var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets()); 1486 | return [maybeExtensionTag(t)]; 1487 | } 1488 | / "<" "/"? tag_name & { return stops.pop('extTag'); } 1489 | / & { stops.pop('table'); return stops.pop('tableCellArg'); } 1490 | 1491 | // A generic attribute that can span multiple lines. 
1492 | generic_newline_attribute 1493 | = s:space_or_newline* 1494 | namePos0:("" { return endOffset(); }) 1495 | name:generic_attribute_name 1496 | namePos:("" { return endOffset(); }) 1497 | vd:(space_or_newline* "=" v:generic_att_value? { return v; })? 1498 | { 1499 | // NB: Keep in sync w/ table_attibute 1500 | var res; 1501 | // Encapsulate protected attributes. 1502 | if (typeof name === 'string') { 1503 | name = tu.protectAttrs(name); 1504 | } 1505 | if (vd !== null) { 1506 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1507 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1508 | } else { 1509 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1510 | } 1511 | if (Array.isArray(name)) { 1512 | res.ksrc = input.substring(namePos0, namePos); 1513 | } 1514 | return res; 1515 | } 1516 | 1517 | // A single-line attribute. 1518 | table_attribute 1519 | = s:optionalSpaceToken 1520 | namePos0:("" { return endOffset(); }) 1521 | name:table_attribute_name 1522 | namePos:("" { return endOffset(); }) 1523 | vd:(optionalSpaceToken "=" v:table_att_value? { return v; })? 1524 | { 1525 | // NB: Keep in sync w/ generic_newline_attribute 1526 | var res; 1527 | // Encapsulate protected attributes. 1528 | if (typeof name === 'string') { 1529 | name = tu.protectAttrs(name); 1530 | } 1531 | if (vd !== null) { 1532 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1533 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1534 | } else { 1535 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1536 | } 1537 | if (Array.isArray(name)) { 1538 | res.ksrc = input.substring(namePos0, namePos); 1539 | } 1540 | return res; 1541 | } 1542 | 1543 | // The arrangement of chars is to emphasize the split between what's disallowed 1544 | // by html5 and what's necessary to give directive a chance. 
1545 | // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 1546 | generic_attribute_name 1547 | = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1548 | r:( $[^ \t\r\n\0/=><&{}\-!|]+ 1549 | / !inline_breaks 1550 | // \0/=> is the html5 attribute name set we do not want. 1551 | t:( directive / !( space_or_newline / [\0/=>] ) c:. { return c; } 1552 | ) { return t; } 1553 | )* 1554 | & { return r.length > 0 || q.length > 0; } 1555 | { return tu.flattenString([q].concat(r)); } 1556 | 1557 | // Also accept these chars in a wikitext table or tr attribute name position. 1558 | // They are normally not matched by the table_attribute_name. 1559 | broken_table_attribute_name_char = c:[\0/=>] { return new KV(c, ''); } 1560 | 1561 | // Same as generic_attribute_name, except for accepting tags and wikilinks. 1562 | // (That doesn't make sense (ie. match php) in the generic case.) 1563 | // We also give a chance to break on \[ (see T2553). 1564 | table_attribute_name 1565 | = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1566 | r:( $[^ \t\r\n\0/=><&{}\-!|\[]+ 1567 | / !inline_breaks 1568 | // \0/=> is the html5 attribute name set we do not want. 1569 | t:( $wikilink 1570 | / directive 1571 | // Accept insane tags-inside-attributes as attribute names. 1572 | // The sanitizer will strip and shadow them for roundtripping. 1573 | // Example: generated with.. 1574 | / &xmlish_tag ill:inlineline { return ill; } 1575 | / !( space_or_newline / [\0/=>] ) c:. { return c; } 1576 | ) { return t; } 1577 | )* 1578 | & { return r.length > 0 || q.length > 0; } 1579 | { return tu.flattenString([q].concat(r)); } 1580 | 1581 | // Attribute value, quoted variants can span multiple lines. 1582 | // Missing end quote: accept /> look-ahead as heuristic. 1583 | // These need to be kept in sync with the attribute_preprocessor_text_* 1584 | generic_att_value 1585 | = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? 
q:$("'" / &('/'? '>')) { 1586 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1587 | } 1588 | / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) { 1589 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1590 | } 1591 | / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { 1592 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1593 | } 1594 | 1595 | // Attribute value, restricted to a single line. 1596 | // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. 1597 | // These need to be kept in sync with the table_attribute_preprocessor_text_* 1598 | table_att_value 1599 | = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) { 1600 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1601 | } 1602 | / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) { 1603 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1604 | } 1605 | / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { 1606 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1607 | } 1608 | 1609 | /********************************************************* 1610 | * Lists 1611 | *********************************************************/ 1612 | list_item = dtdd / hacky_dl_uses / li 1613 | 1614 | li = bullets:list_char+ 1615 | c:inlineline? 1616 | // The inline_break is to check if we've hit a template end delimiter. 
1617 | &(eolf / inline_breaks) 1618 | { 1619 | // Leave bullets as an array -- list handler expects this 1620 | var tsr = tsrOffsets('start'); 1621 | tsr[1] += bullets.length; 1622 | var li = new TagTk('listItem', [], { tsr: tsr }); 1623 | li.bullets = bullets; 1624 | return [ li ].concat(c || []); 1625 | } 1626 | 1627 | /* 1628 | * This rule is required to support wikitext of this form 1629 | * ::{|border="1"|foo|bar|baz|} 1630 | * where the leading colons are used to indent the entire table. 1631 | * This hack was added back in 2006 in commit 1632 | * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl 1633 | * Fürstenberg. 1634 | */ 1635 | hacky_dl_uses = bullets:":"+ 1636 | tbl:(table_line (sol table_line)*) 1637 | line:inlineline? 1638 | &comment_space_eolf 1639 | { 1640 | // Leave bullets as an array -- list handler expects this 1641 | var tsr = tsrOffsets('start'); 1642 | tsr[1] += bullets.length; 1643 | var li = new TagTk('listItem', [], { tsr: tsr }); 1644 | li.bullets = bullets; 1645 | return tu.flattenIfArray([li, tbl || [], line || []]); 1646 | } 1647 | 1648 | dtdd 1649 | = bullets:(!(";" !list_char) lc:list_char { return lc; })* 1650 | ";" 1651 | & {return stops.inc('colon');} 1652 | c:inlineline? 1653 | cpos:(":" { return endOffset(); }) 1654 | // Fortunately dtdds cannot be nested, so we can simply set the flag 1655 | // back to 0 to disable it. 1656 | & { stops.counters.colon = 0; return true;} 1657 | d:inlineline? 
1658 | &eolf { 1659 | // Leave bullets as an array -- list handler expects this 1660 | // TSR: +1 for the leading ";" 1661 | var numBullets = bullets.length + 1; 1662 | var tsr = tsrOffsets('start'); 1663 | tsr[1] += numBullets; 1664 | var li1 = new TagTk('listItem', [], { tsr: tsr }); 1665 | li1.bullets = bullets.slice(); 1666 | li1.bullets.push(";"); 1667 | // TSR: -1 for the intermediate ":" 1668 | var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); 1669 | li2.bullets = bullets.slice(); 1670 | li2.bullets.push(":"); 1671 | 1672 | return [ li1 ].concat(c || [], [ li2 ], d || []); 1673 | } 1674 | // Fall-back case to clear the colon flag 1675 | / & { stops.counters.colon = 0; return false; } 1676 | 1677 | 1678 | list_char = [*#:;] 1679 | 1680 | 1681 | 1682 | /****************************************************************************** 1683 | * Tables 1684 | * ------ 1685 | * Table rules are geared to support independent parsing of fragments in 1686 | * templates (the common table start / row / table end use case). The tokens 1687 | * produced by these fragments then match up to a table while building the 1688 | * DOM tree. For similar reasons, table rows do not emit explicit end tag 1689 | * tokens. 1690 | * 1691 | * The separate table_line rule is faster than moving those rules 1692 | * directly to block_lines. 1693 | * 1694 | * Notes about the full_table_in_link_caption rule 1695 | * ----------------------------------------------------- 1696 | * However, for link-tables, we have introduced a stricter parse wherein 1697 | * we require table-start and table-end tags to not come from a template. 1698 | * In addition, this new rule doesn't accept fosterable-content in 1699 | * the table unlike the more lax (sol table_line)+ rule. 1700 | * 1701 | * This is the best we can do at this time since we cannot distinguish 1702 | * between table rows and image options entirely in the tokenizer. 
1703 | * 1704 | * Consider the following examples: 1705 | * 1706 | * Example 1: 1707 | * 1708 | * [[Image:Foo.jpg|left|30px|Example 1 1709 | * {{This-template-returns-a-table-start-tag}} 1710 | * |foo 1711 | * {{This-template-returns-a-table-end-tag}} 1712 | * ]] 1713 | * 1714 | * Example 2: 1715 | * 1716 | * [[Image:Foo.jpg|left|30px|Example 1 1717 | * {{echo|a}} 1718 | * |foo 1719 | * {{echo|b}} 1720 | * ]] 1721 | * 1722 | * So, we cannot know a priori (without preprocessing or fully expanding 1723 | * all templates) if "|foo" in the two examples is a table cell or an image 1724 | * option. This is a limitation of our tokenizer-based approach compared to 1725 | * the preprocessing-based approach of the PHP parser. 1726 | * 1727 | * Given this limitation, we are okay forcing a full-table context in 1728 | * link captions (if necessary, we can relax the fosterable-content requirement 1729 | * but that is broken wikitext anyway, so we can force that edge-case wikitext 1730 | * to get fixed by rejecting it). 1731 | ******************************************************************************/ 1732 | 1733 | full_table_in_link_caption 1734 | = (! inline_breaks / & '{{!}}' ) 1735 | r:( 1736 | // Note that "linkdesc" is suppressed here to provide a nested parsing 1737 | // context in which to parse the table. Otherwise, we may break on 1738 | // on pipes in the `table_start_tag` and `table_row_tag` attributes. 1739 | // However, as a result, this can be more permissive than the current 1740 | // php implementation, but likelier to match the users intent. 1741 | & { stops.push('linkdesc', false); return stops.push('table', true); } 1742 | tbl:( 1743 | table_start_tag optionalNewlines 1744 | // Accept multiple end tags since a nested table may have been 1745 | // opened in the table content line. 
1746 | ((sol (table_content_line / tplarg_or_template) optionalNewlines)* 1747 | sol table_end_tag)+ 1748 | ){ 1749 | stops.pop('linkdesc'); 1750 | stops.pop('table'); 1751 | return tbl; 1752 | } 1753 | / & { stops.pop('linkdesc'); return stops.pop('table'); } 1754 | ) { return r; } 1755 | 1756 | // This rule assumes start-of-line position! 1757 | table_line 1758 | = (! inline_breaks / & '{{!}}' ) 1759 | r:( 1760 | & { return stops.push('table', true); } 1761 | tl:( 1762 | table_start_tag optionalNewlines 1763 | / table_content_line optionalNewlines 1764 | / table_end_tag 1765 | ) { 1766 | stops.pop('table'); 1767 | return tl; 1768 | } 1769 | / & { return stops.pop('table'); } 1770 | ) { return r; } 1771 | 1772 | table_content_line = (space / comment)* ( 1773 | table_heading_tags 1774 | / table_row_tag 1775 | / table_data_tags 1776 | / table_caption_tag 1777 | ) 1778 | 1779 | table_start_tag "table_start_tag" 1780 | = sc:(space / comment)* startPos:("" { return endOffset(); }) b:"{" p:pipe 1781 | // ok to normalize away stray |} on rt (see T59360) 1782 | & { return stops.push('table', false); } 1783 | ta:table_attributes 1784 | tsEndPos:("" { stops.pop('table'); return endOffset(); }) 1785 | { 1786 | var coms = tu.popComments(ta); 1787 | if (coms) { 1788 | tsEndPos = coms.commentStartPos; 1789 | } 1790 | 1791 | var da = { tsr: [startPos, tsEndPos] }; 1792 | if (p !== "|") { 1793 | // Variation from default 1794 | da.startTagSrc = b + p; 1795 | } 1796 | 1797 | sc.push(new TagTk('table', ta, da)); 1798 | if (coms) { 1799 | sc = sc.concat(coms.buf); 1800 | } 1801 | return sc; 1802 | } 1803 | 1804 | // FIXME: Not sure if we want to support it, but this should allow columns. 1805 | table_caption_tag 1806 | // avoid recursion via nested_block_in_table 1807 | = ! { return stops.onStack('tableDataBlock'); } 1808 | p:pipe "+" 1809 | args:row_syntax_table_args? 
1810 | tagEndPos:("" { return endOffset(); }) 1811 | c:nested_block_in_table* { 1812 | return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true); 1813 | } 1814 | 1815 | table_row_tag 1816 | = // avoid recursion via nested_block_in_table 1817 | ! { return stops.onStack('tableDataBlock'); } 1818 | p:pipe dashes:$"-"+ 1819 | & { return stops.push('table', false); } 1820 | a:table_attributes 1821 | tagEndPos:("" { stops.pop('table'); return endOffset(); }) 1822 | { 1823 | var coms = tu.popComments(a); 1824 | if (coms) { 1825 | tagEndPos = coms.commentStartPos; 1826 | } 1827 | 1828 | var da = { 1829 | tsr: [ startOffset(), tagEndPos ], 1830 | startTagSrc: p + dashes, 1831 | }; 1832 | 1833 | // We rely on our tree builder to close the row as needed. This is 1834 | // needed to support building tables from fragment templates with 1835 | // individual cells or rows. 1836 | var trToken = new TagTk('tr', a, da); 1837 | 1838 | var res = [ trToken ]; 1839 | if (coms) { 1840 | res = res.concat(coms.buf); 1841 | } 1842 | return res; 1843 | } 1844 | 1845 | tds 1846 | = ( pp:( pipe_pipe / p:pipe & row_syntax_table_args { return p; } ) 1847 | tdt:table_data_tag { 1848 | var da = tdt[0].dataAttribs; 1849 | da.stx = "row"; 1850 | da.tsr[0] -= pp.length; // include "||" 1851 | if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) { 1852 | // Variation from default 1853 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1854 | } 1855 | return tdt; 1856 | } 1857 | )* 1858 | 1859 | table_data_tags 1860 | // avoid recursion via nested_block_in_table 1861 | = ! 
{ return stops.onStack('tableDataBlock'); } 1862 | p:pipe 1863 | ![+-] td:table_data_tag 1864 | tagEndPos:("" { return endOffset(); }) 1865 | tds:tds { 1866 | var da = td[0].dataAttribs; 1867 | da.tsr[0] -= p.length; // include "|" 1868 | if (p !== "|") { 1869 | // Variation from default 1870 | da.startTagSrc = p; 1871 | } 1872 | return td.concat(tds); 1873 | } 1874 | 1875 | table_data_tag 1876 | = ! "}" 1877 | arg:row_syntax_table_args? 1878 | // use inline_breaks to break on tr etc 1879 | tagEndPos:("" { return endOffset(); }) 1880 | td:nested_block_in_table* 1881 | { 1882 | return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); 1883 | } 1884 | 1885 | table_heading_tags 1886 | = "!" 1887 | & { return stops.push('th', endOffset()); } 1888 | th:table_heading_tag 1889 | ths:( pp:("!!" / pipe_pipe) tht:table_heading_tag { 1890 | var da = tht[0].dataAttribs; 1891 | da.stx = 'row'; 1892 | da.tsr[0] -= pp.length; // include "!!" or "||" 1893 | 1894 | if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) { 1895 | // Variation from default 1896 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1897 | } 1898 | return tht; 1899 | } 1900 | )* { 1901 | stops.pop('th'); 1902 | th[0].dataAttribs.tsr[0]--; // include "!" 1903 | return th.concat(ths); 1904 | } 1905 | / & { return stops.onStack('th') !== false ? stops.pop('th') : false; } 1906 | 1907 | table_heading_tag 1908 | = arg:row_syntax_table_args? 1909 | tagEndPos:("" { return endOffset(); }) 1910 | c:( & { 1911 | // This SyntaxStop is only true until we hit the end of the line. 1912 | if (stops.onStack('th') !== false && 1913 | /\n/.test(input.substring(stops.onStack('th'), endOffset()))) { 1914 | // There's been a newline. Remove the break and continue 1915 | // tokenizing nested_block_in_tables. 
1916 | stops.pop('th'); 1917 | } 1918 | return true; 1919 | } d:nested_block_in_table { return d; } )* { 1920 | return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); 1921 | } 1922 | 1923 | table_end_tag 1924 | = sc:(space / comment)* startPos:("" { return endOffset(); }) p:pipe b:"}" { 1925 | var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] }); 1926 | if (p !== "|") { 1927 | // p+"" is triggering some bug in pegJS 1928 | // I cannot even use that expression in the comment! 1929 | tblEnd.dataAttribs.endTagSrc = p + b; 1930 | } 1931 | return sc.concat([tblEnd]); 1932 | } 1933 | 1934 | /** 1935 | * Table parameters separated from the content by a single pipe. Does *not* 1936 | * match if followed by double pipe (row-based syntax). 1937 | */ 1938 | row_syntax_table_args 1939 | = & { return stops.push('tableCellArg', true); } 1940 | as:table_attributes s:space* p:pipe !pipe { 1941 | stops.pop('tableCellArg'); 1942 | return [as, s, p]; 1943 | } 1944 | / & { return stops.pop('tableCellArg'); } 1945 | 1946 | 1947 | /******************************************************************* 1948 | * Text variants and other general rules 1949 | *******************************************************************/ 1950 | 1951 | /* All chars that cannot start syntactic structures in the middle of a line 1952 | * XXX: ] and other end delimiters should probably only be activated inside 1953 | * structures to avoid unnecessarily leaving the text rule on plain 1954 | * content. 1955 | * 1956 | * TODO: Much of this is should really be context-dependent (syntactic 1957 | * flags). The wikilink_preprocessor_text rule is an example where 1958 | * text_char is not quite right and had to be augmented. Try to minimize / 1959 | * clarify this carefully! 
1960 | */ 1961 | 1962 | text_char = [^-'<~[{\n\r:;\]}|!=] 1963 | 1964 | /* Legend 1965 | * ' quotes (italic/bold) 1966 | * < start of xmlish_tag 1967 | * ~ signatures/dates 1968 | * [ start of links 1969 | * { start of parser functions, transclusion and template args 1970 | * \n all sort of block-level markup at start of line 1971 | * \r ditto 1972 | * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC) 1973 | * 1974 | * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) 1975 | * ! and | table cell delimiters, might be better to specialize those 1976 | * = headings - also specialize those! 1977 | * 1978 | * The following chars are also included for now, but only apply in some 1979 | * contexts and should probably be enabled only in those: 1980 | * : separate definition in ; term : definition 1981 | * ] end of link 1982 | * } end of parser func/transclusion/template arg 1983 | * - start of lang_variant -{ ... }- 1984 | * ; separator in lang_variant 1985 | */ 1986 | 1987 | urltext = ( $[^-'<~[{\n/A-Za-z_|!:;\]} &=]+ 1988 | / & [/A-Za-z] al:autolink { return al; } 1989 | / & "&" he:htmlentity { return he; } 1990 | // Convert trailing space into   1991 | // XXX: This should be moved to a serializer 1992 | // This is a hack to force a whitespace display before the colon 1993 | / ' ' & ':' { 1994 | var toks = Util.placeholder('\u00a0', { 1995 | src: ' ', 1996 | tsr: tsrOffsets('start'), 1997 | isDisplayHack: true, 1998 | }, { tsr: tsrOffsets('end'), isDisplayHack: true }); 1999 | var typeOf = toks[0].getAttribute('typeof'); 2000 | toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf); 2001 | return toks; 2002 | } 2003 | / & ('__') bs:behavior_switch { return bs; } 2004 | // About 96% of text_char calls originate here. 2005 | // pegjs 0.8 inlines this simple rule automatically. 
2006 | / text_char )+ 2007 | 2008 | raw_htmlentity = m:$("&" [#0-9a-zA-Z]+ ";") { 2009 | return Util.decodeEntities(m); 2010 | } 2011 | 2012 | htmlentity = cc:raw_htmlentity { 2013 | // if this is an invalid entity, don't tag it with 'mw:Entity' 2014 | if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters */) { 2015 | return cc; 2016 | } 2017 | return [ 2018 | new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }), 2019 | cc, 2020 | new EndTagTk('span', [], { tsr: tsrOffsets('end') }), 2021 | ]; 2022 | } 2023 | 2024 | spaces 2025 | = $[ \t]+ 2026 | 2027 | space = [ \t] 2028 | 2029 | optionalSpaceToken 2030 | = s:$space* { 2031 | if (s.length) { 2032 | return [s]; 2033 | } else { 2034 | return []; 2035 | } 2036 | } 2037 | 2038 | /* This rule corresponds to \s in the PHP preg_* functions, 2039 | * which is used frequently in the PHP parser. The inclusion of 2040 | * form feed (but not other whitespace, like vertical tab) is a quirk 2041 | * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular 2042 | * Expressions) library. 2043 | */ 2044 | space_or_newline 2045 | = [ \t\n\r\x0c] 2046 | 2047 | /* This rule corresponds to \b in the PHP preg_* functions, 2048 | * after a word character. That is, it's a zero-width lookahead that 2049 | * the next character is not a word character. 2050 | */ 2051 | end_of_word 2052 | = eof / ![A-Za-z0-9_] 2053 | 2054 | // Unicode "separator, space" category. It covers the \u0020 space as well 2055 | // as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}. 2056 | // Keep this up-to-date with the characters tagged ;Zs; in 2057 | // http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 2058 | unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000] 2059 | 2060 | // Non-newline whitespace, including non-breaking spaces. Used for magic links. 
2061 | space_or_nbsp 2062 | = space // includes \t 2063 | / unispace 2064 | / he:htmlentity &{ return Array.isArray(he) && /^\u00A0$/.test(he[1]); } 2065 | { return he; } 2066 | 2067 | // Used within ISBN magic links 2068 | space_or_nbsp_or_dash 2069 | = space_or_nbsp / "-" 2070 | 2071 | // Extra newlines followed by at least another newline. Usually used to 2072 | // compress surplus newlines into a meta tag, so that they don't trigger 2073 | // paragraphs. 2074 | optionalNewlines 2075 | = spc:$([\n\r\t ] &[\n\r])* { 2076 | if (spc.length) { 2077 | return [spc]; 2078 | } else { 2079 | return []; 2080 | } 2081 | } 2082 | 2083 | comment_or_includes = (comment / ( 2084 | ( & { return stops.push("sol_il", true); } 2085 | i:include_limits 2086 | & { stops.pop("sol_il"); return true; } 2087 | ) { return i; } 2088 | / & { return stops.pop("sol_il"); } 2089 | ))* 2090 | 2091 | sol = (empty_line_with_comments / sol_prefix) comment_or_includes 2092 | 2093 | sol_prefix 2094 | = newlineToken 2095 | / & { 2096 | // Use the sol flag only at the start of the input 2097 | // NOTE: Explicitly check for 'false' and not a falsy value 2098 | return endOffset() === 0 && options.sol !== false; 2099 | } { return []; } 2100 | 2101 | empty_line_with_comments 2102 | = sp:sol_prefix p:("" { return endOffset(); }) c:(space* comment (space / comment)* newline)+ { 2103 | return [ 2104 | sp, 2105 | new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], { 2106 | tokens: tu.flattenIfArray(c), 2107 | tsr: [p, endOffset()], 2108 | }), 2109 | ]; 2110 | } 2111 | 2112 | comment_space = comment / space 2113 | 2114 | nl_comment_space = newlineToken / comment_space 2115 | 2116 | /** 2117 | * noinclude / includeonly / onlyinclude rules. These are normally 2118 | * handled by the xmlish_tag rule, except where generic tags are not 2119 | * allowed- for example in directives, which are allowed in various attribute 2120 | * names and -values. 
2121 | * 2122 | * Example test case: 2123 | * {| 2124 | * |- 2125 | * foo 2126 | * 2127 | * |Hello 2128 | * |} 2129 | */ 2130 | 2131 | include_limits = 2132 | il:("<" c:"/"? name:$(n:$[oyinclude]i+ & { 2133 | var incl = n.toLowerCase(); 2134 | return incl === "noinclude" || incl === "onlyinclude" || 2135 | incl === "includeonly"; 2136 | }) space_or_newline* ">" { 2137 | var incl = name.toLowerCase(); 2138 | var dp = { tsr: tsrOffsets() }; 2139 | 2140 | // Record variant since tag is not in normalized lower case 2141 | if (name !== incl) { 2142 | dp.srcTagName = name; 2143 | } 2144 | 2145 | // End tag only 2146 | if (c) { 2147 | return new EndTagTk(name, [], dp); 2148 | } 2149 | 2150 | var restOfInput = input.substring(endOffset()); 2151 | var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:)", "m")); 2152 | 2153 | // Start tag only 2154 | if (!tagContent || !tagContent[1]) { 2155 | return new TagTk(name, [], dp); 2156 | } 2157 | 2158 | // Get the content 2159 | var inclContent = tagContent[1]; 2160 | 2161 | // Preserve SOL where necessary (for onlyinclude and noinclude) 2162 | // Note that this only works because we encounter <*include*> tags in 2163 | // the toplevel content and we rely on the php preprocessor to expand 2164 | // templates, so we shouldn't ever be tokenizing inInclude. 
2165 | // Last line should be empty (except for comments) 2166 | if (incl !== "includeonly" && stops.onStack("sol_il")) { 2167 | var last = lastItem(inclContent.split('\n')); 2168 | if (!/^()*$/.test(last)) { 2169 | return false; 2170 | } 2171 | } 2172 | 2173 | // Tokenize include content in a new tokenizer 2174 | var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent); 2175 | inclContentToks = Util.stripEOFTkfromTokens(inclContentToks); 2176 | 2177 | // Shift tsr 2178 | Util.shiftTokenTSR(inclContentToks, endOffset()); 2179 | 2180 | // Skip past content 2181 | peg$currPos += inclContent.length; 2182 | 2183 | return [new TagTk(name, [], dp)].concat(inclContentToks); 2184 | }) & { return !!il; } { return il; } 2185 | 2186 | // Start of file 2187 | sof = & { return endOffset() === 0 && !options.pipelineOffset; } 2188 | 2189 | // End of file 2190 | eof = & { return endOffset() === input.length; } 2191 | 2192 | newline = '\n' / '\r\n' 2193 | 2194 | newlineToken = newline { return [new NlTk(tsrOffsets())]; } 2195 | 2196 | eolf = newline / eof 2197 | 2198 | comment_space_eolf = (space+ / comment)* eolf 2199 | 2200 | // 'Preprocessor' directive- higher-level things that can occur in otherwise 2201 | // plain-text content. 2202 | directive 2203 | = comment 2204 | / extension_tag 2205 | / tplarg_or_template 2206 | / & "-{" v:lang_variant_or_tpl { return v; } 2207 | / & "&" e:htmlentity { return e; } 2208 | / include_limits 2209 | 2210 | wikilink_preprocessor_text 2211 | = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+ 2212 | // XXX gwicke: any more chars we need to allow here? 2213 | / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) 2214 | { return wr; } 2215 | )+ { 2216 | return tu.flattenStringlist(r); 2217 | } 2218 | 2219 | extlink_preprocessor_text 2220 | // added special separator character class inline: separates url from 2221 | // description / text 2222 | = & { 2223 | // Prevent breaking on pipes when we're in a link description. 
2224 | // See the test, 'Images with the "|" character in the comment'. 2225 | return stops.push('linkdesc', false); 2226 | } 2227 | r:( $[^'<~[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ 2228 | / !inline_breaks s:( directive / no_punctuation_char / [&|{\-] ) { return s; } 2229 | /// urlencoded_char 2230 | // !inline_breaks no_punctuation_char 2231 | / $([.:,] !(space / eolf)) 2232 | / $(['] ![']) // single quotes are ok, double quotes are bad 2233 | )+ { 2234 | stops.pop('linkdesc'); 2235 | return tu.flattenString(r); 2236 | } 2237 | / & { return stops.pop('linkdesc'); } 2238 | 2239 | // Attribute values with preprocessor support 2240 | 2241 | // n.b. / is a permissible char in the three rules below. 2242 | // We only break on />, enforced by the negated expression. 2243 | // Hence, it isn't included in the stop set. 2244 | 2245 | // The stop set is space_or_newline and > which matches generic_att_value. 2246 | attribute_preprocessor_text 2247 | = r:( $[^{}&<\-|/ \t\n\r\x0c>]+ 2248 | / !inline_breaks 2249 | !'/>' 2250 | s:( directive / [{}&<\-|/] ) { return s; } 2251 | )+ { 2252 | return tu.flattenString(r); 2253 | } 2254 | 2255 | // The stop set is '> which matches generic_att_value. 2256 | attribute_preprocessor_text_single 2257 | = r:( $[^{}&<\-|/'>]+ 2258 | / !inline_breaks 2259 | !'/>' 2260 | s:( directive / [{}&<\-|/] ) { return s; } 2261 | )* { 2262 | return tu.flattenString(r); 2263 | } 2264 | 2265 | // The stop set is "> which matches generic_att_value. 2266 | attribute_preprocessor_text_double 2267 | = r:( $[^{}&<\-|/">]+ 2268 | / !inline_breaks 2269 | !'/>' 2270 | s:( directive / [{}&<\-|/] ) { return s; } 2271 | )* { 2272 | return tu.flattenString(r); 2273 | } 2274 | 2275 | // Variants with the entire attribute on a single line 2276 | 2277 | // n.b. ! is a permissible char in the three rules below. 2278 | // We only break on !! in th, enforced by the inline break. 2279 | // Hence, it isn't included in the stop set. 
2280 | // [ is also permissible but we give a chance to break 2281 | // for the [[ special case in php's doTableStuff (See T2553). 2282 | 2283 | // The stop set is space_or_newline and | which matches table_att_value. 2284 | table_attribute_preprocessor_text 2285 | = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+ 2286 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2287 | )+ { 2288 | return tu.flattenString(r); 2289 | } 2290 | 2291 | // The stop set is '\r\n| which matches table_att_value. 2292 | table_attribute_preprocessor_text_single 2293 | = r:( $[^{}&<\-!\['\r\n|]+ 2294 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2295 | )* { 2296 | return tu.flattenString(r); 2297 | } 2298 | 2299 | // The stop set is "\r\n| which matches table_att_value. 2300 | table_attribute_preprocessor_text_double 2301 | = r:( $[^{}&<\-!\["\r\n|]+ 2302 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2303 | )* { 2304 | return tu.flattenString(r); 2305 | } 2306 | 2307 | // Special-case support for those pipe templates 2308 | pipe = "|" / "{{!}}" 2309 | 2310 | // SSS FIXME: what about |{{!}} and {{!}}| 2311 | pipe_pipe = "||" / "{{!}}{{!}}" 2312 | -------------------------------------------------------------------------------- /wikitext/rules_test.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "path" 5 | "testing" 6 | ) 7 | 8 | func TestRules(t *testing.T) { 9 | cases := []struct { 10 | rule string 11 | input string 12 | match string 13 | }{ 14 | { 15 | "wikilink_preprocessor_text", 16 | "asdf", 17 | "asdf", 18 | }, 19 | { 20 | "wikilink_preprocessor_text", 21 | "asdf|asdf", 22 | "asdf", 23 | }, 24 | { 25 | "wikilink_preproc", 26 | "[[asdf]]", 27 | `asdf`, 28 | }, 29 | { 30 | "wikilink_preproc", 31 | "[[a|b]]", 32 | `b`, 33 | }, 34 | { 35 | "template", 36 | "{{reflink}}", 37 | "", 38 | }, 39 | { 40 | "block_lines", 41 | "* foo", 42 | "
  • foo
  • ", 43 | }, 44 | { 45 | "heading", 46 | "== Foos ==", 47 | "

    Foos

    ", 48 | }, 49 | { 50 | "inlineline", 51 | "Foo's", 52 | "Foo's", 53 | }, 54 | { 55 | "heading", 56 | "== Foo's ==", 57 | "

    Foo's

    ", 58 | }, 59 | { 60 | "extlink", 61 | "[http://example.com/ Yes Foo Bar]", 62 | `Yes Foo Bar`, 63 | }, 64 | { 65 | "xmlish_tag", 66 | "
    foo
    ", 67 | `
    `, 68 | }, 69 | { 70 | "xmlish_tag", 71 | "", 72 | `
    `, 73 | }, 74 | { 75 | "xmlish_tag", 76 | "
    ", 77 | "
    ", 78 | }, 79 | { 80 | "xmlish_tag", 81 | `
    `, 82 | `
    `, 83 | }, 84 | } 85 | 86 | for _, c := range cases { 87 | c := c 88 | t.Run(path.Join(c.rule, c.input), func(t *testing.T) { 89 | val, err := Parse( 90 | "file", 91 | []byte(c.input), 92 | GlobalStore("text", []byte(c.input)), 93 | GlobalStore("len", len(c.input)), 94 | Entrypoint(c.rule), 95 | Recover(false), 96 | ) 97 | if err != nil { 98 | t.Error(err) 99 | } 100 | text := concat(val) 101 | if c.match != text { 102 | t.Errorf("got %q; expected %q", text, c.match) 103 | } 104 | }) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /wikitext/tokens.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | ) 6 | 7 | func hasAttr(n *html.Node, key string) bool { 8 | for _, attr := range n.Attr { 9 | if attr.Key == key { 10 | return true 11 | } 12 | } 13 | return false 14 | } 15 | 16 | func removeAttr(n *html.Node, key string) { 17 | var attrs []html.Attribute 18 | for _, attr := range n.Attr { 19 | if attr.Key == key { 20 | continue 21 | } 22 | attrs = append(attrs, attr) 23 | } 24 | n.Attr = attrs 25 | } 26 | 27 | func processTokens(n *html.Node) []*html.Node { 28 | for child := n.FirstChild; child != nil; child = child.NextSibling { 29 | if hasAttr(child, "_parsestart") { 30 | removeAttr(child, "_parsestart") 31 | remaining := removeSiblingsAfter(child) 32 | //log.Printf("children: %q, %s", child.Data, spew.Sdump(remaining)) 33 | addChildren(child, remaining) 34 | } else if hasAttr(child, "_parseend") { 35 | remaining := removeSiblingsAfter(child) 36 | child.Parent.RemoveChild(child) 37 | return remaining 38 | } 39 | addChildren(child.Parent, processTokens(child)) 40 | } 41 | return nil 42 | } 43 | 44 | func removeSiblingsAfter(n *html.Node) []*html.Node { 45 | var children []*html.Node 46 | for child := n.NextSibling; child != nil; child = child.NextSibling { 47 | children = append(children, child) 48 | } 49 | 
parent := n.Parent 50 | for _, child := range children { 51 | parent.RemoveChild(child) 52 | } 53 | return children 54 | } 55 | 56 | func addChildren(n *html.Node, children []*html.Node) { 57 | for _, child := range children { 58 | n.AppendChild(child) 59 | } 60 | } 61 | 62 | func numChildren(n *html.Node) int { 63 | count := 0 64 | for child := n.FirstChild; child != nil; child = child.NextSibling { 65 | count++ 66 | } 67 | return count 68 | } 69 | -------------------------------------------------------------------------------- /wikitext/tokens_test.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "testing" 8 | 9 | "golang.org/x/net/html" 10 | ) 11 | 12 | func TestProcessTokens(t *testing.T) { 13 | cases := []struct { 14 | in, want string 15 | }{ 16 | { 17 | "", "", 18 | }, 19 | { 20 | "
    ", 21 | "
    ", 22 | }, 23 | { 24 | "

    Foo

    ", 25 | `

    Foo

    `, 26 | }, 27 | { 28 | `
    Foo
    asdf

    Blah

    Bar
    `, 29 | `
    Foo
    asdf

    Blah

    Bar
    `, 30 | }, 31 | } 32 | 33 | for _, c := range cases { 34 | t.Run(c.in, func(t *testing.T) { 35 | doc, err := html.Parse(strings.NewReader(c.in)) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | 40 | //t.Log(concat(doc)) 41 | 42 | if remaining := processTokens(doc); len(remaining) > 0 { 43 | t.Errorf("got %d extra children", len(remaining)) 44 | } 45 | var buf bytes.Buffer 46 | if err := html.Render(&buf, doc); err != nil { 47 | t.Fatal(err) 48 | } 49 | want := fmt.Sprintf("%s", c.want) 50 | out := buf.String() 51 | if out != want { 52 | t.Errorf("expected %q;\ngot %q", want, out) 53 | } 54 | }) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /wikitext/url.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import "strings" 4 | 5 | func URLToTitle(u string) string { 6 | return strings.Replace(u, "_", " ", -1) 7 | } 8 | 9 | func TitleToURL(u string) string { 10 | return "./" + strings.Replace(u, " ", "_", -1) 11 | } 12 | -------------------------------------------------------------------------------- /wikitext/wikitext.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "log" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/microcosm-cc/bluemonday" 12 | "github.com/pkg/errors" 13 | "golang.org/x/net/html" 14 | ) 15 | 16 | //go:generate pigeon -o wikitext.peg.go wikitext.peg 17 | 18 | // Convert converts wikitext to HTML. 
19 | func Convert(text []byte, options ...ConvertOption) ([]byte, error) { 20 | var opts opts 21 | for _, opt := range options { 22 | opt(&opts) 23 | } 24 | v, err := Parse( 25 | "file.wikitext", 26 | append(text, '\n'), 27 | GlobalStore("len", len(text)), 28 | GlobalStore("text", text), 29 | GlobalStore("opts", opts), 30 | //Memoize(true), 31 | Recover(false), 32 | //Debug(true), 33 | ) 34 | if err != nil { 35 | return nil, err 36 | } 37 | 38 | //spew.Dump(v) 39 | 40 | var doc *html.Node 41 | 42 | for doc == nil && v != nil { 43 | switch val := v.(type) { 44 | case *html.Node: 45 | doc = val 46 | case debugRun: 47 | v = val.Value 48 | } 49 | } 50 | 51 | if doc == nil { 52 | return nil, errors.Errorf("expected *html.Node got: %T", v) 53 | } 54 | 55 | //log.Printf("Token doc: %q", concat(doc)) 56 | 57 | remaining := processTokens(doc) 58 | if opts.strict && len(remaining) > 0 { 59 | return nil, errors.Errorf("got %d extra children: doc %q, children %q", len(remaining), concat(doc), concat(remaining)) 60 | } 61 | addChildren(doc, remaining) 62 | 63 | var buf bytes.Buffer 64 | if err := html.Render(&buf, doc); err != nil { 65 | return nil, err 66 | } 67 | 68 | body := buf.Bytes() 69 | body = wikitextPolicy().SanitizeBytes(body) 70 | body = bytes.TrimSpace(body) 71 | return body, nil 72 | } 73 | 74 | func wikitextPolicy() *bluemonday.Policy { 75 | policy := bluemonday.UGCPolicy() 76 | 77 | policy.AllowNoAttrs().OnElements("ref") 78 | 79 | policy.RequireNoFollowOnLinks(false) 80 | policy.RequireNoFollowOnFullyQualifiedLinks(true) 81 | policy.AllowStyling() 82 | policy.AllowAttrs("id", "name", "style").Globally() 83 | policy.AllowAttrs("_parsestart", "_parseend", "_parsetoken").Globally() 84 | 85 | return policy 86 | } 87 | 88 | type Attribute struct { 89 | Key, Val interface{} 90 | } 91 | 92 | func (a Attribute) String() string { 93 | if a.Val == nil { 94 | return concat(a.Key) 95 | } 96 | return fmt.Sprintf("%s=%s", concat(a.Key), concat(a.Val)) 97 | } 98 | 99 | type 
opts struct { 100 | templateHandler func(name string, attrs []Attribute) (interface{}, error) 101 | strict bool 102 | } 103 | 104 | type ConvertOption func(opts *opts) 105 | 106 | // TemplateHandler sets the function that runs when a template is found. The 107 | // return value is included in the final document. Either *html.Node or string 108 | // values may be returned. String values will be inserted as escaped text. 109 | func TemplateHandler(f func(name string, attrs []Attribute) (interface{}, error)) ConvertOption { 110 | return func(opts *opts) { 111 | opts.templateHandler = f 112 | } 113 | } 114 | 115 | func strict() ConvertOption { 116 | return func(opts *opts) { 117 | opts.strict = true 118 | } 119 | } 120 | 121 | func flatten(fields ...interface{}) []interface{} { 122 | var out []interface{} 123 | for _, f := range fields { 124 | if f == nil { 125 | continue 126 | } 127 | 128 | switch f := f.(type) { 129 | case []interface{}: 130 | out = append(out, flatten(f...)...) 131 | case []*html.Node: 132 | for _, n := range f { 133 | out = append(out, n) 134 | } 135 | 136 | default: 137 | out = append(out, f) 138 | } 139 | } 140 | return out 141 | } 142 | 143 | func Concat(fields ...interface{}) string { 144 | return concat(fields...) 145 | } 146 | 147 | func concat(fields ...interface{}) string { 148 | var b strings.Builder 149 | for _, f := range flatten(fields...) 
{ 150 | if f == nil { 151 | continue 152 | } 153 | 154 | switch f := f.(type) { 155 | case int: 156 | b.WriteString(strconv.Itoa(f)) 157 | 158 | case string: 159 | b.WriteString(f) 160 | 161 | case []byte: 162 | b.Write(f) 163 | 164 | case *html.Node: 165 | var buf bytes.Buffer 166 | if err := html.Render(&buf, f); err != nil { 167 | panic(err) 168 | } 169 | b.Write(buf.Bytes()) 170 | 171 | case debugRun: 172 | b.WriteString(concat(f.Value)) 173 | 174 | case Attribute: 175 | b.WriteString(f.String()) 176 | 177 | default: 178 | panic(errors.Errorf("concat: unsupported f type %T: %+v", f, f)) 179 | } 180 | } 181 | return b.String() 182 | } 183 | 184 | func addChild(n *html.Node, children interface{}) bool { 185 | if children == nil { 186 | return false 187 | } 188 | 189 | switch children := children.(type) { 190 | case []interface{}: 191 | added := false 192 | for _, c := range children { 193 | if addChild(n, c) { 194 | added = true 195 | } 196 | } 197 | return added 198 | 199 | case *html.Node: 200 | n.AppendChild(children) 201 | return true 202 | 203 | case []byte: 204 | return addChild(n, string(children)) 205 | 206 | case string: 207 | return addChild(n, &html.Node{ 208 | Type: html.TextNode, 209 | Data: children, 210 | }) 211 | 212 | default: 213 | log.Fatalf("unsupported children type %T: %#v", children, children) 214 | return false 215 | } 216 | } 217 | 218 | func inc(c *current, tag string) { 219 | v, _ := c.state[tag].(int) 220 | v++ 221 | c.state[tag] = v 222 | } 223 | 224 | func dec(c *current, tag string) { 225 | v, ok := c.state[tag].(int) 226 | if ok { 227 | v-- 228 | if v == 0 { 229 | delete(c.state, tag) 230 | } else { 231 | c.state[tag] = v 232 | } 233 | } 234 | } 235 | 236 | func count(c *current, tag string) int { 237 | v, _ := c.state[tag].(int) 238 | return v 239 | } 240 | 241 | type stack []interface{} 242 | 243 | func (s stack) Clone() interface{} { 244 | out := make(stack, len(s)) 245 | for k, v := range s { 246 | if c, ok := v.(Cloner); ok { 
247 | out[k] = c.Clone() 248 | } else { 249 | out[k] = v 250 | } 251 | } 252 | return out 253 | } 254 | 255 | var _ Cloner = stack{} 256 | 257 | func push(c *current, tag string, val interface{}) int { 258 | v, _ := c.state[tag].(stack) 259 | v = append(v, val) 260 | c.state[tag] = v 261 | return len(v) - 1 262 | } 263 | 264 | func pop(c *current, tag string) interface{} { 265 | v, _ := c.state[tag].(stack) 266 | if len(v) == 0 { 267 | return nil 268 | } 269 | val := v[len(v)-1] 270 | if len(v) == 1 { 271 | delete(c.state, tag) 272 | } else { 273 | c.state[tag] = v[:len(v)-1] 274 | } 275 | return val 276 | } 277 | 278 | func popTo(c *current, tag string, n int) { 279 | v, _ := c.state[tag].(stack) 280 | if len(v) > n { 281 | if n == 0 { 282 | delete(c.state, tag) 283 | } else { 284 | c.state[tag] = v[:n] 285 | } 286 | } 287 | } 288 | 289 | func peek(c *current, tag string) interface{} { 290 | v, _ := c.state[tag].(stack) 291 | if len(v) == 0 { 292 | return nil 293 | } 294 | return v[len(v)-1] 295 | } 296 | 297 | var inlineBreaksRegexp = regexp.MustCompile(`[=|!{}:;\r\n[\]<\-]`) 298 | 299 | func match(pattern string, input []byte) bool { 300 | match, err := regexp.Match(pattern, input) 301 | if err != nil { 302 | panic(err) 303 | } 304 | return match 305 | } 306 | 307 | func inlineBreaks(c *current) (bool, error) { 308 | pos := c.pos.offset + len(c.text) 309 | //log.Printf("inlineBreaks %s, %q, pos %d", c.pos, c.text, pos) 310 | input := c.globalStore["text"].([]byte) 311 | if len(input) <= pos { 312 | log.Printf("inlinebreak false") 313 | return false, nil 314 | } 315 | ch := input[pos] 316 | if !inlineBreaksRegexp.Match([]byte{ch}) { 317 | //log.Printf("inlinebreak match fail: %s", []byte{ch}) 318 | return false, nil 319 | } 320 | 321 | switch ch { 322 | case '=': 323 | if arrow, _ := peek(c, "arrow").(bool); arrow && input[pos+1] == '>' { 324 | return true, nil 325 | } 326 | equal, _ := peek(c, "equal").(bool) 327 | return equal || (count(c, "h") > 0 && (pos == 
len(input)-1 || 328 | // possibly more equals followed by spaces or comments 329 | //TODO: use match(`^=*(?:[ \t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)`, input[pos+1:]))), nil 330 | match(`^=*(?:[ \t]|<\!--.*-->)*(?:[\r\n]|$)`, input[pos+1:]))), nil 331 | 332 | case '|': 333 | templateArg, _ := peek(c, "templateArg").(bool) 334 | extTag, _ := peek(c, "extTag").(bool) 335 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 336 | linkdesc, _ := peek(c, "linkdesc").(bool) 337 | table, _ := peek(c, "table").(bool) 338 | 339 | return (templateArg && 340 | !(extTag)) || 341 | tableCellArg || 342 | linkdesc || 343 | (table && (pos < len(input)-1 && 344 | match(`[}|]`, []byte{input[pos+1]}))), nil 345 | 346 | case '!': 347 | th, _ := peek(c, "th").(bool) 348 | return th && 349 | count(c, "templatedepth") == 0 && 350 | input[pos+1] == '!', nil 351 | 352 | case '{': 353 | // {{!}} pipe templates.. 354 | // FIXME: Presumably these should mix with and match | above. 355 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 356 | table, _ := peek(c, "table").(bool) 357 | return ((tableCellArg && string(input[pos:pos+5]) == "{{!}}") || 358 | (table && string(input[pos:pos+10]) == "{{!}}{{!}}")), nil 359 | 360 | case '}': 361 | preproc, _ := peek(c, "preproc").(string) 362 | //log.Printf("inlineBreaks: } %q %q", preproc, input[pos:pos+2]) 363 | return string(input[pos:pos+2]) == preproc, nil 364 | 365 | case ':': 366 | return count(c, "colon") > 0 && 367 | !peek(c, "extlink").(bool) && 368 | count(c, "templatedepth") == 0 && 369 | !peek(c, "linkdesc").(bool) && 370 | !(peek(c, "preproc").(string) == "}-"), nil 371 | 372 | case ';': 373 | semicolon, _ := peek(c, "semicolon").(bool) 374 | return semicolon, nil 375 | 376 | case '\r': 377 | table, _ := peek(c, "table").(bool) 378 | return table && match(`\r\n?\s*[!|]`, input[pos:]), nil 379 | 380 | case '\n': 381 | // The code below is just a manual / efficient 382 | // version of this check. 
383 | // 384 | // peek(c,'table') && /^\n\s*[!|]/.test(input.substr(pos)); 385 | // 386 | // It eliminates a substr on the string and eliminates 387 | // a potential perf problem since "\n" and the inline_breaks 388 | // test is common during tokenization. 389 | if table, _ := peek(c, "table").(bool); !table { 390 | return false, nil 391 | } 392 | 393 | // Allow leading whitespace in tables 394 | 395 | // Since we switched on 'c' which is input[pos], 396 | // we know that input[pos] is "\n". 397 | // So, the /^\n/ part of the regexp is already satisfied. 398 | // Look for /\s*[!|]/ below. 399 | n := len(input) 400 | for i := pos + 1; i < n; i++ { 401 | d := input[i] 402 | if match(`[!|]`, []byte{d}) { 403 | return true, nil 404 | } else if !match(`\s`, []byte{d}) { 405 | return false, nil 406 | } 407 | } 408 | return false, nil 409 | 410 | case '[': 411 | // This is a special case in php's doTableStuff, added in 412 | // response to T2553. If it encounters a `[[`, it bails on 413 | // parsing attributes and interprets it all as content. 
414 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 415 | return tableCellArg && string(input[pos:pos+2]) == "[[", nil 416 | 417 | case '-': 418 | // Same as above: a special case in doTableStuff, added 419 | // as part of T153140 420 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 421 | return tableCellArg && string(input[pos:pos+2]) == "-{", nil 422 | 423 | case ']': 424 | extlink, _ := peek(c, "extlink").(bool) 425 | if extlink { 426 | return true, nil 427 | } 428 | preproc, _ := peek(c, "preproc").(string) 429 | //log.Printf("inlineBreaks extlink:%#v, preproc:%#v", extlink, preproc) 430 | return string(input[pos:pos+2]) == preproc, nil 431 | 432 | case '<': 433 | return (count(c, "noinclude") > 0 && string(input[pos:pos+12]) == "") || 434 | (count(c, "includeonly") > 0 && string(input[pos:pos+14]) == "") || 435 | (count(c, "onlyinclude") > 0 && string(input[pos:pos+14]) == ""), nil 436 | default: 437 | return false, errors.Errorf("Unhandled case!") 438 | } 439 | } 440 | -------------------------------------------------------------------------------- /wikitext/wikitext.peg: -------------------------------------------------------------------------------- 1 | { 2 | package wikitext 3 | 4 | } 5 | 6 | /********************************************************* 7 | * The top-level rule 8 | *********************************************************/ 9 | 10 | start <- tlb:tlb* newlineToken* { 11 | n := &html.Node{ 12 | Type: html.DocumentNode, 13 | } 14 | addChild(n, tlb) 15 | if len(c.state) > 0 { 16 | panic(errors.Errorf("poluted state! %#v", c.state)) 17 | } 18 | return n, nil 19 | } 20 | 21 | /* 22 | * Redirects can only occur as the first thing in a document. See 23 | * WikitextContent::getRedirectTarget() 24 | */ 25 | redirect <- redirect_word 26 | space_or_newline* 27 | (":" space_or_newline*)? 
28 | wl:wikilink & { 29 | /* 30 | return wl.length === 1 && wl[0] && wl[0].constructor !== String; 31 | */ 32 | return false, nil 33 | } { 34 | /* 35 | var link = wl[0]; 36 | if (sp) { rw += sp; } 37 | if (c) { rw += c; } 38 | // Build a redirect token 39 | var redirect = new SelfclosingTagTk('mw:redirect', 40 | // Put 'href' into attributes so it gets template-expanded 41 | [Util.lookupKV(link.attribs, 'href')], 42 | { 43 | src: rw, 44 | tsr: tsrOffsets(), 45 | linkTk: link, 46 | }); 47 | return redirect; 48 | */ 49 | return "todo redirect", nil 50 | } 51 | 52 | // These rules are exposed as start rules. 53 | generic_newline_attributes <- generic_newline_attribute* 54 | 55 | table_attributes 56 | <- (table_attribute / optionalSpaceToken b:broken_table_attribute_name_char { 57 | return b, nil })* 58 | 59 | //The 'redirect' magic word. 60 | // The leading whitespace allowed is due to the PHP trim() function. 61 | 62 | redirect_word 63 | <- ([ \t\n\r]* 64 | (!space_or_newline ![:[] .)+ 65 | & {return false, nil /*return env.conf.wiki.getMagicWordMatcher('redirect').test(rw);*/ }) 66 | 67 | 68 | //# This rule exists to support tokenizing the document in chunks. 69 | //# The parser's streaming interface will stop tokenization after each iteration 70 | //# of the starred subexpression, and yield to the node.js event-loop to 71 | //# schedule other pending event handlers. 72 | //# 73 | start_async 74 | <- (tlb 75 | / newlineToken* &{ 76 | return false, nil 77 | /* 78 | if (endOffset() === input.length) { 79 | emitChunk([ new EOFTk() ]); 80 | } 81 | // terminate the loop 82 | return false; 83 | */ 84 | } 85 | )* 86 | 87 | 88 | // A document (start rule) is a sequence of toplevelblocks. Tokens are 89 | // emitted in chunks per toplevelblock to avoid buffering the full document. 90 | // 91 | tlb <- !eof b:block { 92 | return b, nil 93 | } 94 | 95 | 96 | // The actual contents of each block. 97 | // 98 | block 99 | // has to be first alternative; otherwise gets parsed as a
      100 | <- &sof redirect comment_or_includes block_line? {return "comment_or_includes", nil /*return [r].concat(cil, bl || []);*/ } 101 | / block_lines 102 | / & '<' rs:( cm:comment &eolf {return cm, nil /*return c;*/ } 103 | // avoid a paragraph if we know that the line starts with a block tag 104 | / block_tag 105 | ) {return rs, nil /*return rs;*/ } 106 | / paragraph 107 | // Inlineline includes generic tags; wrapped into paragraphs in token 108 | // transform and DOM postprocessor 109 | / inlineline 110 | / s:sol !inline_breaks {return s, nil /*return s;*/ } 111 | 112 | 113 | // A block nested in other constructs. Avoid eating end delimiters for other 114 | // constructs by checking against inline_breaks first. 115 | // 116 | nested_block <- !inline_breaks b:block {return b, nil /*return b;*/ } 117 | 118 | 119 | // The same, but suitable for use inside a table construct. 120 | // Doesn't match table_heading_tag, table_row_tag, table_data_tag, 121 | // table_caption tag, or table_end_tag, although it does allow 122 | // table_start_tag (for nested tables). 123 | // 124 | nested_block_in_table 125 | <- 126 | // avoid recursion via nested_block_in_table, as that can lead to stack 127 | // overflow in large tables 128 | // See https://phabricator.wikimedia.org/T59670 129 | #{ 130 | push(c, "tableDataBlock", true) 131 | return nil 132 | /* 133 | return stops.push('tableDataBlock', true); 134 | */ 135 | } 136 | // XXX: don't rely on a lame look-ahead like this; use syntax stops 137 | // instead, so that multi-line th content followed by a line prefixed with 138 | // a comment is also handled. Alternatively, implement a sol look-behind 139 | // assertion accepting spaces and comments. 140 | !(sol (space* sol)? space* (pipe / "!")) b:nested_block 141 | #{pop(c, "tableDataBlock"); return nil} 142 | { 143 | return b, nil 144 | /* 145 | stops.pop('tableDataBlock'); 146 | return b; 147 | */ 148 | } 149 | 150 | 151 | // Line-based block constructs. 
152 | // 153 | block_lines 154 | <- s:sol 155 | // eat an empty line before the block 156 | (s2:(os:optionalSpaceToken so:sol))? 157 | bl:block_line 158 | 159 | // Horizontal rules 160 | hr <- "----" "-"* 161 | // Check if a newline or content follows 162 | ( &sol "" {return nil, nil /*return undefined;*/ } / "" {return true, nil /*return true;*/ } ) { 163 | return &html.Node{ 164 | Type: html.ElementNode, 165 | Data: "hr", 166 | }, nil 167 | /* 168 | var dataAttribs = { 169 | tsr: tsrOffsets(), 170 | lineContent: lineContent, 171 | }; 172 | if (d.length > 0) { 173 | dataAttribs.extra_dashes = d.length; 174 | } 175 | return new SelfclosingTagTk('hr', [], dataAttribs); 176 | */ 177 | } 178 | 179 | 180 | // Block structures with start-of-line wiki syntax 181 | // 182 | block_line 183 | <- heading 184 | / list_item 185 | / hr 186 | / st: space_or_newline* 187 | r:( & [ <{}|!] tl:table_line {return tl, nil /*return tl;*/ } 188 | // tag-only lines should not trigger pre either 189 | / bts:(bt:block_tag stl:optionalSpaceToken {return concat(bt, stl), nil /*return bt.concat(stl);*/ })+ 190 | &eolf {return bts, nil /*return bts;*/ } 191 | ) {return concat(st, r), nil 192 | /* 193 | return st.concat(r); 194 | */ 195 | } 196 | 197 | 198 | // A paragraph. We don't emit 'p' tokens to avoid issues with template 199 | // transclusions,

      tags in the source and the like. Instead, we perform 200 | // some paragraph wrapping on the token stream and the DOM. 201 | // 202 | paragraph 203 | <- s1:sol s2:sol c1:inlineline { 204 | n := &html.Node{ 205 | Type: html.ElementNode, 206 | Data: "p", 207 | } 208 | addChild(n, c1) 209 | return n, nil 210 | } 211 | 212 | br <- optionalSpaceToken &newline { 213 | return &html.Node{ 214 | Type: html.ElementNode, 215 | Data: "br", 216 | }, nil 217 | /* 218 | return s.concat([ 219 | new SelfclosingTagTk('br', [], { tsr: tsrOffsets() }), 220 | ]); 221 | */ 222 | } 223 | 224 | inline_breaks <- & { return inlineBreaks(c) } 225 | 226 | inlineline 227 | <- ((r:urltext) 228 | / inlineline_element)+ 229 | 230 | inlineline_element 231 | <- !inline_breaks 232 | r:(inline_element / [^\r\n]) 233 | {return r, nil} 234 | 235 | inline_element 236 | <- & '<' r:( xmlish_tag 237 | / comment 238 | ) {return r, nil /*return r;*/ } 239 | / & '{' r:tplarg_or_template {return r, nil/* return r; */} 240 | / & "-{" r:lang_variant_or_tpl {return r, nil/* return r; */} 241 | // FIXME: The php parser's replaceInternalLinks2 splits on [[, resulting 242 | // in sequences with odd number of brackets parsing as text, and sequences 243 | // with even number of brackets having its innermost pair parse as a 244 | // wikilink. For now, we faithfully reproduce what's found there but 245 | // wikitext, the language, shouldn't be defined by odd tokenizing behaviour 246 | // in the php parser. Flagging this for a future cleanup. 247 | / ("[[" &'[')+ 248 | / & '[' r:( wikilink / extlink ) {return r, nil/* return r; */} 249 | / & "'" r:quote {return r, nil/* return r; */} 250 | 251 | // Headings */ 252 | 253 | heading <- & "=" // guard, to make sure '='+ will match. 254 | // XXX: Also check to end to avoid inline parsing? 
255 | r:( 256 | #{ inc(c, "h"); return nil /*return stops.inc('h');*/ } 257 | s:'='+ // moved in here to make s accessible to inner action 258 | ce:( 259 | (ill:(inlineline?)) 260 | '='+ {return ill, nil} 261 | )? 262 | & { 263 | return ce!=nil || len(concat(s)) > 2, nil 264 | /*return ce || s.length > 2;*/ 265 | } 266 | //("" {return nil, nil /*return endOffset();*/ }) 267 | spc:(spaces / comment)* 268 | &eolf 269 | #{dec(c, "h"); return nil} 270 | { 271 | n := &html.Node{ 272 | Type: html.ElementNode, 273 | Data: "h"+strconv.Itoa(len(concat(s))), 274 | } 275 | addChild(n, []interface{}{ce, spc}) 276 | return n, nil 277 | /* 278 | var c; 279 | var e; 280 | var level; 281 | stops.dec('h'); 282 | if (ce) { 283 | c = ce[0]; 284 | e = ce[1]; 285 | level = Math.min(s.length, e.length); 286 | } else { 287 | // split up equal signs into two equal parts, with at least 288 | // one character in the middle. 289 | level = Math.floor((s.length - 1) / 2); 290 | c = ['='.repeat(s.length - 2 * level)]; 291 | s = e = '='.repeat(level); 292 | } 293 | level = Math.min(6, level); 294 | // convert surplus equals into text 295 | if (s.length > level) { 296 | var extras1 = s.substr(0, s.length - level); 297 | if (c[0].constructor === String) { 298 | c[0] = extras1 + c[0]; 299 | } else { 300 | c.unshift(extras1); 301 | } 302 | } 303 | if (e.length > level) { 304 | var extras2 = e.substr(0, e.length - level); 305 | var lastElem = lastItem(c); 306 | if (lastElem.constructor === String) { 307 | c[c.length - 1] += extras2; 308 | } else { 309 | c.push(extras2); 310 | } 311 | } 312 | 313 | var tsr = tsrOffsets('start'); 314 | tsr[1] += level; 315 | return [ 316 | new TagTk('h' + level, [], { tsr: tsr }), 317 | ].concat(c, [ 318 | new EndTagTk('h' + level, [], { tsr: [endTPos - level, endTPos] }), 319 | spc, 320 | ]); 321 | */ 322 | } 323 | ) { 324 | return r, nil /*return r;*/ 325 | } 326 | 327 | 328 | // Comments */ 329 | 330 | // The php parser does a straight str.replace(/).)*-->/g, "") 
331 | // but, as always, things around here are a little more complicated. 332 | // 333 | // We accept the same comments, but because we emit them as HTML comments 334 | // instead of deleting them, we have to encode the data to ensure that 335 | // we always emit a valid HTML5 comment. See the encodeComment helper 336 | // for further details. 337 | 338 | comment 339 | <- "" .)* ("-->" / eof) { 340 | return &html.Node{ 341 | Type: html.CommentNode, 342 | Data: concat(c1), 343 | }, nil 344 | /* 345 | var data = DU.encodeComment(c); 346 | return [new CommentTk(data, { tsr: tsrOffsets() })]; 347 | */ 348 | } 349 | 350 | 351 | // Behavior switches. See: 352 | // https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches 353 | behavior_switch 354 | <- ("__" behavior_text "__") {return "behavior_text", nil 355 | /* 356 | if (env.conf.wiki.isMagicWord(bs)) { 357 | return [ 358 | new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ], 359 | { tsr: tsrOffsets(), src: bs, magicSrc: bs } 360 | ), 361 | ]; 362 | } else { 363 | return [ bs ]; 364 | } 365 | */ 366 | } 367 | 368 | // Instead of defining a charset, php's doDoubleUnderscore concats a regexp of 369 | // all the language specific aliases of the behavior switches and then does a 370 | // match and replace. Just be as permissive as possible and let the 371 | // BehaviorSwitchPreprocessor back out of any overreach. 372 | behavior_text <- ( !"__" [^'"<~[{\n\r:;\]}|!=] )+ 373 | 374 | 375 | // ************************************************************ 376 | // External (bracketed and autolinked) links 377 | // ************************************************************/ 378 | 379 | autolink 380 | <- ! { 381 | extlink, _ := peek(c, "extlink").(bool) 382 | return extlink, nil 383 | /*return stops.onStack('extlink');*/ 384 | } 385 | // this must be a word boundary, so previous character must be non-word 386 | ! 
{return true, nil /*return /\w/.test(input[endOffset() - 1] || '');*/ } 387 | r:( 388 | // urllink, inlined 389 | target:autourl { 390 | return target, nil 391 | /* 392 | var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })]; 393 | return res; 394 | */ 395 | } 396 | / autoref 397 | / isbn) {return r, nil /*return r;*/ } 398 | 399 | extlink 400 | <- ! { 401 | extlink, _ := peek(c, "extlink").(bool) 402 | return extlink, nil 403 | /* return stops.onStack('extlink'); */ 404 | } // extlink cannot be nested 405 | "[" 406 | # {push(c, "extlink", true); return nil /*return stops.push('extlink', true);*/ } 407 | addr:(url_protocol urladdr / "") 408 | target:(extlink_preprocessor_text / "") 409 | & { 410 | // TODO: smarter check 411 | return true, nil 412 | /* 413 | // Protocol must be valid and there ought to be at least one 414 | // post-protocol character. So strip last char off target 415 | // before testing protocol. 416 | var flat = tu.flattenString([addr, target]); 417 | if (Array.isArray(flat)) { 418 | // There are templates present, alas. 419 | return flat.length > 0; 420 | } 421 | return Util.isProtocolValid(flat.slice(0, -1), env); 422 | */ 423 | } 424 | ( space / unispace )* 425 | //( "" {return nil, nil /*return endOffset();*/ }) 426 | content:inlineline? 
427 | "]" 428 | #{ pop(c, "extlink"); return nil } 429 | { 430 | n := &html.Node{ 431 | Type: html.ElementNode, 432 | Data: "a", 433 | Attr: []html.Attribute{ 434 | {Key: "href", Val: concat(addr, target)}, 435 | {Key: "class", Val: "external"}, 436 | {Key: "rel", Val: "nofollow"}, 437 | }, 438 | } 439 | addChild(n, content) 440 | return n, nil 441 | /* 442 | stops.pop('extlink'); 443 | return [ 444 | new SelfclosingTagTk('extlink', [ 445 | new KV('href', tu.flattenString([addr, target])), 446 | new KV('mw:content', content || ''), 447 | new KV('spaces', sp), 448 | ], { 449 | targetOff: targetOff, 450 | tsr: tsrOffsets(), 451 | contentOffsets: [targetOff, endOffset() - 1], 452 | }), 453 | ]; 454 | */ 455 | } 456 | 457 | autoref 458 | <- ("RFC" / "PMID") space_or_nbsp+ [0-9]+ end_of_word 459 | { return nil, nil 460 | /* 461 | var base_urls = { 462 | 'RFC': 'https://tools.ietf.org/html/rfc%s', 463 | 'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract', 464 | }; 465 | return [ 466 | new SelfclosingTagTk('extlink', [ 467 | new KV('href', tu.sprintf(base_urls[ref], identifier)), 468 | new KV('mw:content', tu.flattenString([ref, sp, identifier])), 469 | new KV('typeof', 'mw:ExtLink/' + ref), 470 | ], 471 | { stx: "magiclink", tsr: tsrOffsets() }), 472 | ]; 473 | */ 474 | } 475 | 476 | isbn 477 | <- "ISBN" space_or_nbsp+ ( 478 | [0-9] 479 | (space_or_nbsp_or_dash &[0-9] {return nil, nil/* return s; */} / [0-9])+ 480 | ((space_or_nbsp_or_dash / "") [xX] / "") 481 | ) ( 482 | end_of_word 483 | {return nil, nil 484 | /* 485 | // Convert isbn token-and-entity array to stripped string. 
486 | return tu.flattenStringlist(isbn).filter(function(e) { 487 | return e.constructor === String; 488 | }).join('').replace(/[^\dX]/ig, '').toUpperCase(); 489 | */ 490 | } 491 | ) &{ 492 | return false, nil 493 | /* 494 | // ISBNs can only be 10 or 13 digits long (with a specific format) 495 | return isbncode.length === 10 || 496 | (isbncode.length === 13 && /^97[89]/.test(isbncode)); 497 | */ 498 | } {return nil, nil 499 | /* 500 | return [ 501 | new SelfclosingTagTk('extlink', [ 502 | new KV('href', 'Special:BookSources/' + isbncode), 503 | new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])), 504 | new KV('typeof', 'mw:WikiLink/ISBN'), 505 | ], 506 | { stx: "magiclink", tsr: tsrOffsets() }), 507 | ]; 508 | */ 509 | } 510 | 511 | 512 | // Default URL protocols in MediaWiki (see DefaultSettings). Normally 513 | // these can be configured dynamically. */ 514 | 515 | url_protocol <- 516 | & {return false, nil/* return Util.isProtocolValid(input.substr(endOffset()), env); */} 517 | ( "//" / [A-Za-z] [-A-Za-z0-9+.]* ":" "//"? ) {return nil, nil/* return p;*/ } 518 | 519 | // no punctuation, and '{<' to trigger directives 520 | no_punctuation_char <- [^ :\][\r\n"'<>,.&%{] 521 | //TODO: no_punctuation_char <- [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] 522 | 523 | // this is the general url rule 524 | // on the PHP side, the path part matches EXT_LINK_URL_CLASS 525 | // which is '[^][<>"\x00-\x20\x7F\p{Zs}]' 526 | // the 's' and 'r' pieces below match the characters in 527 | // EXT_LINK_URL_CLASS which aren't included in no_punctuation_char 528 | url 529 | <- proto:url_protocol 530 | addr:(urladdr / "") 531 | path:( ( !inline_breaks 532 | c1:no_punctuation_char 533 | {return c1, nil /*return c; */} 534 | ) 535 | / s:[.:,'] {return s, nil/* return s; */} 536 | / comment 537 | / tplarg_or_template 538 | / ! 
( "&" ( [lL][tT] / [gG][tT] ) ";" ) 539 | r:( 540 | & "&" he:htmlentity {return he, nil/* return he; */} 541 | / [&%{] 542 | ) {return r, nil /*return r;*/ } 543 | )* 544 | // Must be at least one character after the protocol 545 | & {return false, nil /*return addr.length > 0 || path.length > 0;*/ } 546 | {return []interface{}{proto, addr, path}, nil 547 | /* 548 | return tu.flattenString([proto, addr].concat(path)); 549 | */ 550 | } 551 | 552 | // this is the somewhat-restricted rule used in autolinks 553 | // See Parser::doMagicLinks and Parser.php::makeFreeExternalLink. 554 | // The `path` portion matches EXT_LINK_URL_CLASS, as in the general 555 | // url rule. As in PHP, we do some fancy fixup to yank out 556 | // trailing punctuation, perhaps including parentheses. 557 | // The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS 558 | // which aren't included in no_punctuation_char 559 | autourl 560 | <- &{return true, nil /*return stops.push('autourl', { sawLParen: false }); */} 561 | ! "//" // protocol-relative autolinks not allowed (T32269) 562 | ( 563 | url_protocol 564 | (urladdr / "") 565 | ( ( !inline_breaks 566 | ! "(" 567 | c1:no_punctuation_char 568 | {return c1, nil/* return c; */} 569 | ) 570 | / "(" {return "(", nil/* stops.onStack('autourl').sawLParen = true; return "("; */} 571 | / [.:,] 572 | / (['] ![']) // single quotes are ok, double quotes are bad 573 | / comment 574 | / tplarg_or_template 575 | / ! ( raw_htmlentity &{return false, nil /* return /^[<>\u00A0]$/.test(rhe); */} ) 576 | r:( 577 | & "&" he:htmlentity {return he, nil/* return he; */} 578 | / [&%{] 579 | ) {return r, nil/* return r; */} 580 | )* 581 | {return "TODO: autourl",nil 582 | /* 583 | // as in Parser.php::makeFreeExternalLink, we're going to 584 | // yank trailing punctuation out of this match. 585 | var url = tu.flattenStringlist([proto, addr].concat(path)); 586 | // only need to look at last element; HTML entities are strip-proof. 
587 | var last = lastItem(url); 588 | var trim = 0; 589 | if (last && last.constructor === String) { 590 | var strip = ',;\\.:!?'; 591 | if (!stops.onStack('autourl').sawLParen) { 592 | strip += ')'; 593 | } 594 | strip = new RegExp('[' + JSUtils.escapeRegExp(strip) + ']*$'); 595 | trim = strip.exec(last)[0].length; 596 | url[url.length - 1] = last.slice(0, last.length - trim); 597 | } 598 | url = tu.flattenStringlist(url); 599 | if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) { 600 | return null; // ensure we haven't stripped everything: T106945 601 | } 602 | peg$currPos -= trim; 603 | stops.pop('autourl'); 604 | return url; 605 | */ 606 | } ) &{return false, nil/* return r !== null; */} {return nil, nil/*return r; */} 607 | / &{return false, nil /*return stops.pop('autourl');*/ } 608 | 609 | // This is extracted from EXT_LINK_ADDR in Parser.php: a simplified 610 | // expression to match an IPv6 address. The IPv4 address and "at least 611 | // one character of a host name" portions are punted to the `path` 612 | // component of the `autourl` and `url` productions 613 | urladdr 614 | <- ( "[" [0-9A-Fa-f:.]+ "]" ) 615 | 616 | // ************************************************************ 617 | // Templates, -arguments and wikilinks 618 | // ************************************************************/ 619 | 620 | 621 | // Precedence: template arguments win over templates. See 622 | // http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence 623 | // 4: {{{{·}}}} → {·{{{·}}}·} 624 | // 5: {{{{{·}}}}} → {{·{{{·}}}·}} 625 | // 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} 626 | // 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} 627 | // This is only if close has > 3 braces; otherwise we just match open 628 | // and close as we find them. 629 | // 630 | tplarg_or_template 631 | <- &"{{" //&{return false, nil} 632 | // 633 | //// Refuse to recurse beyond `maxDepth` levels. 
Default in the PHP parser 634 | //// is $wgMaxTemplateDepth = 40; This is to prevent crashing from 635 | //// buggy wikitext with lots of unclosed template calls, as in 636 | //// eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094 637 | // if (stops.onCount('templatedepth') === undefined || 638 | // stops.onCount('templatedepth') < env.conf.parsoid.maxDepth) { 639 | // return true; 640 | // } else { 641 | // return false; 642 | // } 643 | t:tplarg_or_template_guarded {return t, nil /*return t;*/ } 644 | 645 | tplarg_or_template_guarded 646 | <- #{inc(c, "templatedepth"); return nil /* return stops.inc('templatedepth');*/ } 647 | r:( &("{{" &("{{{"+ !'{') tplarg) a:(template/broken_template) {return a, nil /*return a;*/ } 648 | / a:('{' &("{{{"+ !'{'))? b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ } 649 | / a:('{' &("{{" !'{'))? b:template {return concat(a, b), nil /*return [a].concat(b);*/ } 650 | / a:broken_template {return a, nil /*return a;*/ } 651 | ) #{ 652 | dec(c, "templatedepth") 653 | return nil 654 | } { 655 | return r, nil 656 | /* 657 | stops.dec('templatedepth'); 658 | return r; 659 | */ 660 | } 661 | 662 | tplarg_or_template_or_bust 663 | <- (tplarg_or_template / .)+ 664 | 665 | template 666 | <- #{ 667 | push(c, "level", push(c, "preproc", /*{{*/ "}}")) 668 | return nil 669 | /* return stops.push('preproc', / * {{ * /"}}"); */ 670 | } 671 | t:template_preproc 672 | #{ 673 | popTo(c, "preproc", pop(c, "level").(int)) 674 | return nil 675 | } 676 | {return t, nil/* stops.popTo('preproc', stopLen); return t; */} 677 | 678 | // The PHP preprocessor maintains a single stack of "closing token we 679 | // are currently looking for", with no backtracking. This means that 680 | // once you see `[[ {{` you are looking only for `}}` -- if that template 681 | // turns out to be broken you will never pop the `}}` and there is no way 682 | // to close the `[[`. 
Since the PEG tokenizer in Parsoid uses backtracking 683 | // and parses in a single pass (instead of PHP's split preprocessor/parser) 684 | // we have to be a little more careful when we emulate this behavior. 685 | // If we use a rule like: 686 | // template = "{{" tplname tplargs* "}}"? 687 | // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it 688 | // turns out we never find the `}}`, which involves a lot of tedious gluing 689 | // tokens back together with fingers crossed we haven't discarded any 690 | // significant newlines/whitespace/etc. An alternative would be a rule like: 691 | // broken_template = "{{" tlb 692 | // but again, `template` is used in many different contexts; `tlb` isn't 693 | // necessarily the right one to recursively invoke. Instead we get the 694 | // broken template off of the PEGjs production stack by returning immediately 695 | // after `{{`, but we leave a "broken token" on top of the preprocessor 696 | // stops stack to indicate we're "still in" the {{ context and shouldn't 697 | // ever inlineBreak for any closing tokens above this one. For example: 698 | // [[Foo{{Bar]] 699 | // This will match as: 700 | // wikilink->text,template->text --> FAILS looking for }} 701 | // backtracks, popping "]]" and "}}" off preproc stack 702 | // wikilink->text,broken_template,text --> FAILS looking for ]] 703 | // backtracks, popping "]]" and "broken" off preproc stack 704 | // broken_wikilink,text,broken_template,text --> OK 705 | // with ["broken", "broken"] left on the preproc stops stack 706 | // Note that we use stops.popTo() to make sure the preproc stack is 707 | // cleaned up properly during backtracking, even if there were broken-FOO 708 | // productions taken which (deliberately) left elements on the preproc stack. 
709 | 710 | broken_template 711 | <- &"{{" #{push(c, "preproc", "broken"); return nil/* return stops.push('preproc', 'broken'); */} 712 | // for broken-template, deliberately fail to pop the preproc stops stack 713 | t:"{{" 714 | #{pop(c, "preproc"); return nil} 715 | {return t, nil/* return t; */} 716 | 717 | template_preproc 718 | <- "{{" nl_comment_space* 719 | target:template_param_value 720 | attributes:(nl_comment_space* "|" 721 | r:( 722 | nl_comment_space* 723 | &("|" / "}}") 724 | {return nil, nil/* return new KV('', tu.flattenIfArray(v), [p0, p0, p0, 725 | p]);*/ 726 | } // empty argument 727 | / template_param 728 | ) {return r, nil/* return r; */} 729 | )* 730 | nl_comment_space* 731 | inline_breaks "}}" { 732 | opts, ok := c.globalStore["opts"].(opts) 733 | if !ok { 734 | return nil, nil 735 | } 736 | if opts.templateHandler == nil { 737 | return nil, nil 738 | } 739 | var attrs []Attribute 740 | for _, attr := range flatten(attributes) { 741 | attr := attr.(Attribute) 742 | attrs = append(attrs, attr) 743 | } 744 | val, err := opts.templateHandler(strings.TrimSpace(concat(target)), attrs) 745 | if err != nil { 746 | return fmt.Sprintf("{{ template error: %s }}", err.Error()), nil 747 | } 748 | return val, nil 749 | /* 750 | // Insert target as first positional attribute, so that it can be 751 | // generically expanded. The TemplateHandler then needs to shift it out 752 | // again. 
753 | params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); 754 | var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() }); 755 | return obj; 756 | */ 757 | } / ("{{" space_or_newline* "}}") 758 | 759 | tplarg 760 | <- //("" {return nil, nil /*return stops.push('preproc', / * {{ * /"}}"); */}) 761 | t:(tplarg_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */} ) 762 | {return t, nil/* stops.popTo('preproc', stopLen); return t; */} 763 | 764 | tplarg_preproc 765 | <- "{{{" 766 | //("" {return nil, nil/* return endOffset(); */}) 767 | target:template_param_value? 768 | params:(nl_comment_space* "|" 769 | ( ("" {return nil, nil/* return endOffset(); */}) 770 | nl_comment_space* 771 | ("" {return nil, nil/* return endOffset(); */}) 772 | &("|" / "}}}") 773 | {return nil, nil/* return {return nil, nil tokens: v, srcOffsets: [p0, p1] }; */} // empty argument 774 | / template_param_value 775 | ) {return nil, nil/* return r; */} 776 | )* 777 | nl_comment_space* 778 | inline_breaks "}}}" {return concat(target, params), nil 779 | /* 780 | params = params.map(function(o) { 781 | var s = o.srcOffsets; 782 | return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]); 783 | }); 784 | if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; } 785 | // Insert target as first positional attribute, so that it can be 786 | // generically expanded. The TemplateHandler then needs to shift it out 787 | // again. 
788 | params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); 789 | var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() }); 790 | return obj; 791 | */ 792 | } 793 | 794 | template_param 795 | <- key:template_param_name 796 | val:( 797 | //("" {return nil, nil/* return endOffset(); */}) 798 | optionalSpaceToken 799 | "=" 800 | //("" {return nil, nil/* return endOffset(); */}) 801 | optionalSpaceToken 802 | tpv:template_param_value? {return tpv, nil 803 | /* 804 | return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] }; 805 | */ 806 | } 807 | )? { 808 | return Attribute{ 809 | Key: key, 810 | Val: val, 811 | }, nil 812 | /* 813 | if (val !== null) { 814 | if (val.value !== null) { 815 | return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); 816 | } else { 817 | return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); 818 | } 819 | } else { 820 | return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]); 821 | } 822 | */ 823 | } 824 | // empty parameter 825 | / & [|}] {return nil, nil 826 | /* 827 | return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]); 828 | */ 829 | } 830 | 831 | template_param_name 832 | <- & { 833 | push(c, "equal", true) 834 | return true, nil /*return stops.push('equal', true); */} 835 | tpt:(template_param_text / &'=' {return "", nil/* return ''; */}) 836 | { 837 | pop(c, "equal") 838 | return tpt, nil 839 | /* 840 | stops.pop('equal'); 841 | return tpt; 842 | */ 843 | } 844 | 845 | / & { 846 | pop(c, "equal") 847 | return false, nil 848 | /* return stops.pop('equal'); */ 849 | } 850 | 851 | template_param_value 852 | <- #{ push(c, "equal", false); return nil } 853 | tpt:template_param_text 854 | #{ pop(c, "equal"); return nil } 855 | { 856 | return tpt, nil 857 | /* 858 | 
stops.pop('equal'); 859 | return { tokens: tpt, srcOffsets: tsrOffsets() }; 860 | */ 861 | } 862 | 863 | template_param_text 864 | <- #{ 865 | push(c, "table", false) 866 | push(c, "extlink", false) 867 | push(c, "templateArg", true) 868 | push(c, "tableCellArg", false) 869 | inc(c, "template") 870 | return nil 871 | /* 872 | // re-enable tables within template parameters 873 | stops.push('table', false); 874 | stops.push('extlink', false); 875 | stops.push('templateArg', true); 876 | stops.push('tableCellArg', false); 877 | return stops.inc('template'); 878 | */ 879 | } 880 | il:(nested_block / newlineToken)+ #{ 881 | pop(c, "table") 882 | pop(c, "extlink") 883 | pop(c, "templateArg") 884 | pop(c, "tableCellArg") 885 | dec(c, "template") 886 | return nil 887 | } 888 | { 889 | return il, nil 890 | /* 891 | stops.pop('table'); 892 | stops.pop('extlink'); 893 | stops.pop('templateArg'); 894 | stops.pop('tableCellArg'); 895 | stops.dec('template'); 896 | // il is guaranteed to be an array -- so, tu.flattenIfArray will 897 | // always return an array 898 | var r = tu.flattenIfArray(il); 899 | if (r.length === 1 && r[0].constructor === String) { 900 | r = r[0]; 901 | } 902 | return r; 903 | */ 904 | } 905 | 906 | //// Language converter block markup of language variants: -{ ... }- 907 | 908 | // Note that "rightmost opening" precedence rule (see 909 | // https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means 910 | // that neither -{{ nor -{{{ are parsed as a -{ token, although 911 | // -{{{{ is (since {{{ has precedence over {{). 
912 | 913 | lang_variant_or_tpl 914 | <- &("-{" &("{{{"+ !'{') tplarg) a:lang_variant {return a, nil/* return a; */} 915 | / a:('-' &("{{{"+ !'{')) b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ } 916 | / a:('-' &("{{" "{{{"* !'{')) b:template {return concat(a, b), nil/* return [a].concat(b); */} 917 | / &"-{" a:lang_variant {return a, nil /*return a; */} 918 | 919 | broken_lang_variant 920 | <- &{return true, nil /*return stops.push('preproc', 'broken'); */} 921 | // for broken-lang-variant, deliberately fail to pop the stops stack 922 | r:"-{" {return r, nil /*return r; */} 923 | 924 | lang_variant 925 | <- ("" {return nil, nil /*return stops.push('preproc', /* -{ * / '}-'); */}) 926 | lv:(lang_variant_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */}) 927 | {return lv, nil /*stops.popTo('preproc', stopLen); return lv; */} 928 | / broken_lang_variant 929 | 930 | lang_variant_preproc 931 | <- ("-{" {return nil, nil/* return startOffset(); */}) 932 | ( 933 | &{return false, nil /* return env.langConverterEnabled(); */} 934 | ff:opt_lang_variant_flags {return ff, nil 935 | /* 936 | // Avoid mutating cached expression results 937 | ff = Util.clone(ff, true); 938 | // if flags contains 'R', then don't treat ; or : specially inside. 939 | if (ff.flags) { 940 | ff.raw = ff.flags.has('R') || ff.flags.has('N'); 941 | } else if (ff.variants) { 942 | ff.raw = true; 943 | } 944 | return ff; 945 | */ 946 | } / 947 | &{return false, nil /*return !env.langConverterEnabled(); */} 948 | "" {return nil, nil 949 | /* 950 | // if language converter not enabled, don't try to parse inside. 
951 | return { raw: true }; 952 | */ 953 | } 954 | ) 955 | ( 956 | &{return false, nil /*return f.raw; */} lv:lang_variant_text {return lv, nil/* return [{ text: lv }]; */} 957 | / 958 | &{return false, nil /* return !f.raw; */} lv:lang_variant_option_list {return lv, nil/* return lv; */} 959 | ) 960 | inline_breaks 961 | ("}-" {return nil, nil/* return endOffset(); */}) {return "TODO lang_variant_preproc", nil 962 | /* 963 | 964 | if (!env.langConverterEnabled()) { 965 | return [ "-{", ts[0].text.tokens, "}-" ]; 966 | } 967 | var lvsrc = input.substring(lv0, lv1); 968 | var attribs = []; 969 | 970 | // Do a deep clone since we may be destructively modifying 971 | // (the `t[fld] = name;` below) the result of a cached expression 972 | ts = Util.clone(ts, true); 973 | 974 | ts.forEach(function(t) { 975 | // move token strings into KV attributes so that they are 976 | // properly expanded by early stages of the token pipeline 977 | ['text','from','to'].forEach(function(fld) { 978 | if (t[fld] === undefined) { return; } 979 | var name = 'mw:lv' + attribs.length; 980 | attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets)); 981 | t[fld] = name; 982 | }); 983 | }); 984 | return [ 985 | new SelfclosingTagTk( 986 | 'language-variant', 987 | attribs, 988 | {return nil, nil 989 | tsr: [lv0, lv1], 990 | src: lvsrc, 991 | flags: f.flags && Array.from(f.flags).sort(), 992 | variants: f.variants && Array.from(f.variants).sort(), 993 | original: f.original, 994 | flagSp: f.sp, 995 | texts: ts, 996 | }), 997 | ]; 998 | */ 999 | } 1000 | 1001 | opt_lang_variant_flags 1002 | <- f:( ff:lang_variant_flags "|" {return ff, nil/* return ff; */} )? 
{return f, nil 1003 | /* 1004 | // Collect & separate flags and variants into a set and ordered list 1005 | var flags = new Set(); 1006 | var variants = new Set(); 1007 | var flagList = []; 1008 | var flagSpace = []; 1009 | var variantList = []; 1010 | var variantSpace = []; 1011 | var useVariants = false; 1012 | var internalSp = []; // internal whitespace, for round-tripping 1013 | if (f !== null) { 1014 | // lang_variant_flags returns arrays in reverse order. 1015 | f.flags.reverse(); 1016 | f.sp.reverse(); 1017 | var spPtr = 0; 1018 | f.flags.forEach(function(item) { 1019 | if (item.flag) { 1020 | flagSpace.push(f.sp[spPtr++]); 1021 | flags.add(item.flag); 1022 | flagList.push(item.flag); 1023 | flagSpace.push(f.sp[spPtr++]); 1024 | } 1025 | if (item.variant) { 1026 | variantSpace.push(f.sp[spPtr++]); 1027 | variants.add(item.variant); 1028 | variantList.push(item.variant); 1029 | variantSpace.push(f.sp[spPtr++]); 1030 | } 1031 | }); 1032 | if (spPtr < f.sp.length) { 1033 | // handle space after a trailing semicolon 1034 | flagSpace.push(f.sp[spPtr]); 1035 | variantSpace.push(f.sp[spPtr]); 1036 | } 1037 | } 1038 | // Parse flags (this logic is from core/languages/ConverterRule.php 1039 | // in the parseFlags() function) 1040 | if (flags.size === 0 && variants.size === 0) { 1041 | flags.add('$S'); 1042 | } else if (flags.has('R')) { 1043 | flags = new Set(['R']); // remove other flags 1044 | } else if (flags.has('N')) { 1045 | flags = new Set(['N']); // remove other flags 1046 | } else if (flags.has('-')) { 1047 | flags = new Set(['-']); // remove other flags 1048 | } else if (flags.has('T') && flags.size === 1) { 1049 | flags.add('H'); 1050 | } else if (flags.has('H')) { 1051 | // Replace A flag, and remove other flags except T and D 1052 | var nf = new Set(['$+', 'H']); 1053 | if (flags.has('T')) { nf.add('T'); } 1054 | if (flags.has('D')) { nf.add('D'); } 1055 | flags = nf; 1056 | } else if (variants.size > 0) { 1057 | useVariants = true; 1058 | } else { 1059 
| if (flags.has('A')) { 1060 | flags.add('$+'); 1061 | flags.add('$S'); 1062 | } 1063 | if (flags.has('D')) { 1064 | flags.delete('$S'); 1065 | } 1066 | } 1067 | if (useVariants) { 1068 | return { variants: variants, original: variantList, sp: variantSpace }; 1069 | } else { 1070 | return { flags: flags, original: flagList, sp: flagSpace }; 1071 | } 1072 | */ 1073 | } 1074 | 1075 | lang_variant_flags 1076 | <- (space_or_newline*) lang_variant_flag (space_or_newline*) 1077 | ( ";" lang_variant_flags? )? {return nil, nil 1078 | /* 1079 | var r = more && more[1] ? more[1] : { sp: [], flags: [] }; 1080 | // Note that sp and flags are in reverse order, since we're using 1081 | // right recursion and want to push instead of unshift. 1082 | r.sp.push(sp2.join('')); 1083 | r.sp.push(sp1.join('')); 1084 | r.flags.push(f); 1085 | return r; 1086 | */ 1087 | } 1088 | / (space_or_newline*) {return nil, nil 1089 | /* 1090 | return { sp: [ sp.join('') ], flags: [] }; 1091 | */ 1092 | } 1093 | 1094 | lang_variant_flag 1095 | <- [-+A-Z] {return nil, nil /*return { flag: f }; */} 1096 | / lang_variant_name {return nil, nil/* return { variant: v }; */} 1097 | / (!space_or_newline !nowiki [^{}|;])+ {return nil, nil/* return { bogus: b.join('') }; /* 1098 | bad flag * /*/} 1099 | 1100 | lang_variant_name // language variant name, like zh, zh-cn, etc. 1101 | <- [a-z] [-a-z]+ {return nil, nil/* return h + t.join(''); */} 1102 | // Escaped otherwise-unrepresentable language names 1103 | // Primarily for supporting html2html round trips; PHP doesn't support 1104 | // using nowikis here (yet!) 1105 | / nowiki_text 1106 | 1107 | lang_variant_option_list 1108 | <- lang_variant_option ( ";" lang_variant_option {return nil, nil/* return oo; */})* 1109 | ( ";" space_or_newline* )? 
// optional trailing semicolon 1110 | {return nil, nil 1111 | /* 1112 | var r = [ o ].concat(rest); 1113 | if (tr) { r.push({ semi: true, sp: tr[1].join('') }); } 1114 | return r; 1115 | */ 1116 | } 1117 | / lang_variant_text {return nil, nil/* return [{ text: lvtext }]; */} 1118 | 1119 | lang_variant_option 1120 | <- (space_or_newline*) lang_variant_name 1121 | (space_or_newline*) ":" 1122 | (space_or_newline*) 1123 | (lang_variant_nowiki / lang_variant_text_no_semi) 1124 | {return nil, nil 1125 | /* 1126 | return { 1127 | twoway: true, 1128 | lang: lang, 1129 | text: lvtext, 1130 | sp: [sp1.join(''), sp2.join(''), sp3.join('')] 1131 | }; 1132 | */ 1133 | } 1134 | / (space_or_newline*) 1135 | (lang_variant_nowiki / lang_variant_text_no_semi_or_arrow) 1136 | "=>" 1137 | (space_or_newline*) lang_variant_name 1138 | (space_or_newline*) ":" 1139 | (space_or_newline*) 1140 | (lang_variant_nowiki / lang_variant_text_no_semi) 1141 | {return nil, nil 1142 | /* 1143 | return { 1144 | oneway: true, 1145 | from: from, 1146 | lang: lang, 1147 | to: to, 1148 | sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')] 1149 | }; 1150 | */ 1151 | } 1152 | 1153 | // html2wt support: If a language name or conversion string can't be 1154 | // represented w/o breaking wikitext, just wrap it in a . 1155 | // PHP doesn't support this (yet), but Parsoid does. 
1156 | lang_variant_nowiki 1157 | <- ("" {return nil, nil/*return startOffset();*/}) 1158 | nowiki_text 1159 | ("" {return nil, nil/* return endOffset();*/}) 1160 | space_or_newline* {return nil, nil 1161 | /* 1162 | return { tokens: [ n ], srcOffsets: [start, end] }; 1163 | */ 1164 | } 1165 | 1166 | lang_variant_text 1167 | <- ("" {return nil, nil/*return startOffset();*/}) 1168 | (inlineline / "|" )* 1169 | ("" {return nil, nil/*return endOffset();*/}) 1170 | {return nil, nil/* return { tokens: tokens || [], srcOffsets: [start, end] }; */} 1171 | 1172 | lang_variant_text_no_semi 1173 | <- & {return false, nil/* return stops.push('semicolon', true); */} 1174 | lang_variant_text 1175 | {return nil, nil/* stops.pop('semicolon'); return lvtext; */} 1176 | / & {return false, nil/* return stops.pop('semicolon'); */} 1177 | 1178 | lang_variant_text_no_semi_or_arrow 1179 | <- & {return false, nil/* return stops.push('arrow', true); */} 1180 | lang_variant_text_no_semi {return nil, nil/* stops.pop('arrow'); return lvtext; */} 1181 | / & {return false, nil/* return stops.pop('arrow'); */} 1182 | 1183 | wikilink_content 1184 | <- (pipe lt:link_text? { 1185 | return lt, nil 1186 | /* 1187 | var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]); 1188 | maybeContent.vsrc = input.substring(startPos, endOffset()); 1189 | return maybeContent; 1190 | */ 1191 | })* 1192 | 1193 | wikilink <- wikilink_preproc / broken_wikilink 1194 | 1195 | // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the 1196 | // second bracket could start an extlink. Deliberately leave entry 1197 | // on preproc stack since we haven't seen a double-close bracket. 1198 | // (See full explanation above broken_template production.) 
1199 | broken_wikilink 1200 | <- &"[[" #{ 1201 | push(c, "preproc", "broken") 1202 | return nil 1203 | /* return stops.push('preproc', 'broken'); */ 1204 | } 1205 | a:("[" (extlink / "[")) 1206 | #{ pop(c, "preproc"); return nil } 1207 | { 1208 | return a, nil 1209 | /* return a; */ 1210 | } 1211 | 1212 | wikilink_preproc 1213 | <- "[[" 1214 | #{ push(c, "preproc", "]]"); return nil } 1215 | target:wikilink_preprocessor_text? 1216 | //("" {return nil, nil/* return endOffset(); */}) 1217 | lcs:wikilink_content 1218 | inline_breaks "]]" 1219 | #{ pop(c, "preproc"); return nil } 1220 | { 1221 | targetStr := concat(target) 1222 | if strings.HasPrefix(targetStr, "File:") || strings.HasPrefix(targetStr, "Image:") { 1223 | n := &html.Node{ 1224 | Type: html.ElementNode, 1225 | Data: "div", 1226 | Attr: []html.Attribute{ 1227 | {Key: "class", Val: "image"}, 1228 | }, 1229 | } 1230 | link := &html.Node{ 1231 | Type: html.ElementNode, 1232 | Data: "a", 1233 | Attr: []html.Attribute{ 1234 | {Key: "href", Val: TitleToURL(targetStr)}, 1235 | }, 1236 | } 1237 | addChild(link, targetStr) 1238 | addChild(n, link) 1239 | children, ok := lcs.([]interface{}) 1240 | if ok && len(children) > 0 { 1241 | descDiv := &html.Node{ 1242 | Type: html.ElementNode, 1243 | Data: "div", 1244 | Attr: []html.Attribute{ 1245 | {Key: "class", Val: "caption"}, 1246 | }, 1247 | } 1248 | addChild(descDiv, children[len(children)-1]) 1249 | addChild(n, descDiv) 1250 | } 1251 | return n, nil 1252 | } 1253 | n := &html.Node{ 1254 | Type: html.ElementNode, 1255 | Data: "a", 1256 | Attr: []html.Attribute{ 1257 | {Key: "href", Val: TitleToURL(targetStr)}, 1258 | }, 1259 | } 1260 | if !addChild(n, lcs) { 1261 | addChild(n, targetStr) 1262 | } 1263 | return n, nil 1264 | /* 1265 | var pipeTrick = (lcs.length === 1 && lcs[0].v === null); 1266 | var textTokens = []; 1267 | if (target === null || pipeTrick) { 1268 | textTokens.push("[["); 1269 | if (target) { 1270 | textTokens.push(target); 1271 | } 1272 | 
lcs.forEach(function(a) { 1273 | // a is a mw:maybeContent attribute 1274 | textTokens.push("|"); 1275 | if (a.v !== null) { textTokens.push(a.v); } 1276 | }); 1277 | textTokens.push("]]"); 1278 | return textTokens; 1279 | } 1280 | var obj = new SelfclosingTagTk('wikilink'); 1281 | var hrefKV = new KV('href', target); 1282 | hrefKV.vsrc = input.substring(startOffset() + 2, tpos); 1283 | // XXX: Point to object with path, revision and input information 1284 | // obj.source = input; 1285 | obj.attribs.push(hrefKV); 1286 | obj.attribs = obj.attribs.concat(lcs); 1287 | obj.dataAttribs = { 1288 | tsr: tsrOffsets(), 1289 | src: text(), 1290 | }; 1291 | return [obj]; 1292 | */ 1293 | } 1294 | 1295 | // Tables are allowed inside image captions. 1296 | link_text 1297 | <- #{ 1298 | // Suppress the flag temporarily in this rule to consume the '=' here. 1299 | push(c, "equal", false) 1300 | push(c, "linkdesc", true) 1301 | return nil 1302 | } 1303 | c1:( // This group is similar to "block_line" but "list_item" 1304 | // is omitted since `doBlockLevels` happens after 1305 | // `replaceInternalLinks2`, where newlines are stripped. 1306 | (sol (heading / hr / full_table_in_link_caption)) 1307 | / urltext 1308 | / (!inline_breaks 1309 | r:( inline_element / '[' text_char+ ']' (&(!']' / "]]")) / . ) {return r, nil} 1310 | ) 1311 | )+ #{ 1312 | pop(c, "equal") 1313 | pop(c, "linkdesc") 1314 | return nil 1315 | } 1316 | { 1317 | return c1, nil 1318 | } 1319 | 1320 | // Generic quote rule for italic and bold, further processed in a token 1321 | // stream transformation in doQuotes. Relies on NlTk tokens being emitted 1322 | // for each line of text to balance quotes per line. 1323 | 1324 | // We are not using a simple pair rule here as we need to support mis-nested 1325 | // bolds/italics and MediaWiki's special heuristics for apostrophes, which are 1326 | // all not context free. 
*/ 1327 | quote <- ("''" "'"*) { 1328 | return &html.Node{ 1329 | Type: html.ElementNode, 1330 | Data: "b", 1331 | Attr: []html.Attribute{ 1332 | {Key: "_parsetoken"}, 1333 | }, 1334 | }, nil 1335 | /* 1336 | // sequences of four or more than five quotes are assumed to start 1337 | // with some number of plain-text apostrophes. 1338 | var plainticks = 0; 1339 | var result = []; 1340 | if (quotes.length === 4) { 1341 | plainticks = 1; 1342 | } else if (quotes.length > 5) { 1343 | plainticks = quotes.length - 5; 1344 | } 1345 | if (plainticks > 0) { 1346 | result.push(quotes.substring(0, plainticks)); 1347 | } 1348 | // mw-quote token Will be consumed in token transforms 1349 | var tsr = tsrOffsets(); 1350 | tsr[0] += plainticks; 1351 | var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr }); 1352 | mwq.value = quotes.substring(plainticks); 1353 | result.push(mwq); 1354 | return result; 1355 | */ 1356 | } 1357 | 1358 | 1359 | // ********************************************************* 1360 | // Pre and xmlish tags 1361 | // *********************************************************/ 1362 | 1363 | extension_tag <- 1364 | &{return false, nil /*return !stops.onStack('extTag'); */} 1365 | xmlish_tag 1366 | // Account for `maybeExtensionTag` returning unmatched start / end tags 1367 | &{return false, nil /* return extToken.name === 'extension'; */} 1368 | {return nil, nil/* return extToken; */} 1369 | 1370 | nowiki 1371 | <- extension_tag 1372 | &{return false, nil /* return extToken.getAttribute('name') === 'nowiki'; */} 1373 | {return nil, nil/* return extToken; */} 1374 | 1375 | // Used by nowiki extension to tokenize html entities. 1376 | nowiki_content 1377 | <- c2:(htmlentity / .)* {return c2, nil/* return tu.flattenIfArray(c); */} 1378 | 1379 | // Used by lang_variant productions to protect special language names or 1380 | // conversion strings. 
1381 | nowiki_text 1382 | <- nowiki 1383 | {return nil, nil 1384 | /* 1385 | var txt = Util.getExtArgInfo(extToken).dict.body.extsrc; 1386 | return Util.decodeEntities(txt); 1387 | */ 1388 | } 1389 | 1390 | // Generic XML-like tags 1391 | 1392 | // These also cover extensions (including Cite), which will hook into the 1393 | // token stream for further processing. The content of extension tags is 1394 | // parsed as regular inline, but the source positions of the tag are added 1395 | // to allow reconstructing the unparsed text from the input. */ 1396 | 1397 | // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1398 | // following paragraphs. 1399 | tag_name_chars <- [^\t\n\v />\x00] 1400 | tag_name <- ([A-Za-z] tag_name_chars*) 1401 | 1402 | xmlish_tag 1403 | <- # { 1404 | push(c, "table", false) 1405 | push(c, "tableCellArg", false) 1406 | return nil 1407 | } 1408 | // By the time we get to `doTableStuff` in the php parser, we've already 1409 | // safely encoded element attributes. See 55313f4e in core. 1410 | // stops.push('table', false); 1411 | // stops.push('tableCellArg', false); 1412 | //return true; 1413 | //} 1414 | "<" end:"/"? 1415 | name:(tag_name & {return true, nil} 1416 | ///* 1417 | // return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed. 1418 | // */ 1419 | //} 1420 | ) 1421 | attribs:generic_newline_attributes 1422 | space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff 1423 | selfclose:"/"? 
1424 | space* // not preserved - canonicalized on RT via dirty diff 1425 | ">" 1426 | #{ 1427 | pop(c, "table") 1428 | pop(c, "tableCellArg") 1429 | pop(c, "extTag") 1430 | return nil 1431 | } 1432 | { 1433 | n := &html.Node{ 1434 | Type: html.ElementNode, 1435 | Data: concat(name), 1436 | } 1437 | 1438 | for _, attr := range flatten(attribs) { 1439 | attr := attr.(html.Attribute) 1440 | n.Attr = append(n.Attr, attr) 1441 | } 1442 | 1443 | if end != nil { 1444 | n.Attr = append(n.Attr, html.Attribute{Key:"_parseend"}) 1445 | } else if selfclose == nil { 1446 | n.Attr = append(n.Attr, html.Attribute{Key:"_parsestart"}) 1447 | } 1448 | 1449 | return n, nil 1450 | /* 1451 | stops.pop('table'); 1452 | stops.pop('tableCellArg'); 1453 | stops.pop('extTag'); 1454 | 1455 | var lcName = name.toLowerCase(); 1456 | 1457 | // Extension tags don't necessarily have the same semantics as html tags, 1458 | // so don't treat them as void elements. 1459 | var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName); 1460 | 1461 | // Support
      1462 | if (lcName === 'br' && end) { 1463 | end = null; 1464 | } 1465 | 1466 | var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets()); 1467 | 1468 | // change up data-attribs in one scenario 1469 | // void-elts that aren't self-closed ==> useful for accurate RT-ing 1470 | if (!selfclose && isVoidElt) { 1471 | res.dataAttribs.selfClose = undefined; 1472 | res.dataAttribs.noClose = true; 1473 | } 1474 | 1475 | return maybeExtensionTag(res); 1476 | */ 1477 | } 1478 | 1479 | 1480 | // A variant of xmlish_tag, but also checks if the tag name is a block-level 1481 | // tag as defined in 1482 | // http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1483 | // following paragraphs. 1484 | // 1485 | block_tag 1486 | <- & { 1487 | // By the time we get to `doTableStuff` in the php parser, we've already 1488 | // safely encoded element attributes. See 55313f4e in core. 1489 | push(c, "table", false) 1490 | push(c, "tableCellArg", false) 1491 | return true, nil 1492 | } 1493 | "<" "/"? 1494 | (tag_name & { 1495 | push(c, "extTag", false) 1496 | return false, nil 1497 | } 1498 | //#/* 1499 | //# return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed. 1500 | //# */ 1501 | //#} 1502 | ) 1503 | generic_newline_attributes 1504 | space_or_newline* 1505 | "/"? 1506 | ">" { 1507 | pop(c, "table") 1508 | pop(c, "tableCellArg") 1509 | pop(c, "extTag") 1510 | return nil, nil 1511 | /* 1512 | stops.pop('table'); 1513 | stops.pop('tableCellArg'); 1514 | stops.pop('extTag'); 1515 | var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets()); 1516 | return [maybeExtensionTag(t)]; 1517 | */ 1518 | } 1519 | / "<" "/"? tag_name & { 1520 | pop(c, "extTag") 1521 | return false, nil 1522 | } 1523 | / & { 1524 | pop(c, "table") 1525 | pop(c, "tableCellArg") 1526 | return false, nil 1527 | } 1528 | 1529 | // A generic attribute that can span multiple lines. 
1530 | generic_newline_attribute 1531 | <- space_or_newline* 1532 | ("" {return nil, nil/* return endOffset(); */}) 1533 | key:generic_attribute_name 1534 | ("" {return nil, nil/* return endOffset(); */}) 1535 | val:(space_or_newline* "=" v:generic_att_value? {return v, nil/* return v; */})? 1536 | {return html.Attribute{Key: concat(key), Val: concat(val)}, nil 1537 | /* 1538 | // NB: Keep in sync w/ table_attibute 1539 | var res; 1540 | // Encapsulate protected attributes. 1541 | if (typeof name === 'string') {return nil, nil 1542 | name = tu.protectAttrs(name); 1543 | } 1544 | if (vd !== null) { 1545 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1546 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1547 | } else { 1548 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1549 | } 1550 | if (Array.isArray(name)) { 1551 | res.ksrc = input.substring(namePos0, namePos); 1552 | } 1553 | return res; 1554 | */ 1555 | } 1556 | 1557 | // A single-line attribute. 1558 | table_attribute 1559 | <- optionalSpaceToken 1560 | ("" {return nil, nil /* return endOffset(); */}) 1561 | table_attribute_name 1562 | ("" {return nil, nil /* return endOffset(); */}) 1563 | (optionalSpaceToken "=" table_att_value? {return nil, nil /* return v; */})? 1564 | {return nil,nil 1565 | /* 1566 | // NB: Keep in sync w/ generic_newline_attribute 1567 | var res; 1568 | // Encapsulate protected attributes. 
1569 | if (typeof name === 'string') { 1570 | name = tu.protectAttrs(name); 1571 | } 1572 | if (vd !== null) { 1573 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1574 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1575 | } else { 1576 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1577 | } 1578 | if (Array.isArray(name)) { 1579 | res.ksrc = input.substring(namePos0, namePos); 1580 | } 1581 | return res; 1582 | */ 1583 | } 1584 | 1585 | // The arrangement of chars is to emphasize the split between what's disallowed 1586 | // by html5 and what's necessary to give directive a chance. 1587 | // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 1588 | generic_attribute_name 1589 | <- q:(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1590 | r:( [^ \t\r\n\x00/=><&{}!|-]+ 1591 | / !inline_breaks 1592 | // \0/=> is the html5 attribute name set we do not want. 1593 | t:( directive / !( space_or_newline / [\x00/=>] ) c2:. { return c2, nil /*return c;*/ } 1594 | ) {return t, nil /*return t; */} 1595 | )* 1596 | & { 1597 | return len(flatten(r))>0 || len(flatten(q))>0, nil 1598 | /* return r.length > 0 || q.length > 0; */ 1599 | } 1600 | {return concat(q, r), nil /* return tu.flattenString([q].concat(r)); */} 1601 | 1602 | // Also accept these chars in a wikitext table or tr attribute name position. 1603 | // They are normally not matched by the table_attribute_name. 1604 | broken_table_attribute_name_char <- [\x00/=>] {return nil, nil /* return new KV(c, ''); */} 1605 | 1606 | // Same as generic_attribute_name, except for accepting tags and wikilinks. 1607 | // (That doesn't make sense (ie. match php) in the generic case.) 1608 | // We also give a chance to break on \[ (see T2553). 1609 | table_attribute_name 1610 | <- (["'=]?) 
// From #before-attribute-name-state, < is omitted for directive 1611 | ( [^ \t\r\n\x00/=><&{}!|[-]+ 1612 | / !inline_breaks 1613 | // \0/=> is the html5 attribute name set we do not want. 1614 | ( wikilink 1615 | / directive 1616 | // Accept insane tags-inside-attributes as attribute names. 1617 | // The sanitizer will strip and shadow them for roundtripping. 1618 | // Example: generated with.. 1619 | / &xmlish_tag inlineline {return nil, nil/* return ill; */} 1620 | / !( space_or_newline / [\x00/=>] ) . {return nil, nil/* return c; */} 1621 | ) {return nil, nil/* return t; */} 1622 | )* 1623 | & {return false, nil/* return r.length > 0 || q.length > 0; */} 1624 | {return nil, nil/* return tu.flattenString([q].concat(r)); */} 1625 | 1626 | // Attribute value, quoted variants can span multiple lines. 1627 | // Missing end quote: accept /> look-ahead as heuristic. 1628 | // These need to be kept in sync with the attribute_preprocessor_text_* 1629 | generic_att_value 1630 | <- (space_or_newline* "'") t:attribute_preprocessor_text_single? ("'" / &('/'? '>')) { 1631 | return t, nil 1632 | /* 1633 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1634 | */ 1635 | } 1636 | / (space_or_newline* '"') t:attribute_preprocessor_text_double? ('"' / &('/'? '>')) { 1637 | return t, nil 1638 | /* 1639 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1640 | */ 1641 | } 1642 | / space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { 1643 | return t, nil 1644 | /* 1645 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1646 | */ 1647 | } 1648 | 1649 | // Attribute value, restricted to a single line. 1650 | // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. 1651 | // These need to be kept in sync with the table_attribute_preprocessor_text_* 1652 | table_att_value 1653 | <- (space* "'") table_attribute_preprocessor_text_single? ("'" / &("!!" 
/ [|\r\n])) {return nil, nil 1654 | /* 1655 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1656 | */ 1657 | } 1658 | / (space* '"') table_attribute_preprocessor_text_double? ('"' / &("!!" / [|\r\n])) {return nil, nil 1659 | /* 1660 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1661 | */ 1662 | } 1663 | / space* table_attribute_preprocessor_text &(space_or_newline/ eof / "!!" / '|') {return nil, nil 1664 | /* 1665 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1666 | */ 1667 | } 1668 | 1669 | // ******************************************************* 1670 | // Lists 1671 | // *******************************************************/ 1672 | list_item <- dtdd / hacky_dl_uses / li 1673 | 1674 | li <- bullets:list_char+ 1675 | c2:inlineline? 1676 | // The inline_break is to check if we've hit a template end delimiter. 1677 | &(eolf / inline_breaks) 1678 | { 1679 | n := &html.Node{ 1680 | Type: html.ElementNode, 1681 | Data: "li", 1682 | } 1683 | addChild(n, c2) 1684 | return n ,nil 1685 | /* 1686 | // Leave bullets as an array -- list handler expects this 1687 | var tsr = tsrOffsets('start'); 1688 | tsr[1] += bullets.length; 1689 | var li = new TagTk('listItem', [], { tsr: tsr }); 1690 | li.bullets = bullets; 1691 | return [ li ].concat(c || []); 1692 | */ 1693 | } 1694 | 1695 | 1696 | // This rule is required to support wikitext of this form 1697 | // ::{|border="1"|foo|bar|baz|} 1698 | // where the leading colons are used to indent the entire table. 1699 | // This hack was added back in 2006 in commit 1700 | // a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl 1701 | // Fürstenberg. 1702 | // 1703 | hacky_dl_uses <- ":"+ 1704 | (table_line (sol table_line)*) 1705 | inlineline? 
1706 | &comment_space_eolf 1707 | {return nil,nil 1708 | /* 1709 | // Leave bullets as an array -- list handler expects this 1710 | var tsr = tsrOffsets('start'); 1711 | tsr[1] += bullets.length; 1712 | var li = new TagTk('listItem', [], { tsr: tsr }); 1713 | li.bullets = bullets; 1714 | return tu.flattenIfArray([li, tbl || [], line || []]); 1715 | */ 1716 | } 1717 | 1718 | dtdd 1719 | <- (!(";" !list_char) list_char {return nil, nil /*return lc;*/ })* 1720 | ";" 1721 | & {return false, nil/*return stops.inc('colon');*/} 1722 | inlineline? 1723 | (":" {return nil, nil /*return endOffset(); */}) 1724 | // Fortunately dtdds cannot be nested, so we can simply set the flag 1725 | // back to 0 to disable it. 1726 | & {return false, nil /*stops.counters.colon = 0; return true;*/} 1727 | inlineline? 1728 | &eolf {return nil, nil 1729 | /* 1730 | // Leave bullets as an array -- list handler expects this 1731 | // TSR: +1 for the leading ";" 1732 | var numBullets = bullets.length + 1; 1733 | var tsr = tsrOffsets('start'); 1734 | tsr[1] += numBullets; 1735 | var li1 = new TagTk('listItem', [], { tsr: tsr }); 1736 | li1.bullets = bullets.slice(); 1737 | li1.bullets.push(";"); 1738 | // TSR: -1 for the intermediate ":" 1739 | var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); 1740 | li2.bullets = bullets.slice(); 1741 | li2.bullets.push(":"); 1742 | 1743 | return [ li1 ].concat(c || [], [ li2 ], d || []); 1744 | */ 1745 | } 1746 | // Fall-back case to clear the colon flag 1747 | / & {return false, nil /*stops.counters.colon = 0; return false; */} 1748 | 1749 | 1750 | list_char <- [*#:;] 1751 | 1752 | 1753 | 1754 | // **************************************************************************** 1755 | // Tables 1756 | // ------ 1757 | // Table rules are geared to support independent parsing of fragments in 1758 | // templates (the common table start / row / table end use case). 
The tokens 1759 | // produced by these fragments then match up to a table while building the 1760 | // DOM tree. For similar reasons, table rows do not emit explicit end tag 1761 | // tokens. 1762 | 1763 | // The separate table_line rule is faster than moving those rules 1764 | // directly to block_lines. 1765 | 1766 | // Notes about the full_table_in_link_caption rule 1767 | // ----------------------------------------------------- 1768 | // However, for link-tables, we have introduced a stricter parse wherein 1769 | // we require table-start and table-end tags to not come from a template. 1770 | // In addition, this new rule doesn't accept fosterable-content in 1771 | // the table unlike the more lax (sol table_line)+ rule. 1772 | 1773 | // This is the best we can do at this time since we cannot distinguish 1774 | // between table rows and image options entirely in the tokenizer. 1775 | 1776 | // Consider the following examples: 1777 | 1778 | // Example 1: 1779 | 1780 | // [[Image:Foo.jpg|left|30px|Example 1 1781 | // {{This-template-returns-a-table-start-tag}} 1782 | // |foo 1783 | // {{This-template-returns-a-table-end-tag}} 1784 | // ]] 1785 | 1786 | // Example 2: 1787 | 1788 | // [[Image:Foo.jpg|left|30px|Example 1 1789 | // {{echo|a}} 1790 | // |foo 1791 | // {{echo|b}} 1792 | // ]] 1793 | 1794 | // So, we cannot know a priori (without preprocessing or fully expanding 1795 | // all templates) if "|foo" in the two examples is a table cell or an image 1796 | // option. This is a limitation of our tokenizer-based approach compared to 1797 | // the preprocessing-based approach of the PHP parser. 1798 | 1799 | // Given this limitation, we are okay forcing a full-table context in 1800 | // link captions (if necessary, we can relax the fosterable-content requirement 1801 | // but that is broken wikitext anyway, so we can force that edge-case wikitext 1802 | // to get fixed by rejecting it). 
1803 | // ****************************************************************************/ 1804 | 1805 | full_table_in_link_caption 1806 | <- (! inline_breaks / & "{{!}}" ) 1807 | ( 1808 | // Note that "linkdesc" is suppressed here to provide a nested parsing 1809 | // context in which to parse the table. Otherwise, we may break on 1810 | // on pipes in the `table_start_tag` and `table_row_tag` attributes. 1811 | // However, as a result, this can be more permissive than the current 1812 | // php implementation, but likelier to match the users intent. 1813 | & {return false, nil /*stops.push('linkdesc', false); return stops.push('table', true); 1814 | */} 1815 | ( 1816 | table_start_tag optionalNewlines 1817 | // Accept multiple end tags since a nested table may have been 1818 | // opened in the table content line. 1819 | ((sol (table_content_line / tplarg_or_template) optionalNewlines)* 1820 | sol table_end_tag)+ 1821 | ){return nil, nil 1822 | /* 1823 | stops.pop('linkdesc'); 1824 | stops.pop('table'); 1825 | return tbl; 1826 | */ 1827 | } 1828 | / & {return false, nil/* stops.pop('linkdesc'); return stops.pop('table'); */} 1829 | ) {return nil, nil/* return r; */} 1830 | 1831 | // This rule assumes start-of-line position! 1832 | table_line 1833 | <- (! 
inline_breaks / & "{{!}}" ) 1834 | ( 1835 | & {return false, nil /* return stops.push('table', true); */} 1836 | ( 1837 | table_start_tag optionalNewlines 1838 | / table_content_line optionalNewlines 1839 | / table_end_tag 1840 | ) {return nil, nil 1841 | /* 1842 | stops.pop('table'); 1843 | return tl; 1844 | */ 1845 | } 1846 | / & {return false, nil /* return stops.pop('table'); */} 1847 | ) {return nil, nil/* return r; */} 1848 | 1849 | table_content_line <- (space / comment)* ( 1850 | table_heading_tags 1851 | / table_row_tag 1852 | / table_data_tags 1853 | / table_caption_tag 1854 | ) 1855 | 1856 | table_start_tag 1857 | <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) "{" pipe 1858 | // ok to normalize away stray |} on rt (see T59360) 1859 | & {return false, nil /* return stops.push('table', false); */} 1860 | table_attributes 1861 | ("" {return nil, nil/* stops.pop('table'); return endOffset(); */}) 1862 | {return nil, nil 1863 | /* 1864 | var coms = tu.popComments(ta); 1865 | if (coms) { 1866 | tsEndPos = coms.commentStartPos; 1867 | } 1868 | 1869 | var da = { tsr: [startPos, tsEndPos] }; 1870 | if (p !== "|") { 1871 | // Variation from default 1872 | da.startTagSrc = b + p; 1873 | } 1874 | 1875 | sc.push(new TagTk('table', ta, da)); 1876 | if (coms) { 1877 | sc = sc.concat(coms.buf); 1878 | } 1879 | return sc; 1880 | */ 1881 | } 1882 | 1883 | // FIXME: Not sure if we want to support it, but this should allow columns. 1884 | table_caption_tag 1885 | // avoid recursion via nested_block_in_table 1886 | <- ! {return true, nil /*return stops.onStack('tableDataBlock');*/ } 1887 | pipe "+" 1888 | row_syntax_table_args? 1889 | ("" {return nil, nil /*return endOffset();*/ }) 1890 | nested_block_in_table* {return nil, nil 1891 | /* 1892 | return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true); 1893 | */ 1894 | } 1895 | 1896 | table_row_tag 1897 | <- // avoid recursion via nested_block_in_table 1898 | ! 
{return true, nil /*return stops.onStack('tableDataBlock'); */} 1899 | pipe "-"+ 1900 | & {return false, nil /* return stops.push('table', false); */} 1901 | table_attributes 1902 | ("" {return nil, nil/* stops.pop('table'); return endOffset(); */}) 1903 | {return nil, nil 1904 | /* 1905 | var coms = tu.popComments(a); 1906 | if (coms) { 1907 | tagEndPos = coms.commentStartPos; 1908 | } 1909 | 1910 | var da = { 1911 | tsr: [ startOffset(), tagEndPos ], 1912 | startTagSrc: p + dashes, 1913 | }; 1914 | 1915 | // We rely on our tree builder to close the row as needed. This is 1916 | // needed to support building tables from fragment templates with 1917 | // individual cells or rows. 1918 | var trToken = new TagTk('tr', a, da); 1919 | 1920 | var res = [ trToken ]; 1921 | if (coms) { 1922 | res = res.concat(coms.buf); 1923 | } 1924 | return res; 1925 | */ 1926 | } 1927 | 1928 | tds 1929 | <- ( ( pipe_pipe / pipe & row_syntax_table_args {return nil, nil /*return p;*/ } ) 1930 | table_data_tag {return nil, nil 1931 | /* 1932 | var da = tdt[0].dataAttribs; 1933 | da.stx = "row"; 1934 | da.tsr[0] -= pp.length; // include "||" 1935 | if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) { 1936 | // Variation from default 1937 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1938 | } 1939 | return tdt; 1940 | */ 1941 | } 1942 | )* 1943 | 1944 | // avoid recursion via nested_block_in_table 1945 | table_data_tags 1946 | <- ! {return true, nil/* return stops.onStack('tableDataBlock'); */} 1947 | pipe 1948 | ![+-] table_data_tag 1949 | ("" {return nil, nil/* return endOffset(); */}) 1950 | tds {return nil, nil 1951 | // blahaskjdf;alsdf;; 1952 | } 1953 | 1954 | table_data_tag 1955 | <- ! "}" 1956 | row_syntax_table_args? 
1957 | // use inline_breaks to break on tr etc 1958 | ("" {return nil, nil/* return endOffset(); */}) 1959 | nested_block_in_table* 1960 | {return nil, nil 1961 | /* 1962 | return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); 1963 | */ 1964 | } 1965 | 1966 | table_heading_tags 1967 | <- "!" 1968 | & {return false, nil /*return stops.push('th', endOffset()); */} 1969 | table_heading_tag 1970 | ( ("!!" / pipe_pipe) table_heading_tag {return nil, nil 1971 | /* 1972 | var da = tht[0].dataAttribs; 1973 | da.stx = 'row'; 1974 | da.tsr[0] -= pp.length; // include "!!" or "||" 1975 | 1976 | if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) { 1977 | // Variation from default 1978 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1979 | } 1980 | return tht; 1981 | */ 1982 | } 1983 | )* {return nil, nil 1984 | /* 1985 | stops.pop('th'); 1986 | th[0].dataAttribs.tsr[0]--; // include "!" 1987 | return th.concat(ths); 1988 | */ 1989 | } 1990 | / & {return false, nil /*return stops.onStack('th') !== false ? stops.pop('th') : false;*/ } 1991 | 1992 | table_heading_tag 1993 | <- row_syntax_table_args? 1994 | ("" {return nil, nil /*return endOffset();*/ }) 1995 | ( & {return false, nil 1996 | /* 1997 | // This SyntaxStop is only true until we hit the end of the line. 1998 | if (stops.onStack('th') !== false && 1999 | /\n/.test(input.substring(stops.onStack('th'), endOffset()))) { 2000 | // There's been a newline. Remove the break and continue 2001 | // tokenizing nested_block_in_tables. 
2002 | stops.pop('th'); 2003 | } 2004 | return true; 2005 | */ 2006 | } nested_block_in_table {return nil, nil/* return d; */} )* {return nil, nil 2007 | /* 2008 | return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); 2009 | */ 2010 | } 2011 | 2012 | table_end_tag 2013 | <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) pipe "}" {return nil, nil 2014 | /* 2015 | var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] }); 2016 | if (p !== "|") { 2017 | // p+"" is triggering some bug in pegJS 2018 | // I cannot even use that expression in the comment! 2019 | tblEnd.dataAttribs.endTagSrc = p + b; 2020 | } 2021 | return sc.concat([tblEnd]); 2022 | */ 2023 | } 2024 | 2025 | // 2026 | // Table parameters separated from the content by a single pipe. Does *not* 2027 | // match if followed by double pipe (row-based syntax). 2028 | // 2029 | row_syntax_table_args 2030 | <- & {return false, nil /* return stops.push('tableCellArg', return true, nil); */} 2031 | table_attributes space* pipe !pipe {return nil, nil 2032 | /* 2033 | stops.pop('tableCellArg'); 2034 | return [as, s, p]; 2035 | */ 2036 | } 2037 | / & {return false, nil /* return stops.pop('tableCellArg'); */} 2038 | 2039 | 2040 | // ***************************************************************** 2041 | // Text variants and other general rules 2042 | // *****************************************************************/ 2043 | 2044 | // All chars that cannot start syntactic structures in the middle of a line 2045 | // XXX: ] and other end delimiters should probably only be activated inside 2046 | // structures to avoid unnecessarily leaving the text rule on plain 2047 | // content. 2048 | 2049 | // TODO: Much of this is should really be context-dependent (syntactic 2050 | // flags). The wikilink_preprocessor_text rule is an example where 2051 | // text_char is not quite right and had to be augmented. 
Try to minimize / 2052 | // clarify this carefully! 2053 | // 2054 | 2055 | text_char <- [^'<~[{\n\r:;\]}|!=-] 2056 | 2057 | // Legend 2058 | // ' quotes (italic/bold) 2059 | // < start of xmlish_tag 2060 | // ~ signatures/dates 2061 | // [ start of links 2062 | // { start of parser functions, transclusion and template args 2063 | // \n all sort of block-level markup at start of line 2064 | // \r ditto 2065 | // A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC) 2066 | 2067 | // _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) 2068 | // ! and | table cell delimiters, might be better to specialize those 2069 | // = headings - also specialize those! 2070 | 2071 | // The following chars are also included for now, but only apply in some 2072 | // contexts and should probably be enabled only in those: 2073 | // : separate definition in ; term : definition 2074 | // ] end of link 2075 | // } end of parser func/transclusion/template arg 2076 | // - start of lang_variant -{ ... }- 2077 | // ; separator in lang_variant 2078 | // 2079 | 2080 | urltext <- ( [^-'<~[{\n/A-Za-z_|!:;\]} &=]+ 2081 | / & [/A-Za-z] al:autolink {return al, nil /*return al;*/ } 2082 | / & "&" he:htmlentity {return he, nil /*return he;*/ } 2083 | // Convert trailing space into   2084 | // XXX: This should be moved to a serializer 2085 | // This is a hack to force a whitespace display before the colon 2086 | / ' ' & ':' {return " ", nil 2087 | /* 2088 | var toks = Util.placeholder('\u00a0', { 2089 | ' ', 2090 | tsr: tsrOffsets('start'), 2091 | isDisplayHack: true, 2092 | }, { tsr: tsrOffsets('end'), isDisplayHack: true }); 2093 | var typeOf = toks[0].getAttribute('typeof'); 2094 | toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf); 2095 | return toks; 2096 | */ 2097 | } 2098 | / & ("__") bs:behavior_switch {return bs, nil /*return bs;*/ } 2099 | // About 96% of text_char calls originate here. 2100 | // pegjs 0.8 inlines this simple rule automatically. 
2101 | / text_char )+ 2102 | 2103 | raw_htmlentity <- ("&" [#0-9a-zA-Z]+ ";") {return nil, nil 2104 | /* 2105 | return Util.decodeEntities(m); 2106 | */ 2107 | } 2108 | 2109 | htmlentity <- raw_htmlentity {return nil, nil 2110 | /* 2111 | // if this is an invalid entity, don't tag it with 'mw:Entity' 2112 | if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters * /) { 2113 | return cc; 2114 | } 2115 | return [ 2116 | new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }), 2117 | cc, 2118 | new EndTagTk('span', [], { tsr: tsrOffsets('end') }), 2119 | ]; 2120 | */ 2121 | } 2122 | 2123 | spaces <- [ \t]+ 2124 | 2125 | space <- [ \t] 2126 | 2127 | optionalSpaceToken <- space* 2128 | 2129 | // This rule corresponds to \s in the PHP preg_* functions, 2130 | // which is used frequently in the PHP parser. The inclusion of 2131 | // form feed (but not other whitespace, like vertical tab) is a quirk 2132 | // of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular 2133 | // Expressions) library. 2134 | // 2135 | space_or_newline 2136 | <- [ \t\n\r\x0c] 2137 | 2138 | // This rule corresponds to \b in the PHP preg_* functions, 2139 | // after a word character. That is, it's a zero-width lookahead that 2140 | // the next character is not a word character. 2141 | // 2142 | end_of_word 2143 | <- eof / ![A-Za-z0-9_] 2144 | 2145 | // Unicode "separator, space" category. It covers the \u0020 space as well 2146 | // as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}. 2147 | // Keep this up-to-date with the characters tagged ;Zs; in 2148 | // http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 2149 | unispace <- [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000] 2150 | 2151 | // Non-newline whitespace, including non-breaking spaces. Used for magic links. 
2152 | space_or_nbsp 2153 | <- space // includes \t 2154 | / unispace 2155 | / he:htmlentity &{ return false, nil /*return Array.isArray(he) && /^\u00A0$/.test(he[1]);*/ } 2156 | {return he, nil /*return he;*/ } 2157 | 2158 | // Used within ISBN magic links 2159 | space_or_nbsp_or_dash 2160 | <- space_or_nbsp / "-" 2161 | 2162 | // Extra newlines followed by at least another newline. Usually used to 2163 | // compress surplus newlines into a meta tag, so that they don't trigger 2164 | // paragraphs. 2165 | optionalNewlines 2166 | <- ([\n\r\t ] &[\n\r])* 2167 | 2168 | comment_or_includes <- (comment / ( 2169 | ( #{ 2170 | push(c, "sol_il", true) 2171 | return nil 2172 | } 2173 | i:include_limits 2174 | #{ 2175 | pop(c, "sol_il") 2176 | return nil 2177 | } 2178 | ) {return i, nil} 2179 | ))* 2180 | 2181 | sol <- (empty_line_with_comments / sol_prefix) comment_or_includes 2182 | 2183 | sol_prefix 2184 | <- newlineToken 2185 | / & { 2186 | return c.pos.offset == 0, nil 2187 | /* 2188 | // Use the sol flag only at the start of the input 2189 | // NOTE: Explicitly check for 'false' and not a falsy value 2190 | return endOffset() === 0 && options.sol !== false; 2191 | */ 2192 | } {return nil, nil /*return [];*/ } 2193 | 2194 | empty_line_with_comments 2195 | <- sol_prefix ("" {return "empty_line_with_comments", nil /*return endOffset();*/ }) (space* comment (space / comment)* newline)+ {return nil, nil 2196 | /* 2197 | return [ 2198 | sp, 2199 | new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], { 2200 | tokens: tu.flattenIfArray(c), 2201 | tsr: [p, endOffset()], 2202 | }), 2203 | ]; 2204 | */ 2205 | } 2206 | 2207 | comment_space <- comment / space 2208 | 2209 | nl_comment_space <- newlineToken / comment_space 2210 | 2211 | // 2212 | // noinclude / includeonly / onlyinclude rules. 
These are normally 2213 | // handled by the xmlish_tag rule, except where generic tags are not 2214 | // allowed- for example in directives, which are allowed in various attribute 2215 | // names and -values. 2216 | 2217 | // Example test case: 2218 | // {| 2219 | // |- 2220 | // foo 2221 | // 2222 | // |Hello 2223 | // |} 2224 | // 2225 | 2226 | include_limits <- 2227 | il:("<" "/"? ([oyinclude]i+ & {return false, nil 2228 | /* 2229 | var incl = n.toLowerCase(); 2230 | return incl === "noinclude" || incl === "onlyinclude" || 2231 | incl === "includeonly"; 2232 | */ 2233 | }) space_or_newline* ">" {return nil, nil 2234 | /* 2235 | var incl = name.toLowerCase(); 2236 | var dp = { tsr: tsrOffsets() }; 2237 | 2238 | // Record variant since tag is not in normalized lower case 2239 | if (name !== incl) { 2240 | dp.srcTagName = name; 2241 | } 2242 | 2243 | // End tag only 2244 | if (c) { 2245 | return new EndTagTk(name, [], dp); 2246 | } 2247 | 2248 | var restOfInput = input.substring(endOffset()); 2249 | var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:)", "m")); 2250 | 2251 | // Start tag only 2252 | if (!tagContent || !tagContent[1]) { 2253 | return new TagTk(name, [], dp); 2254 | } 2255 | 2256 | // Get the content 2257 | var inclContent = tagContent[1]; 2258 | 2259 | // Preserve SOL where necessary (for onlyinclude and noinclude) 2260 | // Note that this only works because we encounter <*include*> tags in 2261 | // the toplevel content and we rely on the php preprocessor to expand 2262 | // templates, so we shouldn't ever be tokenizing inInclude. 
2263 | // Last line should be empty (except for comments) 2264 | if (incl !== "includeonly" && stops.onStack("sol_il")) { 2265 | var last = lastItem(inclContent.split('\n')); 2266 | if (!/^()*$/.test(last)) { 2267 | return false; 2268 | } 2269 | } 2270 | 2271 | // Tokenize include content in a new tokenizer 2272 | var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent); 2273 | inclContentToks = Util.stripEOFTkfromTokens(inclContentToks); 2274 | 2275 | // Shift tsr 2276 | Util.shiftTokenTSR(inclContentToks, endOffset()); 2277 | 2278 | // Skip past content 2279 | peg$currPos += inclContent.length; 2280 | 2281 | return [new TagTk(name, [], dp)].concat(inclContentToks); 2282 | */ 2283 | }) & {return il != nil, nil /*return !!il; */ } {return il, nil /*return il; */ } 2284 | 2285 | // Start of file 2286 | sof <- & { 2287 | return c.pos.offset == 0, nil 2288 | } 2289 | 2290 | // End of file 2291 | eof <- & { 2292 | len := c.globalStore["len"].(int) 2293 | return c.pos.offset == len, nil 2294 | } 2295 | 2296 | newline <- '\n' / "\r\n" 2297 | 2298 | newlineToken <- newline {return "\n", nil/* return [new NlTk(tsrOffsets())]; */} 2299 | 2300 | eolf <- newline / eof 2301 | 2302 | comment_space_eolf <- (space+ / comment)* eolf 2303 | 2304 | // 'Preprocessor' directive- higher-level things that can occur in otherwise 2305 | // plain-text content. 2306 | directive 2307 | <- comment 2308 | / extension_tag 2309 | / tplarg_or_template 2310 | / & "-{" v:lang_variant_or_tpl {return v, nil/* return v; */} 2311 | / & "&" e:htmlentity {return e, nil/* return e; */} 2312 | / include_limits 2313 | 2314 | wikilink_preprocessor_text 2315 | <- r:( [^<[{\n\r\t|!\]}{ &-]+ 2316 | // XXX gwicke: any more chars we need to allow here? 
2317 | / !inline_breaks wr:( directive / ( !"]]" ( text_char / [!<}\]\n\r-] ) ) ) 2318 | {return wr, nil/* return wr; */} 2319 | )+ {return r, nil 2320 | /* 2321 | return tu.flattenStringlist(r); 2322 | */ 2323 | } 2324 | 2325 | extlink_preprocessor_text 2326 | // added special separator character class inline: separates url from 2327 | // description / text 2328 | <- # { push(c, "linkdesc", false); return nil 2329 | /* 2330 | // Prevent breaking on pipes when we're in a link description. 2331 | // See the test, 'Images with the "|" character in the comment'. 2332 | return stops.push('linkdesc', false); 2333 | */ 2334 | } 2335 | r:( [^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000-]+ 2336 | / !inline_breaks s:( directive / no_punctuation_char / [&|{-] ) {return s, nil/* return s; 2337 | */} 2338 | /// urlencoded_char 2339 | // !inline_breaks no_punctuation_char 2340 | / ([.:,] !(space / eolf)) 2341 | / (['] ![']) // single quotes are ok, double quotes are bad 2342 | )+ 2343 | #{ pop(c, "linkdesc"); return nil } 2344 | {return r, nil 2345 | /* 2346 | stops.pop('linkdesc'); 2347 | return tu.flattenString(r); 2348 | */ 2349 | } 2350 | 2351 | // Attribute values with preprocessor support 2352 | 2353 | // n.b. / is a permissible char in the three rules below. 2354 | // We only break on />, enforced by the negated expression. 2355 | // Hence, it isn't included in the stop set. 2356 | 2357 | // The stop set is space_or_newline and > which matches generic_att_value. 2358 | attribute_preprocessor_text 2359 | <- r:( [^{}&<|/ \t\n\r\x0c>-]+ 2360 | / !inline_breaks 2361 | !"/>" 2362 | s:( directive / [{}&<|/-] ) {return s, nil /*return s; */} 2363 | )+ {return r, nil 2364 | /* 2365 | return tu.flattenString(r); 2366 | */ 2367 | } 2368 | 2369 | // The stop set is '> which matches generic_att_value. 
2370 | attribute_preprocessor_text_single 2371 | <- r:( [^{}&<|/'>-]+ 2372 | / !inline_breaks 2373 | !"/>" 2374 | s:( directive / [{}&<|/-] ) {return s, nil/* return s; */} 2375 | )* {return r, nil 2376 | /* 2377 | return tu.flattenString(r); 2378 | */ 2379 | } 2380 | 2381 | // The stop set is "> which matches generic_att_value. 2382 | attribute_preprocessor_text_double 2383 | <- r:( [^{}&<|/">-]+ 2384 | / !inline_breaks 2385 | !"/>" 2386 | s:( directive / [{}&<|/-] ) {return s, nil/* return s; */} 2387 | )* {return r, nil 2388 | /* 2389 | return tu.flattenString(r); 2390 | */ 2391 | } 2392 | 2393 | // Variants with the entire attribute on a single line 2394 | 2395 | // n.b. ! is a permissible char in the three rules below. 2396 | // We only break on !! in th, enforced by the inline break. 2397 | // Hence, it isn't included in the stop set. 2398 | // [ is also permissible but we give a chance to break 2399 | // for the [[ special case in php's doTableStuff (See T2553). 2400 | 2401 | // The stop set is space_or_newline and | which matches table_att_value. 2402 | table_attribute_preprocessor_text 2403 | <- r:( [^{}& 0 { 24 | t.Fatalf("leaking state! %#v", p.cur.state) 25 | } 26 | } 27 | 28 | func TestConvert(t *testing.T) { 29 | log.SetFlags(log.Flags() | log.Lshortfile) 30 | 31 | cases := []struct { 32 | in string 33 | want string 34 | }{ 35 | { 36 | "Blah", 37 | "

      Blah

      ", 38 | }, 39 | { 40 | "== Test ==", 41 | "

      Test

      ", 42 | }, 43 | { 44 | "=Test=", 45 | "

      Test

      ", 46 | }, 47 | { 48 | "'''Test'''", 49 | "Test", 50 | }, 51 | { 52 | "* foo\n* nah\n* woof", 53 | "
    1. foo
    2. \n
    3. nah
    4. \n
    5. woof
    6. ", 54 | }, 55 | { 56 | "----", 57 | "
      ", 58 | }, 59 | { 60 | "{{reflink}}\n\nBlah", 61 | "

      Blah

      ", 62 | }, 63 | { 64 | "[[Jordanstown]]", 65 | `

      Jordanstown

      `, 66 | }, 67 | { 68 | "[[Jordanstown|Blah]]", 69 | `

      Blah

      `, 70 | }, 71 | { 72 | `{{Infobox basketball club 73 | | name = Ulster Elks 74 | | color1 = white 75 | | color2 = blue 76 | | logo = 77 | | arena = [[Ulster University]] Sports Centre 78 | }}`, 79 | "

      ", 80 | }, 81 | { 82 | `
      Test
      `, 83 | `

      Test

      `, 84 | }, 85 | { 86 | "Foo\nBar", 87 | "

      Foo\nBar

      ", 88 | }, 89 | { 90 | "AB", 91 | "

      AB

      ", 92 | }, 93 | } 94 | 95 | debugRules(true) 96 | 97 | for _, c := range cases { 98 | c := c 99 | t.Run(c.in, func(t *testing.T) { 100 | outBytes, err := Convert([]byte(c.in), strict()) 101 | if err != nil { 102 | t.Fatal(err) 103 | } 104 | 105 | out := string(outBytes) 106 | if out != c.want { 107 | t.Errorf("Covert(%q) = %q; not %q", c.in, out, c.want) 108 | } 109 | }) 110 | } 111 | } 112 | 113 | func TestSanitizationPolicy(t *testing.T) { 114 | cases := []struct { 115 | in string 116 | want string 117 | }{ 118 | { 119 | "
      ", 120 | "
      ", 121 | }, 122 | { 123 | "
      A
      ", 124 | "
      A
      ", 125 | }, 126 | { 127 | "", 128 | "", 129 | }, 130 | } 131 | 132 | p := wikitextPolicy() 133 | 134 | for _, c := range cases { 135 | c := c 136 | t.Run(c.in, func(t *testing.T) { 137 | doc, err := html.Parse(strings.NewReader(c.in)) 138 | if err != nil { 139 | t.Fatal(err) 140 | } 141 | t.Logf("Doc = %s", spew.Sdump(doc)) 142 | 143 | out := p.Sanitize(c.in) 144 | if out != c.want { 145 | t.Errorf("Sanitize(%q) = %q; not %q", c.in, out, c.want) 146 | } 147 | }) 148 | } 149 | } 150 | --------------------------------------------------------------------------------