├── .gitignore ├── lua └── libraryUtil.lua ├── main.go ├── readme.md ├── static ├── favicon.png ├── gopher-front.svg └── style.css ├── templates.go ├── templates ├── article.html ├── base.html ├── error.html └── source.html └── wikitext ├── debug.go ├── pegTokenizer.pegjs ├── rules_test.go ├── tokens.go ├── tokens_test.go ├── url.go ├── wikitext.go ├── wikitext.peg ├── wikitext.peg.go └── wikitext_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | *.bleve 2 | -------------------------------------------------------------------------------- /lua/libraryUtil.lua: -------------------------------------------------------------------------------- 1 | local libraryUtil = {} 2 | 3 | function libraryUtil.checkType( name, argIdx, arg, expectType, nilOk ) 4 | if arg == nil and nilOk then 5 | return 6 | end 7 | if type( arg ) ~= expectType then 8 | local msg = string.format( "bad argument #%d to '%s' (%s expected, got %s)", 9 | argIdx, name, expectType, type( arg ) 10 | ) 11 | error( msg, 3 ) 12 | end 13 | end 14 | 15 | function libraryUtil.checkTypeForIndex( index, value, expectType ) 16 | if type( value ) ~= expectType then 17 | local msg = string.format( "value for index '%s' must be %s, %s given", 18 | index, expectType, type( value ) 19 | ) 20 | error( msg, 3 ) 21 | end 22 | end 23 | 24 | function libraryUtil.makeCheckSelfFunction( libraryName, varName, selfObj, selfObjDesc ) 25 | return function ( self, method ) 26 | if self ~= selfObj then 27 | error( string.format( 28 | "%s: invalid %s. Did you call %s with a dot instead of a colon, i.e. " .. 
29 | "%s.%s() instead of %s:%s()?", 30 | libraryName, selfObjDesc, method, varName, method, varName, method 31 | ), 3 ) 32 | end 33 | end 34 | end 35 | 36 | return libraryUtil 37 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "compress/bzip2" 6 | "encoding/xml" 7 | "flag" 8 | "fmt" 9 | "html/template" 10 | "io" 11 | "log" 12 | "net/http" 13 | _ "net/http/pprof" 14 | "os" 15 | "path" 16 | "path/filepath" 17 | "strconv" 18 | "strings" 19 | "sync" 20 | 21 | "github.com/blevesearch/bleve" 22 | "github.com/creachadair/cityhash" 23 | pbzip2 "github.com/d4l3k/go-pbzip2" 24 | "github.com/d4l3k/wikigopher/wikitext" 25 | "github.com/pkg/errors" 26 | ) 27 | 28 | var ( 29 | indexFile = flag.String("index", "enwiki-latest-pages-articles-multistream-index.txt.bz2", "the index file to load") 30 | articlesFile = flag.String("articles", "enwiki-latest-pages-articles-multistream.xml.bz2", "the article dump file to load") 31 | search = flag.Bool("search", false, "whether or not to build a search index") 32 | searchIndexFile = flag.String("searchIndex", "index.bleve", "the search index file") 33 | httpAddr = flag.String("http", ":8080", "the address to bind HTTP to") 34 | ) 35 | 36 | var tmpls = map[string]*template.Template{} 37 | 38 | func loadTemplates() error { 39 | files, err := filepath.Glob("templates/*") 40 | if err != nil { 41 | return err 42 | } 43 | for _, file := range files { 44 | name := filepath.Base(file) 45 | tmpls[name], err = template.ParseFiles("templates/base.html", file) 46 | if err != nil { 47 | return err 48 | } 49 | } 50 | return nil 51 | } 52 | 53 | func executeTemplate(wr io.Writer, name string, data interface{}) error { 54 | return tmpls[name].ExecuteTemplate(wr, "base", data) 55 | } 56 | 57 | type indexEntry struct { 58 | id, seek int 59 | } 60 | 61 | var mu = struct { 62 | 
sync.Mutex 63 | 64 | offsets map[uint64]indexEntry 65 | offsetSize map[int]int 66 | }{ 67 | offsets: map[uint64]indexEntry{}, 68 | offsetSize: map[int]int{}, 69 | } 70 | var index bleve.Index 71 | 72 | func loadIndex() error { 73 | mapping := bleve.NewIndexMapping() 74 | os.RemoveAll(*searchIndexFile) 75 | var err error 76 | index, err = bleve.New(*searchIndexFile, mapping) 77 | if err != nil { 78 | return err 79 | } 80 | f, err := os.Open(*indexFile) 81 | if err != nil { 82 | return err 83 | } 84 | defer f.Close() 85 | r, err := pbzip2.NewReader(f) 86 | if err != nil { 87 | return err 88 | } 89 | defer r.Close() 90 | 91 | scanner := bufio.NewScanner(r) 92 | 93 | log.Printf("Reading index file...") 94 | i := 0 95 | for scanner.Scan() { 96 | parts := strings.Split(scanner.Text(), ":") 97 | if len(parts) < 3 { 98 | return errors.Errorf("expected at least 3 parts, got: %#v", parts) 99 | } 100 | seek, err := strconv.Atoi(parts[0]) 101 | if err != nil { 102 | return err 103 | } 104 | id, err := strconv.Atoi(parts[1]) 105 | if err != nil { 106 | return err 107 | } 108 | title := strings.Join(parts[2:], ":") 109 | entry := indexEntry{ 110 | id: id, 111 | seek: seek, 112 | } 113 | titleHash := cityhash.Hash64([]byte(title)) 114 | 115 | mu.Lock() 116 | mu.offsets[titleHash] = entry 117 | mu.offsetSize[entry.seek]++ 118 | mu.Unlock() 119 | 120 | i++ 121 | if i%100000 == 0 { 122 | log.Printf("read %d entries", i) 123 | } 124 | } 125 | if err := scanner.Err(); err != nil { 126 | return err 127 | } 128 | log.Printf("Done reading!") 129 | 130 | if !*search { 131 | return nil 132 | } 133 | 134 | /* 135 | log.Printf("Indexing titles...") 136 | i = 0 137 | batch := index.NewBatch() 138 | 139 | mu.Lock() 140 | defer mu.Unlock() 141 | 142 | for key, entry := range mu.offsets { 143 | mu.Unlock() 144 | 145 | if err := batch.Index(key, entry); err != nil { 146 | mu.Lock() 147 | return err 148 | } 149 | i++ 150 | if i%100000 == 0 { 151 | if err := index.Batch(batch); err != nil { 152 | 
mu.Lock() 153 | return err 154 | } 155 | batch.Reset() 156 | log.Printf("indexed %d entries", i) 157 | } 158 | 159 | mu.Lock() 160 | } 161 | 162 | log.Printf("Done indexing!") 163 | */ 164 | 165 | return nil 166 | } 167 | 168 | /* 169 | Example: 170 | 171 | AccessibleComputing 172 | 0 173 | 10 174 | 175 | 176 | 834079434 177 | 767284433 178 | 2018-04-03T20:38:02Z 179 | 180 | امیر اعوانی 181 | 8214454 182 | 183 | 184 | wikitext 185 | text/x-wiki 186 | #REDIRECT [[Computer accessibility]] 187 | 188 | {{Redirect category shell}} 189 | {{R from move}} 190 | {{R from CamelCase}} 191 | {{R unprintworthy}} 192 | qdiw0cwardl0qpkyeutu3pd77fwym8y 193 | 194 | 195 | */ 196 | 197 | type redirect struct { 198 | Title string `xml:"title,attr"` 199 | } 200 | 201 | type page struct { 202 | XMLName xml.Name `xml:"page"` 203 | Title string `xml:"title"` 204 | NS int `xml:"ns"` 205 | ID int `xml:"id"` 206 | Redirect []redirect `xml:"redirect"` 207 | RevisionID string `xml:"revision>id"` 208 | Timestamp string `xml:"revision>timestamp"` 209 | Username string `xml:"revision>contributor>username"` 210 | UserID string `xml:"revision>contributor>id"` 211 | Model string `xml:"revision>model"` 212 | Format string `xml:"revision>format"` 213 | Text string `xml:"revision>text"` 214 | } 215 | 216 | func readArticle(meta indexEntry) (page, error) { 217 | f, err := os.Open(*articlesFile) 218 | if err != nil { 219 | return page{}, err 220 | } 221 | defer f.Close() 222 | 223 | mu.Lock() 224 | maxTries := mu.offsetSize[meta.seek] 225 | mu.Unlock() 226 | 227 | r := bzip2.NewReader(f) 228 | 229 | if _, err := f.Seek(int64(meta.seek), 0); err != nil { 230 | return page{}, err 231 | } 232 | 233 | d := xml.NewDecoder(r) 234 | 235 | var p page 236 | for i := 0; i < maxTries; i++ { 237 | if err := d.Decode(&p); err != nil { 238 | return page{}, err 239 | } 240 | if p.ID == meta.id { 241 | return p, nil 242 | } 243 | } 244 | 245 | return page{}, errors.Errorf("failed to find page after %d tries", maxTries) 
246 | } 247 | 248 | func fetchArticle(name string) (indexEntry, error) { 249 | mu.Lock() 250 | defer mu.Unlock() 251 | 252 | articleMeta, ok := mu.offsets[cityhash.Hash64([]byte(name))] 253 | if ok { 254 | return articleMeta, nil 255 | } 256 | articleMeta, ok = mu.offsets[cityhash.Hash64([]byte(strings.Title(strings.ToLower(name))))] 257 | if ok { 258 | return articleMeta, nil 259 | } 260 | return indexEntry{}, statusErrorf(http.StatusNotFound, "article not found: %q", name) 261 | } 262 | 263 | func randomArticleHash() (uint64, error) { 264 | mu.Lock() 265 | defer mu.Unlock() 266 | 267 | for hash := range mu.offsets { 268 | return hash, nil 269 | } 270 | return 0, errors.Errorf("no articles") 271 | } 272 | 273 | func randomArticle() (page, error) { 274 | hash, err := randomArticleHash() 275 | if err != nil { 276 | return page{}, err 277 | } 278 | 279 | mu.Lock() 280 | meta := mu.offsets[hash] 281 | mu.Unlock() 282 | 283 | return readArticle(meta) 284 | } 285 | 286 | type statusError int 287 | 288 | func (s statusError) Error() string { 289 | return fmt.Sprintf("%d - %s", int(s), http.StatusText(int(s))) 290 | } 291 | 292 | func statusErrorf(code int, str string, args ...interface{}) error { 293 | return errors.Wrapf(statusError(code), str, args...) 
294 | } 295 | 296 | func errorHandler(f func(w http.ResponseWriter, r *http.Request) error) http.HandlerFunc { 297 | return func(w http.ResponseWriter, r *http.Request) { 298 | if err := f(w, r); err != nil { 299 | cause := errors.Cause(err) 300 | status := http.StatusInternalServerError 301 | if cause, ok := cause.(statusError); ok { 302 | status = int(cause) 303 | } 304 | if err := executeTemplate(w, "error.html", struct { 305 | Title, Error string 306 | }{ 307 | Title: err.Error(), 308 | Error: fmt.Sprintf("%+v", err), 309 | }); err != nil { 310 | http.Error(w, err.Error(), http.StatusInternalServerError) 311 | return 312 | } 313 | w.WriteHeader(status) 314 | } 315 | } 316 | 317 | } 318 | 319 | func handleArticle(w http.ResponseWriter, r *http.Request) error { 320 | articleName := wikitext.URLToTitle(path.Base(r.URL.Path)) 321 | 322 | if articleName == "Special:Random" { 323 | article, err := randomArticle() 324 | if err != nil { 325 | return err 326 | } 327 | http.Redirect(w, r, path.Join("/wiki/", wikitext.TitleToURL(article.Title)), http.StatusTemporaryRedirect) 328 | return nil 329 | } 330 | 331 | articleMeta, err := fetchArticle(articleName) 332 | if err != nil { 333 | return err 334 | } 335 | 336 | p, err := readArticle(articleMeta) 337 | if err != nil { 338 | return err 339 | } 340 | 341 | if p.Title != articleName { 342 | http.Redirect(w, r, path.Join("/wiki/", wikitext.TitleToURL(p.Title)), http.StatusTemporaryRedirect) 343 | return nil 344 | } 345 | 346 | body, err := wikitext.Convert( 347 | []byte(p.Text), 348 | wikitext.TemplateHandler(p.templateHandler), 349 | ) 350 | if err != nil { 351 | return err 352 | } 353 | if err := executeTemplate(w, "article.html", struct { 354 | Title string 355 | Body template.HTML 356 | }{ 357 | Title: articleName, 358 | Body: template.HTML(body), 359 | }); err != nil { 360 | return err 361 | } 362 | return nil 363 | } 364 | 365 | func handleSource(w http.ResponseWriter, r *http.Request) error { 366 | articleName := 
wikitext.URLToTitle(path.Base(r.URL.Path)) 367 | 368 | articleMeta, err := fetchArticle(articleName) 369 | if err != nil { 370 | return err 371 | } 372 | p, err := readArticle(articleMeta) 373 | if err != nil { 374 | return err 375 | } 376 | return executeTemplate(w, "source.html", p) 377 | } 378 | 379 | func handleIndex(w http.ResponseWriter, r *http.Request) error { 380 | http.Redirect(w, r, "/wiki/Main_Page", http.StatusTemporaryRedirect) 381 | return nil 382 | } 383 | 384 | func main() { 385 | if err := run(); err != nil { 386 | log.Fatalf("%+v", err) 387 | } 388 | } 389 | 390 | func run() error { 391 | flag.Parse() 392 | log.SetFlags(log.Flags() | log.Lshortfile) 393 | 394 | go func() { 395 | if err := loadIndex(); err != nil { 396 | log.Fatalf("%+v", err) 397 | } 398 | }() 399 | 400 | if err := loadTemplates(); err != nil { 401 | return err 402 | } 403 | 404 | http.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("./static")))) 405 | http.HandleFunc("/source/", errorHandler(handleSource)) 406 | http.HandleFunc("/wiki/", errorHandler(handleArticle)) 407 | http.HandleFunc("/", errorHandler(handleIndex)) 408 | 409 | log.Printf("Listening on %s...", *httpAddr) 410 | return http.ListenAndServe(*httpAddr, nil) 411 | } 412 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # wikigopher 2 | 3 | A fully self contained server that can read Wikipedia database dumps and display 4 | them. It also contains a wikitext -> html converter. 
5 | 6 | ## Install 7 | 8 | ``` 9 | $ go get -u github.com/d4l3k/wikigopher 10 | ``` 11 | 12 | ## Download Wikipedia Database Dumps 13 | 14 | You need to download the multistream article dumps 15 | 16 | * enwiki-latest-pages-articles-multistream-index.txt.bz2 17 | * enwiki-latest-pages-articles-multistream.xml.bz2 18 | 19 | from https://dumps.wikimedia.org/enwiki/latest/ 20 | 21 | You'll need to place these in the wikigopher directory or specify their location 22 | with `-index=....txt.bz2 -articles=....xml.bz2`. 23 | 24 | The multistream variants are required. The index file is a mapping between 25 | article titles and their locations in the multistream xml file. 26 | 27 | More information can be found at https://en.wikipedia.org/wiki/Wikipedia:Database_download#Where_do_I_get_it? 28 | 29 | ## License 30 | 31 | wikigopher is licensed under the MIT license. 32 | 33 | ## Attributions 34 | 35 | The gopher image used was created by Takuya Ueda (https://twitter.com/tenntenn). Licensed under the Creative Commons 3.0 Attribution license. 36 | 37 | Some CSS styles have been borrowed from MediaWiki.
38 | -------------------------------------------------------------------------------- /static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d4l3k/wikigopher/95ca9e7b979357263800dcabc8462c8ce8ef5ee6/static/favicon.png -------------------------------------------------------------------------------- /static/gopher-front.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /static/style.css: -------------------------------------------------------------------------------- 1 | html { 2 | font-size: 100%; 3 | } 4 | 5 | body { 6 | font-family: sans-serif; 7 | color: #222222; 8 | line-height: 1.6; 9 | display: flex; 10 | margin: 0; 11 | padding: 0; 12 | background-color: #f6f6f6; 13 | } 14 | 15 | nav { 16 | margin: 1.6em; 17 | } 18 | 19 | pre { 20 | white-space: pre-wrap; 21 | } 22 | 23 | h1, h2, h3, h4, h5, h6 { 24 | color: #000; 25 | background: none; 26 | font-weight: normal; 27 | margin: 0; 28 | margin-bottom: 0px; 29 | overflow: hidden; 30 | padding-top: 0.5em; 31 | padding-bottom: 0.17em; 32 | border-bottom: 1px solid #a2a9b1; 33 | } 34 | 35 | h1, h2 { 36 | font-family: 'Linux Libertine','Georgia','Times',serif; 37 | line-height: 1.3; 38 | margin-bottom: 0.25em; 39 | padding: 0; 40 | } 41 | 42 | h3, h4, h5, h6 { 43 | border-bottom: 0; 44 | font-weight: bold; 45 | } 46 | 47 | h1 { 48 | font-size: 1.8em; 49 | } 50 | 51 | h2 { 52 | margin-top: 1em; 53 | font-size: 1.5em; 54 | } 55 | 56 | h3 { 57 | font-size: 1.2em; 58 | } 59 | 60 | p { 61 | line-height: inherit; 62 | margin: 0.5em 0; 63 | } 64 | 65 | .content { 66 | margin: 2em 0; 67 | border: 1px solid #a7d7f9; 68 | border-right: none; 69 | background-color: white; 70 | padding: 1.25em 
1.5em 1.5em 1.5em; 71 | flex-grow: 10000; 72 | } 73 | 74 | .content-nav { 75 | font-size: 0.5em; 76 | font-family: sans-serif; 77 | float: right; 78 | } 79 | 80 | .body { 81 | font-size: 0.875em; 82 | position: relative; 83 | } 84 | 85 | a.external::after { 86 | content: " 🔗"; 87 | text-decoration: none; 88 | display: inline-block; 89 | margin-left: 2px; 90 | position: relative; 91 | bottom: 4px; 92 | font-size: 0.9em; 93 | } 94 | 95 | .image { 96 | border: 1px solid #c8ccd1; 97 | padding: 3px; 98 | background-color: #f8f9fa; 99 | font-size: 94%; 100 | text-align: center; 101 | overflow: hidden; 102 | width: 300px; 103 | margin: 0.5em 0 1.3em 1.4em; 104 | float: right; 105 | } 106 | 107 | a { 108 | text-decoration: none; 109 | color: #0645ad; 110 | background: none; 111 | } 112 | 113 | a:visited { 114 | color:#0b0080; 115 | } 116 | 117 | a:active { 118 | color:#faa700; 119 | } 120 | 121 | a:hover, a:focus { 122 | text-decoration: underline; 123 | } 124 | 125 | .image .caption { 126 | text-align: left; 127 | } 128 | 129 | .brand, .brand:hover, .brand:focus, .brand:active, .brand:visited { 130 | text-decoration: none; 131 | font-family: monospace; 132 | font-size: 1.5em; 133 | color: inherit; 134 | padding-bottom: 1.5em; 135 | display: block; 136 | text-align: center; 137 | } 138 | 139 | nav > * { 140 | font-size: 0.875em; 141 | } 142 | 143 | nav > a { 144 | display: block; 145 | } 146 | 147 | ref { 148 | vertical-align: super; 149 | font-size: smaller; 150 | } 151 | 152 | ref::before { 153 | content:"["; 154 | } 155 | 156 | ref::after { 157 | content:"]"; 158 | } 159 | -------------------------------------------------------------------------------- /templates.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "log" 6 | "path" 7 | "strconv" 8 | "strings" 9 | 10 | lua "github.com/Shopify/go-lua" 11 | "github.com/d4l3k/wikigopher/wikitext" 12 | "github.com/davecgh/go-spew/spew" 13 | 
"github.com/pkg/errors" 14 | ) 15 | 16 | var templateFuncs = map[string]func(attrs []wikitext.Attribute) (interface{}, error){ 17 | "ifeq": func(attrs []wikitext.Attribute) (interface{}, error) { 18 | if len(attrs) < 3 || len(attrs) > 4 { 19 | return nil, errors.Errorf("must have 3 or 4 arguments to #ifeq, got %d", len(attrs)) 20 | } 21 | 22 | var trueVal interface{} 23 | var falseVal interface{} 24 | if len(attrs) >= 3 { 25 | trueVal = attrs[2].Key 26 | } 27 | if len(attrs) == 4 { 28 | falseVal = attrs[3].Key 29 | } 30 | 31 | a := wikitext.Concat(attrs[0].Key) 32 | b := wikitext.Concat(attrs[1].Key) 33 | aVal, err := strconv.ParseFloat(a, 64) 34 | if err == nil { 35 | bVal, err := strconv.ParseFloat(b, 64) 36 | if err == nil { 37 | if aVal == bVal { 38 | return trueVal, nil 39 | } 40 | return falseVal, nil 41 | } 42 | } 43 | 44 | if a == b { 45 | return trueVal, nil 46 | } 47 | return falseVal, nil 48 | }, 49 | 50 | "if": func(attrs []wikitext.Attribute) (interface{}, error) { 51 | if len(attrs) < 2 || len(attrs) > 3 { 52 | return nil, errors.Errorf("must have 2 or 3 arguments to #if, got %d", len(attrs)) 53 | } 54 | 55 | a := strings.TrimSpace(wikitext.Concat(attrs[0].Key)) 56 | if len(a) > 0 { 57 | return attrs[1].Key, nil 58 | } 59 | if len(attrs) > 2 { 60 | return attrs[2].Key, nil 61 | } 62 | return nil, nil 63 | }, 64 | 65 | "invoke": func(attrs []wikitext.Attribute) (interface{}, error) { 66 | if len(attrs) < 1 { 67 | return nil, errors.Errorf("must have at least one attribute") 68 | } 69 | 70 | module, err := loadModule(wikitext.Concat(attrs[0])) 71 | if err != nil { 72 | return nil, err 73 | } 74 | methodName := wikitext.Concat(attrs[1]) 75 | 76 | l := lua.NewState() 77 | 78 | lua.OpenLibraries(l) 79 | /* 80 | lua.BaseOpen(l) 81 | lua.StringOpen(l) 82 | lua.MathOpen(l) 83 | lua.TableOpen(l) 84 | lua.Bit32Open(l) 85 | */ 86 | 87 | l.Global("require") 88 | l.SetGlobal("oldRequire") 89 | 90 | l.Register("require", func(l *lua.State) int { 91 | moduleName := 
lua.CheckString(l, 0) 92 | log.Printf("require called! %q", moduleName) 93 | 94 | if moduleName == "libraryUtil" { 95 | if err := lua.DoFile(l, path.Join("lua", moduleName+".lua")); err != nil { 96 | lua.Errorf(l, errors.Wrapf(err, "executing module %q", moduleName).Error()) 97 | return 0 98 | } 99 | return lua.MultipleReturns 100 | } else if strings.HasPrefix(moduleName, "Module:") { 101 | body, err := articleBody(moduleName) 102 | if err != nil { 103 | lua.Errorf(l, errors.Wrapf(err, "loading module %q", moduleName).Error()) 104 | } 105 | if err := lua.DoString(l, body); err != nil { 106 | lua.Errorf(l, errors.Wrapf(err, "executing module %q", moduleName).Error()) 107 | return 0 108 | } 109 | spew.Dump(l.ToValue(0)) 110 | spew.Dump(l.ToValue(-1)) 111 | return lua.MultipleReturns 112 | } 113 | 114 | l.Global("oldRequire") 115 | l.PushString(moduleName) 116 | l.Call(1, 1) 117 | return 1 118 | }) 119 | if err := lua.DoString(l, module); err != nil { 120 | return nil, errors.Wrapf(err, "DoString") 121 | } 122 | log.Printf("module loaded") 123 | l.Field(-1, methodName) 124 | l.PushString("args") 125 | if err := l.ProtectedCall(1, 1, 0); err != nil { 126 | return nil, errors.Wrapf(err, "calling %q", methodName) 127 | } 128 | return lua.CheckString(l, 0), nil 129 | }, 130 | } 131 | 132 | func loadModule(name string) (string, error) { 133 | name = "Module:" + name 134 | return articleBody(name) 135 | } 136 | 137 | func stripComments(code string) (string, error) { 138 | scanner := bufio.NewScanner(strings.NewReader(code)) 139 | var b strings.Builder 140 | for scanner.Scan() { 141 | line := scanner.Text() 142 | if strings.HasPrefix(line, "--") { 143 | continue 144 | } 145 | b.WriteString(line) 146 | b.WriteRune('\n') 147 | } 148 | if err := scanner.Err(); err != nil { 149 | return "", err 150 | } 151 | return b.String(), nil 152 | } 153 | 154 | func articleBody(name string) (string, error) { 155 | articleMeta, err := fetchArticle(name) 156 | if err != nil { 157 | return 
"", err 158 | } 159 | p, err := readArticle(articleMeta) 160 | if err != nil { 161 | return "", err 162 | } 163 | return p.Text, nil 164 | } 165 | 166 | func templateFuncHandler(name string, attrs []wikitext.Attribute) (interface{}, error) { 167 | f, ok := templateFuncs[name] 168 | if ok { 169 | v, err := f(attrs) 170 | if err != nil { 171 | log.Printf("Error executing func %q: %+v", name, err) 172 | return nil, err 173 | } 174 | return v, nil 175 | } 176 | return nil, errors.Errorf("unknown func: %q, args: %v", name, attrs) 177 | } 178 | 179 | func (p page) templateHandler(name string, attrs []wikitext.Attribute) (interface{}, error) { 180 | if name == "NAMESPACE" { 181 | parts := strings.Split(p.Title, ":") 182 | if len(parts) > 1 { 183 | return parts[0], nil 184 | } 185 | return nil, nil 186 | 187 | } else if name == "NUMBEROFARTICLES" { 188 | mu.Lock() 189 | defer mu.Unlock() 190 | 191 | return len(mu.offsets), nil 192 | 193 | } else if strings.HasPrefix(name, "#") { 194 | parts := strings.SplitN(name, ":", 2) 195 | if len(parts) > 1 { 196 | attrs = append([]wikitext.Attribute{ 197 | {Key: parts[1]}, 198 | }, attrs...) 
199 | } 200 | return templateFuncHandler(parts[0][1:], attrs) 201 | } 202 | 203 | /* 204 | templateBody, err := articleBody("Template:" + name) 205 | if err != nil { 206 | return nil, errors.Wrapf(err, "unknown template: %q, args: %v", name, attrs) 207 | } 208 | 209 | body, err := wikitext.Convert( 210 | []byte(templateBody), 211 | wikitext.TemplateHandler(p.templateHandler), 212 | ) 213 | if err != nil { 214 | return nil, err 215 | } 216 | doc, err := html.Parse(bytes.NewReader(body)) 217 | if err != nil { 218 | return nil, err 219 | } 220 | 221 | return doc, nil 222 | */ 223 | 224 | return nil, errors.Errorf("unknown template: %q, args: %v", name, attrs) 225 | } 226 | -------------------------------------------------------------------------------- /templates/article.html: -------------------------------------------------------------------------------- 1 | {{define "nav"}} 2 | Source 3 | {{end}} 4 | 5 | {{define "content"}} 6 | {{.Body}} 7 | {{end}} 8 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | {{ define "base" }} 2 | 3 | 4 | {{block "title" .}}{{.Title}}{{end}} - wikigopher 5 | 6 | 7 | 8 | 9 | 19 |
20 |

21 | {{block "title" .}} {{end}} 22 | 23 | {{block "nav" .}}{{end}} 24 | 25 |

26 | 27 |
28 | {{template "content" .}} 29 |
30 |
31 | 32 | 33 | {{ end }} 34 | -------------------------------------------------------------------------------- /templates/error.html: -------------------------------------------------------------------------------- 1 | {{define "title"}}Error: {{.Title}}{{end}} 2 | 3 | {{define "content"}} 4 |
{{.Error}}
5 | {{end}} 6 | -------------------------------------------------------------------------------- /templates/source.html: -------------------------------------------------------------------------------- 1 | {{define "title"}}Source: {{.Title}}{{end}} 2 | 3 | {{define "nav"}} 4 | Article 5 | {{end}} 6 | 7 | 8 | {{define "content"}} 9 |
{{.Text}}
10 | {{end}} 11 | -------------------------------------------------------------------------------- /wikitext/debug.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "log" 5 | "reflect" 6 | "runtime" 7 | ) 8 | 9 | func debugRules(compute bool) { 10 | for _, rule := range g.rules { 11 | debugExpr(rule.expr, compute) 12 | } 13 | } 14 | 15 | func debugExpr(e interface{}, compute bool) { 16 | switch e := e.(type) { 17 | case *actionExpr: 18 | oldRun := e.run 19 | name := getFunctionName(e.run) 20 | e.run = func(p *parser) (interface{}, error) { 21 | log.Printf("run %q", name) 22 | stack := p.vstack[len(p.vstack)-1] 23 | r := debugRun{ 24 | Name: name, 25 | Stack: stack, 26 | Text: string(p.cur.text), 27 | } 28 | if compute { 29 | p.vstack[len(p.vstack)-1] = shuckStack(stack) 30 | val, err := oldRun(p) 31 | if err != nil { 32 | return nil, err 33 | } 34 | p.vstack[len(p.vstack)-1] = stack 35 | r.Value = val 36 | } 37 | 38 | return r, nil 39 | } 40 | debugExpr(e.expr, compute) 41 | 42 | case *labeledExpr: 43 | debugExpr(e.expr, compute) 44 | 45 | case *expr: 46 | debugExpr(e.expr, compute) 47 | 48 | case *andExpr: 49 | debugExpr(e.expr, compute) 50 | 51 | case *notExpr: 52 | debugExpr(e.expr, compute) 53 | 54 | case *zeroOrOneExpr: 55 | debugExpr(e.expr, compute) 56 | 57 | case *zeroOrMoreExpr: 58 | debugExpr(e.expr, compute) 59 | 60 | case *oneOrMoreExpr: 61 | debugExpr(e.expr, compute) 62 | 63 | case *seqExpr: 64 | for _, e := range e.exprs { 65 | debugExpr(e, compute) 66 | } 67 | 68 | case *choiceExpr: 69 | for _, e := range e.alternatives { 70 | debugExpr(e, compute) 71 | } 72 | 73 | case *ruleRefExpr, *litMatcher, *andCodeExpr, *charClassMatcher, *anyMatcher, *notCodeExpr, *stateCodeExpr: 74 | 75 | default: 76 | log.Fatalf("debugExpr: unsupported type %T: %#v", e, e) 77 | } 78 | } 79 | 80 | // from https://stackoverflow.com/questions/7052693/how-to-get-the-name-of-a-function-in-go 81 | func 
getFunctionName(i interface{}) string { 82 | return runtime.FuncForPC(reflect.ValueOf(i).Pointer()).Name() 83 | } 84 | 85 | type debugRun struct { 86 | Name string 87 | Stack map[string]interface{} 88 | Text string 89 | Value interface{} 90 | } 91 | 92 | func shuck(v interface{}) interface{} { 93 | switch v := v.(type) { 94 | case debugRun: 95 | return v.Value 96 | 97 | case []interface{}: 98 | return shuckArr(v) 99 | 100 | default: 101 | return v 102 | } 103 | } 104 | 105 | func shuckArr(arr []interface{}) []interface{} { 106 | var out []interface{} 107 | for _, val := range arr { 108 | out = append(out, shuck(val)) 109 | } 110 | return out 111 | } 112 | 113 | func shuckStack(stack map[string]interface{}) map[string]interface{} { 114 | out := map[string]interface{}{} 115 | for k, v := range stack { 116 | out[k] = shuck(v) 117 | } 118 | return out 119 | } 120 | -------------------------------------------------------------------------------- /wikitext/pegTokenizer.pegjs: -------------------------------------------------------------------------------- 1 | /** 2 | * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several 3 | * chunks of tokens (one chunk per top-level block matched) and eventually an 4 | * end event. Tokens map to HTML tags as far as possible, with custom tokens 5 | * used where further processing on the token stream is needed. 
6 | */ 7 | { 8 | 9 | var pegIncludes = options.pegIncludes; 10 | var pegTokenizer = options.pegTokenizer; 11 | 12 | var env = pegTokenizer.env; 13 | var pipelineOpts = pegTokenizer.options; 14 | 15 | var DU = pegIncludes.DOMUtils; 16 | var Util = pegIncludes.Util; 17 | var JSUtils = pegIncludes.JSUtils; 18 | var PegTokenizer = pegIncludes.PegTokenizer; 19 | var defines = pegIncludes.defines; 20 | var constants = pegIncludes.constants; 21 | var tu = pegIncludes.tu; 22 | 23 | // define some constructor shortcuts 24 | var KV = defines.KV; 25 | var TagTk = defines.TagTk; 26 | var SelfclosingTagTk = defines.SelfclosingTagTk; 27 | var EndTagTk = defines.EndTagTk; 28 | var NlTk = defines.NlTk; 29 | var CommentTk = defines.CommentTk; 30 | var EOFTk = defines.EOFTk; 31 | var lastItem = JSUtils.lastItem; 32 | 33 | var inlineBreaks = tu.inlineBreaks; 34 | var stops = new tu.SyntaxStops(); 35 | 36 | var prevOffset = 0; 37 | 38 | // Some shorthands for legibility 39 | var startOffset = function() { 40 | return location().start.offset; 41 | }; 42 | var endOffset = function() { 43 | return location().end.offset; 44 | }; 45 | var tsrOffsets = function(flag) { 46 | return tu.tsrOffsets(location(), flag); 47 | }; 48 | 49 | /* 50 | * Emit a chunk of tokens to our consumers. Once this has been done, the 51 | * current expression can return an empty list (true). 52 | */ 53 | var emitChunk = function(tokens) { 54 | if (env.immutable) { 55 | // Tokens placed in the tokenizer's cache have been frozen to 56 | // to catch any mutations while testing, which may have led to 57 | // subtle, spooky action at a distance. 
58 | tokens = Util.unFreeze(tokens, true); 59 | } 60 | 61 | // Shift tsr of all tokens by the pipeline offset 62 | Util.shiftTokenTSR(tokens, options.pipelineOffset); 63 | env.log("trace/peg", pegTokenizer.pipelineId, "----> ", tokens); 64 | 65 | var i; 66 | var n = tokens.length; 67 | 68 | // Enforce parsing resource limits 69 | for (i = 0; i < n; i++) { 70 | tu.enforceParserResourceLimits(env, tokens[i]); 71 | } 72 | 73 | // limit the size of individual chunks 74 | var chunkLimit = 100000; 75 | if (n > chunkLimit) { 76 | i = 0; 77 | while (i < n) { 78 | options.cb(tokens.slice(i, i + chunkLimit)); 79 | i += chunkLimit; 80 | } 81 | } else { 82 | options.cb(tokens); 83 | } 84 | }; 85 | 86 | /* ------------------------------------------------------------------------ 87 | * Extension tags should be parsed with higher priority than anything else. 88 | * 89 | * The trick we use is to strip out the content inside a matching tag-pair 90 | * and not tokenize it. The content, if it needs to parsed (for example, 91 | * for , <*include*> tags), is parsed in a fresh tokenizer context 92 | * which means any error correction that needs to happen is restricted to 93 | * the scope of the extension content and doesn't spill over to the higher 94 | * level. Ex: ).)*-->/g, "") 557 | // but, as always, things around here are a little more complicated. 558 | // 559 | // We accept the same comments, but because we emit them as HTML comments 560 | // instead of deleting them, we have to encode the data to ensure that 561 | // we always emit a valid HTML5 comment. See the encodeComment helper 562 | // for further details. 563 | 564 | comment 565 | = '" .)* ('-->' / eof) { 566 | var data = DU.encodeComment(c); 567 | return [new CommentTk(data, { tsr: tsrOffsets() })]; 568 | } 569 | 570 | 571 | // Behavior switches. 
// See: https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches

// A behavior switch is a double-underscore-delimited magic word such as
// "__NOTOC__".  Known magic words (per env.conf.wiki.isMagicWord) become a
// self-closing 'behavior-switch' token; anything else is passed through
// unchanged as plain text.
behavior_switch
  = bs:$('__' behavior_text '__') {
    if (env.conf.wiki.isMagicWord(bs)) {
      return [
        new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ],
          { tsr: tsrOffsets(), src: bs, magicSrc: bs }
        ),
      ];
    } else {
      return [ bs ];
    }
  }

// Instead of defining a charset, php's doDoubleUnderscore concats a regexp of
// all the language specific aliases of the behavior switches and then does a
// match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
behavior_text = $( !'__' [^'"<~[{\n\r:;\]}|!=] )+


/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// Bare (unbracketed) links: a free-standing URL, RFC/PMID reference, or
// ISBN.  Suppressed while inside an extlink (no nesting) and requires a
// word boundary before the match.
autolink
  = ! { return stops.onStack('extlink'); }
    // this must be a word boundary, so previous character must be non-word
    ! { return /\w/.test(input[endOffset() - 1] || ''); }
    r:(
        // urllink, inlined
        target:autourl {
          var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })];
          return res;
        }
      / autoref
      / isbn) { return r; }

// Bracketed external link: [proto://target optional caption].
// Note the 'extlink' stop is pushed on entry and must be popped on BOTH
// exits (the success action and the failure alternative on the last line).
extlink "extlink"
  = ! { return stops.onStack('extlink'); } // extlink cannot be nested
    r:(
        "["
        & { return stops.push('extlink', true); }
        addr:(url_protocol urladdr / "")
        target:(extlink_preprocessor_text / "")
        & {
          // Protocol must be valid and there ought to be at least one
          // post-protocol character. So strip last char off target
          // before testing protocol.
          var flat = tu.flattenString([addr, target]);
          if (Array.isArray(flat)) {
            // There are templates present, alas.
            return flat.length > 0;
          }
          return Util.isProtocolValid(flat.slice(0, -1), env);
        }
        sp:$( space / unispace )*
        targetOff:( "" { return endOffset(); })
        content:inlineline?
        "]" {
          stops.pop('extlink');
          return [
            new SelfclosingTagTk('extlink', [
              new KV('href', tu.flattenString([addr, target])),
              new KV('mw:content', content || ''),
              new KV('spaces', sp),
            ], {
              targetOff: targetOff,
              tsr: tsrOffsets(),
              contentOffsets: [targetOff, endOffset() - 1],
            }),
          ];
        }
      / "[" & { return stops.pop('extlink'); }
    ) { return r; }

// RFC/PMID "magic links": e.g. "RFC 1234" becomes an extlink to the
// corresponding IETF/PubMed URL (see base_urls below).
autoref
  = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word
    {
      var base_urls = {
        'RFC': 'https://tools.ietf.org/html/rfc%s',
        'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract',
      };
      return [
        new SelfclosingTagTk('extlink', [
          new KV('href', tu.sprintf(base_urls[ref], identifier)),
          new KV('mw:content', tu.flattenString([ref, sp, identifier])),
          new KV('typeof', 'mw:ExtLink/' + ref),
        ],
        { stx: "magiclink", tsr: tsrOffsets() }),
      ];
    }

// ISBN magic link: digits optionally separated by spaces/nbsp/dashes,
// with an optional trailing X check digit; linked to Special:BookSources.
isbn
  = 'ISBN' sp:space_or_nbsp+ isbn:(
      [0-9]
      (s:space_or_nbsp_or_dash &[0-9] { return s; } / [0-9])+
      ((space_or_nbsp_or_dash / "") [xX] / "")
    ) isbncode:(
      end_of_word
      {
        // Convert isbn token-and-entity array to stripped string.
        return tu.flattenStringlist(isbn).filter(function(e) {
          return e.constructor === String;
        }).join('').replace(/[^\dX]/ig, '').toUpperCase();
      }
    ) &{
      // ISBNs can only be 10 or 13 digits long (with a specific format)
      return isbncode.length === 10 ||
        (isbncode.length === 13 && /^97[89]/.test(isbncode));
    } {
      return [
        new SelfclosingTagTk('extlink', [
          new KV('href', 'Special:BookSources/' + isbncode),
          new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])),
          new KV('typeof', 'mw:WikiLink/ISBN'),
        ],
        { stx: "magiclink", tsr: tsrOffsets() }),
      ];
    }


/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */

url_protocol =
    & { return Util.isProtocolValid(input.substr(endOffset()), env); }
    p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? ) { return p; }

// no punctuation, and '{<' to trigger directives
no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

// this is the general url rule
// on the PHP side, the path part matches EXT_LINK_URL_CLASS
// which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'
// the 's' and 'r' pieces below match the characters in
// EXT_LINK_URL_CLASS which aren't included in no_punctuation_char
url "url"
  = proto:url_protocol
    addr:(urladdr / "")
    path:( ( !inline_breaks
             c:no_punctuation_char
             { return c; }
           )
         / s:[.:,'] { return s; }
         / comment
         / tplarg_or_template
         / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
           r:(
               & "&" he:htmlentity { return he; }
             / [&%{]
           ) { return r; }
    )*
    // Must be at least one character after the protocol
    & { return addr.length > 0 || path.length > 0; }
    {
      return tu.flattenString([proto, addr].concat(path));
    }

// this is the somewhat-restricted rule used in autolinks
// See Parser::doMagicLinks and Parser.php::makeFreeExternalLink.
// The `path` portion matches EXT_LINK_URL_CLASS, as in the general
// url rule. As in PHP, we do some fancy fixup to yank out
// trailing punctuation, perhaps including parentheses.
// The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS
// which aren't included in no_punctuation_char
autourl
  = &{ return stops.push('autourl', { sawLParen: false }); }
    ! '//' // protocol-relative autolinks not allowed (T32269)
    r:(
      proto:url_protocol
      addr:(urladdr / "")
      path:( ( !inline_breaks
               ! "("
               c:no_punctuation_char
               { return c; }
             )
           / "(" { stops.onStack('autourl').sawLParen = true; return "("; }
           / [.:,]
           / $(['] ![']) // single quotes are ok, double quotes are bad
           / comment
           / tplarg_or_template
           / ! ( rhe:raw_htmlentity &{ return /^[<>\u00A0]$/.test(rhe); } )
             r:(
                 & "&" he:htmlentity { return he; }
               / [&%{]
             ) { return r; }
      )*
      {
        // as in Parser.php::makeFreeExternalLink, we're going to
        // yank trailing punctuation out of this match.
        var url = tu.flattenStringlist([proto, addr].concat(path));
        // only need to look at last element; HTML entities are strip-proof.
        var last = lastItem(url);
        var trim = 0;
        if (last && last.constructor === String) {
          var strip = ',;\\.:!?';
          if (!stops.onStack('autourl').sawLParen) {
            // no opening paren was seen, so a trailing ')' is punctuation too
            strip += ')';
          }
          strip = new RegExp('[' + JSUtils.escapeRegExp(strip) + ']*$');
          trim = strip.exec(last)[0].length;
          url[url.length - 1] = last.slice(0, last.length - trim);
        }
        url = tu.flattenStringlist(url);
        if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) {
          return null; // ensure we haven't stripped everything: T106945
        }
        // rewind the parser position past the punctuation we trimmed off
        peg$currPos -= trim;
        stops.pop('autourl');
        return url;
      } ) &{ return r !== null; } {return r; }
  / &{ return stops.pop('autourl'); }

// This is extracted from EXT_LINK_ADDR in Parser.php: a simplified
// expression to match an IPv6 address. The IPv4 address and "at least
// one character of a host name" portions are punted to the `path`
// component of the `autourl` and `url` productions
urladdr
  = $( "[" [0-9A-Fa-f:.]+ "]" )

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 * This is only if close has > 3 braces; otherwise we just match open
 * and close as we find them.
 */
tplarg_or_template
  = &'{{' &{
      // Refuse to recurse beyond `maxDepth` levels. Default in the PHP parser
      // is $wgMaxTemplateDepth = 40; This is to prevent crashing from
      // buggy wikitext with lots of unclosed template calls, as in
      // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094
      if (stops.onCount('templatedepth') === undefined ||
          stops.onCount('templatedepth') < env.conf.parsoid.maxDepth) {
        return true;
      } else {
        return false;
      }
    } t:tplarg_or_template_guarded { return t; }

// Disambiguates {{ vs {{{ per the precedence table above; increments the
// 'templatedepth' counter on entry and decrements it on every exit path.
tplarg_or_template_guarded
  = &{ return stops.inc('templatedepth'); }
    r:( &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return a; }
      / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [a].concat(b); }
      / a:$('{' &('{{' !'{'))? b:template { return [a].concat(b); }
      / a:broken_template { return a; }
    ) {
      stops.dec('templatedepth');
      return r;
    }
  / & { return stops.dec('templatedepth'); }

tplarg_or_template_or_bust "tplarg_or_template_or_bust"
  = r:(tplarg_or_template / .)+ { return tu.flattenIfArray(r); }

// Push the expected closer '}}' on the preproc stops stack; popTo() on
// both success and failure keeps the stack clean across backtracking.
template
  = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); })
    t:( template_preproc / &{ return stops.popTo('preproc', stopLen); } )
    { stops.popTo('preproc', stopLen); return t; }

// The PHP preprocessor maintains a single stack of "closing token we
// are currently looking for", with no backtracking. This means that
// once you see `[[ {{` you are looking only for `}}` -- if that template
// turns out to be broken you will never pop the `}}` and there is no way
// to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking
// and parses in a single pass (instead of PHP's split preprocessor/parser)
// we have to be a little more careful when we emulate this behavior.
// If we use a rule like:
//   template = "{{" tplname tplargs* "}}"?
// Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
// turns out we never find the `}}`, which involves a lot of tedious gluing
// tokens back together with fingers crossed we haven't discarded any
// significant newlines/whitespace/etc. An alternative would be a rule like:
//   broken_template = "{{" tlb
// but again, `template` is used in many different contexts; `tlb` isn't
// necessarily the right one to recursively invoke. Instead we get the
// broken template off of the PEGjs production stack by returning immediately
// after `{{`, but we leave a "broken token" on top of the preprocessor
// stops stack to indicate we're "still in" the {{ context and shouldn't
// ever inlineBreak for any closing tokens above this one. For example:
//   [[Foo{{Bar]]
// This will match as:
//   wikilink->text,template->text             --> FAILS looking for }}
//     backtracks, popping "]]" and "}}" off preproc stack
//   wikilink->text,broken_template,text       --> FAILS looking for ]]
//     backtracks, popping "]]" and "broken" off preproc stack
//   broken_wikilink,text,broken_template,text --> OK
//     with ["broken", "broken"] left on the preproc stops stack
// Note that we use stops.popTo() to make sure the preproc stack is
// cleaned up properly during backtracking, even if there were broken-FOO
// productions taken which (deliberately) left elements on the preproc stack.

// A "broken" template: we saw "{{" but no well-formed template follows.
// A 'broken' marker is pushed on the preproc stops stack and deliberately
// left there (see the long comment above) so that inline_breaks keeps
// treating us as being inside the "{{" context.
broken_template
  = &"{{" &{ return stops.push('preproc', 'broken'); }
    // for broken-template, deliberately fail to pop the preproc stops stack
    t:"{{" { return t; }

// A well-formed transclusion: {{ target | param | param ... }}.
// The degenerate "{{ }}" (braces with only whitespace) falls through to
// the final alternative and is returned as literal source text.
template_preproc
  = "{{" nl_comment_space*
    target:template_param_value
    params:(nl_comment_space* "|"
        r:( p0:("" { return endOffset(); })
            v:nl_comment_space*
            p:("" { return endOffset(); })
            &("|" / "}}")
            { return new KV('', tu.flattenIfArray(v), [p0, p0, p0, p]); } // empty argument
          / template_param
        ) { return r; }
    )*
    nl_comment_space*
    inline_breaks "}}" {
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
      var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() });
      return obj;
    } / $('{{' space_or_newline* '}}')

// Template argument {{{...}}}. Same preproc-stack protocol as `template`:
// push the expected closer, then popTo() on both success and failure so
// backtracking leaves the stack clean.
tplarg
  = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); })
    t:(tplarg_preproc / &{ return stops.popTo('preproc', stopLen); } )
    { stops.popTo('preproc', stopLen); return t; }

tplarg_preproc
  = "{{{"
    p:("" { return endOffset(); })
    target:template_param_value?
    params:(nl_comment_space* "|"
        r:( p0:("" { return endOffset(); })
            v:nl_comment_space*
            p1:("" { return endOffset(); })
            &("|" / "}}}")
            { return { tokens: v, srcOffsets: [p0, p1] }; } // empty argument
          / template_param_value
        ) { return r; }
    )*
    nl_comment_space*
    inline_breaks "}}}" {
      // Normalize each parameter into a KV carrying 4-element src offsets.
      params = params.map(function(o) {
        var s = o.srcOffsets;
        return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]);
      });
      // "{{{}}}" has no target; substitute an empty one anchored at `p`.
      if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; }
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets));
      var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() });
      return obj;
    }

// One template parameter: "name=value", a bare positional value, or an
// empty parameter (immediately followed by '|' or '}').
template_param
  = name:template_param_name
    val:(
        kEndPos:("" { return endOffset(); })
        optionalSpaceToken
        "="
        vStartPos:("" { return endOffset(); })
        optionalSpaceToken
        tpv:template_param_value? {
            return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] };
        }
    )? {
      if (val !== null) {
          if (val.value !== null) {
              // name=value
              return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
          } else {
              // name= (empty value)
              return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]);
          }
      } else {
          // positional parameter: no '=', so the text is the value
          return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]);
      }
    }
  // empty parameter
  / & [|}] {
      return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]);
  }

// Parameter name: parsed with the 'equal' stop enabled so the text stops
// at the first '='; a leading '=' yields an empty name.
template_param_name
  = & { return stops.push('equal', true); }
    tpt:(template_param_text / &'=' { return ''; })
    {
        stops.pop('equal');
        return tpt;
    }
  / & { return stops.pop('equal'); }

// Parameter value: like the name, but '=' is an ordinary character here.
template_param_value
  = & { return stops.push('equal', false); }
    tpt:template_param_text
    {
        stops.pop('equal');
        return { tokens: tpt, srcOffsets: tsrOffsets() };
    }
  / & { return stops.pop('equal'); }

template_param_text
  = & { // re-enable tables within template parameters
      stops.push('table', false);
      stops.push('extlink', false);
      stops.push('templateArg', true);
      stops.push('tableCellArg', false);
      return stops.inc('template');
    }
    il:(nested_block / newlineToken)+ {
        stops.pop('table');
        stops.pop('extlink');
        stops.pop('templateArg');
        stops.pop('tableCellArg');
        stops.dec('template');
        // il is guaranteed to be an array -- so, tu.flattenIfArray will
        // always return an array
        var r = tu.flattenIfArray(il);
        if (r.length === 1 && r[0].constructor === String) {
            r = r[0];
        }
        return r;
    }
    // failure path: undo every stop pushed/incremented by the predicate above
  / & { stops.pop('table');
        stops.pop('extlink');
        stops.pop('templateArg');
        stops.pop('tableCellArg');
        return stops.dec('template');
    }

//// Language converter block markup of language variants: -{ ... }-

// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).

lang_variant_or_tpl
  = &('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return a; }
  / a:$('-' &('{{{'+ !'{')) b:tplarg { return [a].concat(b); }
  / a:$('-' &('{{' '{{{'* !'{')) b:template { return [a].concat(b); }
  / &'-{' a:lang_variant { return a; }

// Analogous to broken_template: "-{" with no matching "}-".
broken_lang_variant
  = &{ return stops.push('preproc', 'broken'); }
    // for broken-lang-variant, deliberately fail to pop the stops stack
    r:"-{" { return r; }

lang_variant
  = stopLen:("" { return stops.push('preproc', /* -{ */ '}-'); })
    lv:(lang_variant_preproc / &{ return stops.popTo('preproc', stopLen); })
    { stops.popTo('preproc', stopLen); return lv; }
  / broken_lang_variant

lang_variant_preproc
  = lv0:("-{" { return startOffset(); })
    f:(
       &{ return env.langConverterEnabled(); }
       ff:opt_lang_variant_flags {
         // Avoid mutating cached expression results
         ff = Util.clone(ff, true);
         // if flags contains 'R', then don't treat ; or : specially inside.
         if (ff.flags) {
           ff.raw = ff.flags.has('R') || ff.flags.has('N');
         } else if (ff.variants) {
           ff.raw = true;
         }
         return ff;
       } /
       &{ return !env.langConverterEnabled(); }
       "" {
         // if language converter not enabled, don't try to parse inside.
         return { raw: true };
       }
    )
    ts:(
      &{ return f.raw; } lv:lang_variant_text { return [{ text: lv }]; }
      /
      &{ return !f.raw; } lv:lang_variant_option_list { return lv; }
    )
    inline_breaks
    lv1:("}-" { return endOffset(); }) {

      if (!env.langConverterEnabled()) {
        return [ "-{", ts[0].text.tokens, "}-" ];
      }
      var lvsrc = input.substring(lv0, lv1);
      var attribs = [];

      // Do a deep clone since we may be destructively modifying
      // (the `t[fld] = name;` below) the result of a cached expression
      ts = Util.clone(ts, true);

      ts.forEach(function(t) {
        // move token strings into KV attributes so that they are
        // properly expanded by early stages of the token pipeline
        ['text','from','to'].forEach(function(fld) {
          if (t[fld] === undefined) { return; }
          var name = 'mw:lv' + attribs.length;
          attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets));
          t[fld] = name;
        });
      });
      return [
        new SelfclosingTagTk(
          'language-variant',
          attribs,
          {
            tsr: [lv0, lv1],
            src: lvsrc,
            flags: f.flags && Array.from(f.flags).sort(),
            variants: f.variants && Array.from(f.variants).sort(),
            original: f.original,
            flagSp: f.sp,
            texts: ts,
          }),
      ];
    }

opt_lang_variant_flags
  = f:( ff:lang_variant_flags "|" { return ff; } )? {
    // Collect & separate flags and variants into a set and ordered list
    var flags = new Set();
    var variants = new Set();
    var flagList = [];
    var flagSpace = [];
    var variantList = [];
    var variantSpace = [];
    var useVariants = false;
    var internalSp = []; // internal whitespace, for round-tripping
    if (f !== null) {
      // lang_variant_flags returns arrays in reverse order.
      f.flags.reverse();
      f.sp.reverse();
      var spPtr = 0;
      f.flags.forEach(function(item) {
        if (item.flag) {
          flagSpace.push(f.sp[spPtr++]);
          flags.add(item.flag);
          flagList.push(item.flag);
          flagSpace.push(f.sp[spPtr++]);
        }
        if (item.variant) {
          variantSpace.push(f.sp[spPtr++]);
          variants.add(item.variant);
          variantList.push(item.variant);
          variantSpace.push(f.sp[spPtr++]);
        }
      });
      if (spPtr < f.sp.length) {
        // handle space after a trailing semicolon
        flagSpace.push(f.sp[spPtr]);
        variantSpace.push(f.sp[spPtr]);
      }
    }
    // Parse flags (this logic is from core/languages/ConverterRule.php
    // in the parseFlags() function)
    if (flags.size === 0 && variants.size === 0) {
      flags.add('$S');
    } else if (flags.has('R')) {
      flags = new Set(['R']); // remove other flags
    } else if (flags.has('N')) {
      flags = new Set(['N']); // remove other flags
    } else if (flags.has('-')) {
      flags = new Set(['-']); // remove other flags
    } else if (flags.has('T') && flags.size === 1) {
      flags.add('H');
    } else if (flags.has('H')) {
      // Replace A flag, and remove other flags except T and D
      var nf = new Set(['$+', 'H']);
      if (flags.has('T')) { nf.add('T'); }
      if (flags.has('D')) { nf.add('D'); }
      flags = nf;
    } else if (variants.size > 0) {
      useVariants = true;
    } else {
      if (flags.has('A')) {
        flags.add('$+');
        flags.add('$S');
      }
      if (flags.has('D')) {
        flags.delete('$S');
      }
    }
    if (useVariants) {
      return { variants: variants, original: variantList, sp: variantSpace };
    } else {
      return { flags: flags, original: flagList, sp: flagSpace };
    }
  }

lang_variant_flags
  = sp1:(space_or_newline*) f:lang_variant_flag sp2:(space_or_newline*)
    more:( ";" lang_variant_flags? )? {
      var r = more && more[1] ? more[1] : { sp: [], flags: [] };
      // Note that sp and flags are in reverse order, since we're using
      // right recursion and want to push instead of unshift.
      r.sp.push(sp2.join(''));
      r.sp.push(sp1.join(''));
      r.flags.push(f);
      return r;
    }
  / sp:(space_or_newline*) {
      return { sp: [ sp.join('') ], flags: [] };
    }

lang_variant_flag
  = f:[-+A-Z] { return { flag: f }; }
  / v:lang_variant_name { return { variant: v }; }
  / b:(!space_or_newline !nowiki [^{}|;])+ { return { bogus: b.join('') }; /* bad flag */}

lang_variant_name // language variant name, like zh, zh-cn, etc.
  = h:[a-z] t:[-a-z]+ { return h + t.join(''); }
  // Escaped otherwise-unrepresentable language names
  // Primarily for supporting html2html round trips; PHP doesn't support
  // using nowikis here (yet!)
  / nowiki_text

lang_variant_option_list
  = o:lang_variant_option rest:( ";" oo:lang_variant_option { return oo; })*
    tr:( ";" space_or_newline* )? // optional trailing semicolon
    {
      var r = [ o ].concat(rest);
      if (tr) { r.push({ semi: true, sp: tr[1].join('') }); }
      return r;
    }
  / lvtext:lang_variant_text { return [{ text: lvtext }]; }

lang_variant_option
  = sp1:(space_or_newline*) lang:lang_variant_name
    sp2:(space_or_newline*) ":"
    sp3:(space_or_newline*)
    lvtext:(lang_variant_nowiki / lang_variant_text_no_semi)
    {
      // two-way mapping: "lang : text"
      return {
        twoway: true,
        lang: lang,
        text: lvtext,
        sp: [sp1.join(''), sp2.join(''), sp3.join('')]
      };
    }
  / sp1:(space_or_newline*)
    from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
    "=>"
    sp2:(space_or_newline*) lang:lang_variant_name
    sp3:(space_or_newline*) ":"
    sp4:(space_or_newline*)
    to:(lang_variant_nowiki / lang_variant_text_no_semi)
    {
      // one-way mapping: "from => lang : to"
      return {
        oneway: true,
        from: from,
        lang: lang,
        to: to,
        sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')]
      };
    }

// html2wt support: If a language name or conversion string can't be
// represented w/o breaking wikitext, just wrap it in a <nowiki>.
// PHP doesn't support this (yet), but Parsoid does.
1239 | lang_variant_nowiki 1240 | = start:("" {return startOffset();}) 1241 | n:nowiki_text 1242 | end:("" { return endOffset();}) 1243 | space_or_newline* { 1244 | return { tokens: [ n ], srcOffsets: [start, end] }; 1245 | } 1246 | 1247 | lang_variant_text 1248 | = start:("" {return startOffset();}) 1249 | tokens:(inlineline / "|" )* 1250 | end:("" {return endOffset();}) 1251 | { return { tokens: tokens || [], srcOffsets: [start, end] }; } 1252 | 1253 | lang_variant_text_no_semi 1254 | = & { return stops.push('semicolon', true); } 1255 | lvtext:lang_variant_text 1256 | { stops.pop('semicolon'); return lvtext; } 1257 | / & { return stops.pop('semicolon'); } 1258 | 1259 | lang_variant_text_no_semi_or_arrow 1260 | = & { return stops.push('arrow', true); } 1261 | lvtext:lang_variant_text_no_semi { stops.pop('arrow'); return lvtext; } 1262 | / & { return stops.pop('arrow'); } 1263 | 1264 | wikilink_content 1265 | = ( pipe startPos:("" { return endOffset(); }) lt:link_text? { 1266 | var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]); 1267 | maybeContent.vsrc = input.substring(startPos, endOffset()); 1268 | return maybeContent; 1269 | } )* 1270 | 1271 | wikilink 1272 | = stopLen:("" { return stops.push('preproc', ']]'); }) 1273 | w:(wikilink_preproc / &{ return stops.popTo('preproc', stopLen); }) 1274 | { stops.popTo('preproc', stopLen); return w; } 1275 | / broken_wikilink 1276 | 1277 | // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the 1278 | // second bracket could start an extlink. Deliberately leave entry 1279 | // on preproc stack since we haven't seen a double-close bracket. 1280 | // (See full explanation above broken_template production.) 1281 | broken_wikilink 1282 | = &"[[" &{ return stops.push('preproc', 'broken'); } 1283 | a:("[" (extlink / "[")) { return a; } 1284 | 1285 | wikilink_preproc 1286 | = "[[" 1287 | target:wikilink_preprocessor_text? 
1288 | tpos:("" { return endOffset(); }) 1289 | lcs:wikilink_content 1290 | inline_breaks "]]" 1291 | { 1292 | var pipeTrick = (lcs.length === 1 && lcs[0].v === null); 1293 | var textTokens = []; 1294 | if (target === null || pipeTrick) { 1295 | textTokens.push("[["); 1296 | if (target) { 1297 | textTokens.push(target); 1298 | } 1299 | lcs.forEach(function(a) { 1300 | // a is a mw:maybeContent attribute 1301 | textTokens.push("|"); 1302 | if (a.v !== null) { textTokens.push(a.v); } 1303 | }); 1304 | textTokens.push("]]"); 1305 | return textTokens; 1306 | } 1307 | var obj = new SelfclosingTagTk('wikilink'); 1308 | var hrefKV = new KV('href', target); 1309 | hrefKV.vsrc = input.substring(startOffset() + 2, tpos); 1310 | // XXX: Point to object with path, revision and input information 1311 | // obj.source = input; 1312 | obj.attribs.push(hrefKV); 1313 | obj.attribs = obj.attribs.concat(lcs); 1314 | obj.dataAttribs = { 1315 | tsr: tsrOffsets(), 1316 | src: text(), 1317 | }; 1318 | return [obj]; 1319 | } 1320 | 1321 | // Tables are allowed inside image captions. 1322 | link_text 1323 | = & { 1324 | // Suppress the flag temporarily in this rule to consume the '=' here. 1325 | stops.push('equal', false); 1326 | return stops.push('linkdesc', true); 1327 | } 1328 | c:( // This group is similar to "block_line" but "list_item" 1329 | // is omitted since `doBlockLevels` happens after 1330 | // `replaceInternalLinks2`, where newlines are stripped. 1331 | (sol (heading / hr / full_table_in_link_caption)) 1332 | / urltext 1333 | / (!inline_breaks 1334 | r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return r; } 1335 | ) 1336 | )+ { 1337 | stops.pop('equal'); 1338 | stops.pop('linkdesc'); 1339 | return tu.flattenStringlist(c); 1340 | } 1341 | / & { stops.pop('equal'); return stops.pop('linkdesc'); } 1342 | 1343 | /* Generic quote rule for italic and bold, further processed in a token 1344 | * stream transformation in doQuotes. 
Relies on NlTk tokens being emitted 1345 | * for each line of text to balance quotes per line. 1346 | * 1347 | * We are not using a simple pair rule here as we need to support mis-nested 1348 | * bolds/italics and MediaWiki's special heuristics for apostrophes, which are 1349 | * all not context free. */ 1350 | quote = quotes:$("''" "'"*) { 1351 | // sequences of four or more than five quotes are assumed to start 1352 | // with some number of plain-text apostrophes. 1353 | var plainticks = 0; 1354 | var result = []; 1355 | if (quotes.length === 4) { 1356 | plainticks = 1; 1357 | } else if (quotes.length > 5) { 1358 | plainticks = quotes.length - 5; 1359 | } 1360 | if (plainticks > 0) { 1361 | result.push(quotes.substring(0, plainticks)); 1362 | } 1363 | // mw-quote token Will be consumed in token transforms 1364 | var tsr = tsrOffsets(); 1365 | tsr[0] += plainticks; 1366 | var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr }); 1367 | mwq.value = quotes.substring(plainticks); 1368 | result.push(mwq); 1369 | return result; 1370 | } 1371 | 1372 | 1373 | /*********************************************************** 1374 | * Pre and xmlish tags 1375 | ***********************************************************/ 1376 | 1377 | extension_tag = 1378 | &{ return !stops.onStack('extTag'); } 1379 | extToken:xmlish_tag 1380 | // Account for `maybeExtensionTag` returning unmatched start / end tags 1381 | &{ return extToken.name === 'extension'; } 1382 | { return extToken; } 1383 | 1384 | nowiki 1385 | = extToken:extension_tag 1386 | &{ return extToken.getAttribute('name') === 'nowiki'; } 1387 | { return extToken; } 1388 | 1389 | // Used by nowiki extension to tokenize html entities. 1390 | nowiki_content "nowiki_content" 1391 | = c:(htmlentity / .)* { return tu.flattenIfArray(c); } 1392 | 1393 | // Used by lang_variant productions to protect special language names or 1394 | // conversion strings. 
1395 | nowiki_text 1396 | = extToken:nowiki 1397 | { 1398 | var txt = Util.getExtArgInfo(extToken).dict.body.extsrc; 1399 | return Util.decodeEntities(txt); 1400 | } 1401 | 1402 | /* Generic XML-like tags 1403 | * 1404 | * These also cover extensions (including Cite), which will hook into the 1405 | * token stream for further processing. The content of extension tags is 1406 | * parsed as regular inline, but the source positions of the tag are added 1407 | * to allow reconstructing the unparsed text from the input. */ 1408 | 1409 | // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1410 | // following paragraphs. 1411 | tag_name_chars = [^\t\n\v />\0] 1412 | tag_name = $([A-Za-z] tag_name_chars*) 1413 | 1414 | xmlish_tag 1415 | = & { 1416 | // By the time we get to `doTableStuff` in the php parser, we've already 1417 | // safely encoded element attributes. See 55313f4e in core. 1418 | stops.push('table', false); 1419 | stops.push('tableCellArg', false); 1420 | return true; 1421 | } 1422 | "<" end:"/"? 1423 | name:$(tn:tag_name & { 1424 | return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed. 1425 | }) 1426 | attribs:generic_newline_attributes 1427 | space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff 1428 | selfclose:"/"? 1429 | space* // not preserved - canonicalized on RT via dirty diff 1430 | ">" { 1431 | stops.pop('table'); 1432 | stops.pop('tableCellArg'); 1433 | stops.pop('extTag'); 1434 | 1435 | var lcName = name.toLowerCase(); 1436 | 1437 | // Extension tags don't necessarily have the same semantics as html tags, 1438 | // so don't treat them as void elements. 1439 | var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName); 1440 | 1441 | // Support
1442 | if (lcName === 'br' && end) { 1443 | end = null; 1444 | } 1445 | 1446 | var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets()); 1447 | 1448 | // change up data-attribs in one scenario 1449 | // void-elts that aren't self-closed ==> useful for accurate RT-ing 1450 | if (!selfclose && isVoidElt) { 1451 | res.dataAttribs.selfClose = undefined; 1452 | res.dataAttribs.noClose = true; 1453 | } 1454 | 1455 | return maybeExtensionTag(res); 1456 | } 1457 | / "<" "/"? tag_name & { return stops.pop('extTag'); } 1458 | / & { stops.pop('table'); return stops.pop('tableCellArg'); } 1459 | 1460 | /* 1461 | * A variant of xmlish_tag, but also checks if the tag name is a block-level 1462 | * tag as defined in 1463 | * http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1464 | * following paragraphs. 1465 | */ 1466 | block_tag 1467 | = & { 1468 | // By the time we get to `doTableStuff` in the php parser, we've already 1469 | // safely encoded element attributes. See 55313f4e in core. 1470 | stops.push('table', false); 1471 | stops.push('tableCellArg', false); 1472 | return true; 1473 | } 1474 | "<" end:"/"? 1475 | name:$(tn:tag_name & { 1476 | return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed. 1477 | }) 1478 | attribs:generic_newline_attributes 1479 | space_or_newline* 1480 | selfclose:"/"? 1481 | ">" { 1482 | stops.pop('table'); 1483 | stops.pop('tableCellArg'); 1484 | stops.pop('extTag'); 1485 | var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets()); 1486 | return [maybeExtensionTag(t)]; 1487 | } 1488 | / "<" "/"? tag_name & { return stops.pop('extTag'); } 1489 | / & { stops.pop('table'); return stops.pop('tableCellArg'); } 1490 | 1491 | // A generic attribute that can span multiple lines. 
1492 | generic_newline_attribute 1493 | = s:space_or_newline* 1494 | namePos0:("" { return endOffset(); }) 1495 | name:generic_attribute_name 1496 | namePos:("" { return endOffset(); }) 1497 | vd:(space_or_newline* "=" v:generic_att_value? { return v; })? 1498 | { 1499 | // NB: Keep in sync w/ table_attibute 1500 | var res; 1501 | // Encapsulate protected attributes. 1502 | if (typeof name === 'string') { 1503 | name = tu.protectAttrs(name); 1504 | } 1505 | if (vd !== null) { 1506 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1507 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1508 | } else { 1509 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1510 | } 1511 | if (Array.isArray(name)) { 1512 | res.ksrc = input.substring(namePos0, namePos); 1513 | } 1514 | return res; 1515 | } 1516 | 1517 | // A single-line attribute. 1518 | table_attribute 1519 | = s:optionalSpaceToken 1520 | namePos0:("" { return endOffset(); }) 1521 | name:table_attribute_name 1522 | namePos:("" { return endOffset(); }) 1523 | vd:(optionalSpaceToken "=" v:table_att_value? { return v; })? 1524 | { 1525 | // NB: Keep in sync w/ generic_newline_attribute 1526 | var res; 1527 | // Encapsulate protected attributes. 1528 | if (typeof name === 'string') { 1529 | name = tu.protectAttrs(name); 1530 | } 1531 | if (vd !== null) { 1532 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1533 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1534 | } else { 1535 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1536 | } 1537 | if (Array.isArray(name)) { 1538 | res.ksrc = input.substring(namePos0, namePos); 1539 | } 1540 | return res; 1541 | } 1542 | 1543 | // The arrangement of chars is to emphasize the split between what's disallowed 1544 | // by html5 and what's necessary to give directive a chance. 
1545 | // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 1546 | generic_attribute_name 1547 | = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1548 | r:( $[^ \t\r\n\0/=><&{}\-!|]+ 1549 | / !inline_breaks 1550 | // \0/=> is the html5 attribute name set we do not want. 1551 | t:( directive / !( space_or_newline / [\0/=>] ) c:. { return c; } 1552 | ) { return t; } 1553 | )* 1554 | & { return r.length > 0 || q.length > 0; } 1555 | { return tu.flattenString([q].concat(r)); } 1556 | 1557 | // Also accept these chars in a wikitext table or tr attribute name position. 1558 | // They are normally not matched by the table_attribute_name. 1559 | broken_table_attribute_name_char = c:[\0/=>] { return new KV(c, ''); } 1560 | 1561 | // Same as generic_attribute_name, except for accepting tags and wikilinks. 1562 | // (That doesn't make sense (ie. match php) in the generic case.) 1563 | // We also give a chance to break on \[ (see T2553). 1564 | table_attribute_name 1565 | = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1566 | r:( $[^ \t\r\n\0/=><&{}\-!|\[]+ 1567 | / !inline_breaks 1568 | // \0/=> is the html5 attribute name set we do not want. 1569 | t:( $wikilink 1570 | / directive 1571 | // Accept insane tags-inside-attributes as attribute names. 1572 | // The sanitizer will strip and shadow them for roundtripping. 1573 | // Example: generated with.. 1574 | / &xmlish_tag ill:inlineline { return ill; } 1575 | / !( space_or_newline / [\0/=>] ) c:. { return c; } 1576 | ) { return t; } 1577 | )* 1578 | & { return r.length > 0 || q.length > 0; } 1579 | { return tu.flattenString([q].concat(r)); } 1580 | 1581 | // Attribute value, quoted variants can span multiple lines. 1582 | // Missing end quote: accept /> look-ahead as heuristic. 1583 | // These need to be kept in sync with the attribute_preprocessor_text_* 1584 | generic_att_value 1585 | = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? 
q:$("'" / &('/'? '>')) { 1586 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1587 | } 1588 | / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) { 1589 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1590 | } 1591 | / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { 1592 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1593 | } 1594 | 1595 | // Attribute value, restricted to a single line. 1596 | // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. 1597 | // These need to be kept in sync with the table_attribute_preprocessor_text_* 1598 | table_att_value 1599 | = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) { 1600 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1601 | } 1602 | / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) { 1603 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1604 | } 1605 | / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { 1606 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1607 | } 1608 | 1609 | /********************************************************* 1610 | * Lists 1611 | *********************************************************/ 1612 | list_item = dtdd / hacky_dl_uses / li 1613 | 1614 | li = bullets:list_char+ 1615 | c:inlineline? 1616 | // The inline_break is to check if we've hit a template end delimiter. 
1617 | &(eolf / inline_breaks) 1618 | { 1619 | // Leave bullets as an array -- list handler expects this 1620 | var tsr = tsrOffsets('start'); 1621 | tsr[1] += bullets.length; 1622 | var li = new TagTk('listItem', [], { tsr: tsr }); 1623 | li.bullets = bullets; 1624 | return [ li ].concat(c || []); 1625 | } 1626 | 1627 | /* 1628 | * This rule is required to support wikitext of this form 1629 | * ::{|border="1"|foo|bar|baz|} 1630 | * where the leading colons are used to indent the entire table. 1631 | * This hack was added back in 2006 in commit 1632 | * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl 1633 | * Fürstenberg. 1634 | */ 1635 | hacky_dl_uses = bullets:":"+ 1636 | tbl:(table_line (sol table_line)*) 1637 | line:inlineline? 1638 | &comment_space_eolf 1639 | { 1640 | // Leave bullets as an array -- list handler expects this 1641 | var tsr = tsrOffsets('start'); 1642 | tsr[1] += bullets.length; 1643 | var li = new TagTk('listItem', [], { tsr: tsr }); 1644 | li.bullets = bullets; 1645 | return tu.flattenIfArray([li, tbl || [], line || []]); 1646 | } 1647 | 1648 | dtdd 1649 | = bullets:(!(";" !list_char) lc:list_char { return lc; })* 1650 | ";" 1651 | & {return stops.inc('colon');} 1652 | c:inlineline? 1653 | cpos:(":" { return endOffset(); }) 1654 | // Fortunately dtdds cannot be nested, so we can simply set the flag 1655 | // back to 0 to disable it. 1656 | & { stops.counters.colon = 0; return true;} 1657 | d:inlineline? 
1658 | &eolf { 1659 | // Leave bullets as an array -- list handler expects this 1660 | // TSR: +1 for the leading ";" 1661 | var numBullets = bullets.length + 1; 1662 | var tsr = tsrOffsets('start'); 1663 | tsr[1] += numBullets; 1664 | var li1 = new TagTk('listItem', [], { tsr: tsr }); 1665 | li1.bullets = bullets.slice(); 1666 | li1.bullets.push(";"); 1667 | // TSR: -1 for the intermediate ":" 1668 | var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); 1669 | li2.bullets = bullets.slice(); 1670 | li2.bullets.push(":"); 1671 | 1672 | return [ li1 ].concat(c || [], [ li2 ], d || []); 1673 | } 1674 | // Fall-back case to clear the colon flag 1675 | / & { stops.counters.colon = 0; return false; } 1676 | 1677 | 1678 | list_char = [*#:;] 1679 | 1680 | 1681 | 1682 | /****************************************************************************** 1683 | * Tables 1684 | * ------ 1685 | * Table rules are geared to support independent parsing of fragments in 1686 | * templates (the common table start / row / table end use case). The tokens 1687 | * produced by these fragments then match up to a table while building the 1688 | * DOM tree. For similar reasons, table rows do not emit explicit end tag 1689 | * tokens. 1690 | * 1691 | * The separate table_line rule is faster than moving those rules 1692 | * directly to block_lines. 1693 | * 1694 | * Notes about the full_table_in_link_caption rule 1695 | * ----------------------------------------------------- 1696 | * However, for link-tables, we have introduced a stricter parse wherein 1697 | * we require table-start and table-end tags to not come from a template. 1698 | * In addition, this new rule doesn't accept fosterable-content in 1699 | * the table unlike the more lax (sol table_line)+ rule. 1700 | * 1701 | * This is the best we can do at this time since we cannot distinguish 1702 | * between table rows and image options entirely in the tokenizer. 
1703 | * 1704 | * Consider the following examples: 1705 | * 1706 | * Example 1: 1707 | * 1708 | * [[Image:Foo.jpg|left|30px|Example 1 1709 | * {{This-template-returns-a-table-start-tag}} 1710 | * |foo 1711 | * {{This-template-returns-a-table-end-tag}} 1712 | * ]] 1713 | * 1714 | * Example 2: 1715 | * 1716 | * [[Image:Foo.jpg|left|30px|Example 1 1717 | * {{echo|a}} 1718 | * |foo 1719 | * {{echo|b}} 1720 | * ]] 1721 | * 1722 | * So, we cannot know a priori (without preprocessing or fully expanding 1723 | * all templates) if "|foo" in the two examples is a table cell or an image 1724 | * option. This is a limitation of our tokenizer-based approach compared to 1725 | * the preprocessing-based approach of the PHP parser. 1726 | * 1727 | * Given this limitation, we are okay forcing a full-table context in 1728 | * link captions (if necessary, we can relax the fosterable-content requirement 1729 | * but that is broken wikitext anyway, so we can force that edge-case wikitext 1730 | * to get fixed by rejecting it). 1731 | ******************************************************************************/ 1732 | 1733 | full_table_in_link_caption 1734 | = (! inline_breaks / & '{{!}}' ) 1735 | r:( 1736 | // Note that "linkdesc" is suppressed here to provide a nested parsing 1737 | // context in which to parse the table. Otherwise, we may break on 1738 | // on pipes in the `table_start_tag` and `table_row_tag` attributes. 1739 | // However, as a result, this can be more permissive than the current 1740 | // php implementation, but likelier to match the users intent. 1741 | & { stops.push('linkdesc', false); return stops.push('table', true); } 1742 | tbl:( 1743 | table_start_tag optionalNewlines 1744 | // Accept multiple end tags since a nested table may have been 1745 | // opened in the table content line. 
1746 | ((sol (table_content_line / tplarg_or_template) optionalNewlines)* 1747 | sol table_end_tag)+ 1748 | ){ 1749 | stops.pop('linkdesc'); 1750 | stops.pop('table'); 1751 | return tbl; 1752 | } 1753 | / & { stops.pop('linkdesc'); return stops.pop('table'); } 1754 | ) { return r; } 1755 | 1756 | // This rule assumes start-of-line position! 1757 | table_line 1758 | = (! inline_breaks / & '{{!}}' ) 1759 | r:( 1760 | & { return stops.push('table', true); } 1761 | tl:( 1762 | table_start_tag optionalNewlines 1763 | / table_content_line optionalNewlines 1764 | / table_end_tag 1765 | ) { 1766 | stops.pop('table'); 1767 | return tl; 1768 | } 1769 | / & { return stops.pop('table'); } 1770 | ) { return r; } 1771 | 1772 | table_content_line = (space / comment)* ( 1773 | table_heading_tags 1774 | / table_row_tag 1775 | / table_data_tags 1776 | / table_caption_tag 1777 | ) 1778 | 1779 | table_start_tag "table_start_tag" 1780 | = sc:(space / comment)* startPos:("" { return endOffset(); }) b:"{" p:pipe 1781 | // ok to normalize away stray |} on rt (see T59360) 1782 | & { return stops.push('table', false); } 1783 | ta:table_attributes 1784 | tsEndPos:("" { stops.pop('table'); return endOffset(); }) 1785 | { 1786 | var coms = tu.popComments(ta); 1787 | if (coms) { 1788 | tsEndPos = coms.commentStartPos; 1789 | } 1790 | 1791 | var da = { tsr: [startPos, tsEndPos] }; 1792 | if (p !== "|") { 1793 | // Variation from default 1794 | da.startTagSrc = b + p; 1795 | } 1796 | 1797 | sc.push(new TagTk('table', ta, da)); 1798 | if (coms) { 1799 | sc = sc.concat(coms.buf); 1800 | } 1801 | return sc; 1802 | } 1803 | 1804 | // FIXME: Not sure if we want to support it, but this should allow columns. 1805 | table_caption_tag 1806 | // avoid recursion via nested_block_in_table 1807 | = ! { return stops.onStack('tableDataBlock'); } 1808 | p:pipe "+" 1809 | args:row_syntax_table_args? 
1810 | tagEndPos:("" { return endOffset(); }) 1811 | c:nested_block_in_table* { 1812 | return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true); 1813 | } 1814 | 1815 | table_row_tag 1816 | = // avoid recursion via nested_block_in_table 1817 | ! { return stops.onStack('tableDataBlock'); } 1818 | p:pipe dashes:$"-"+ 1819 | & { return stops.push('table', false); } 1820 | a:table_attributes 1821 | tagEndPos:("" { stops.pop('table'); return endOffset(); }) 1822 | { 1823 | var coms = tu.popComments(a); 1824 | if (coms) { 1825 | tagEndPos = coms.commentStartPos; 1826 | } 1827 | 1828 | var da = { 1829 | tsr: [ startOffset(), tagEndPos ], 1830 | startTagSrc: p + dashes, 1831 | }; 1832 | 1833 | // We rely on our tree builder to close the row as needed. This is 1834 | // needed to support building tables from fragment templates with 1835 | // individual cells or rows. 1836 | var trToken = new TagTk('tr', a, da); 1837 | 1838 | var res = [ trToken ]; 1839 | if (coms) { 1840 | res = res.concat(coms.buf); 1841 | } 1842 | return res; 1843 | } 1844 | 1845 | tds 1846 | = ( pp:( pipe_pipe / p:pipe & row_syntax_table_args { return p; } ) 1847 | tdt:table_data_tag { 1848 | var da = tdt[0].dataAttribs; 1849 | da.stx = "row"; 1850 | da.tsr[0] -= pp.length; // include "||" 1851 | if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) { 1852 | // Variation from default 1853 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1854 | } 1855 | return tdt; 1856 | } 1857 | )* 1858 | 1859 | table_data_tags 1860 | // avoid recursion via nested_block_in_table 1861 | = ! 
{ return stops.onStack('tableDataBlock'); } 1862 | p:pipe 1863 | ![+-] td:table_data_tag 1864 | tagEndPos:("" { return endOffset(); }) 1865 | tds:tds { 1866 | var da = td[0].dataAttribs; 1867 | da.tsr[0] -= p.length; // include "|" 1868 | if (p !== "|") { 1869 | // Variation from default 1870 | da.startTagSrc = p; 1871 | } 1872 | return td.concat(tds); 1873 | } 1874 | 1875 | table_data_tag 1876 | = ! "}" 1877 | arg:row_syntax_table_args? 1878 | // use inline_breaks to break on tr etc 1879 | tagEndPos:("" { return endOffset(); }) 1880 | td:nested_block_in_table* 1881 | { 1882 | return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); 1883 | } 1884 | 1885 | table_heading_tags 1886 | = "!" 1887 | & { return stops.push('th', endOffset()); } 1888 | th:table_heading_tag 1889 | ths:( pp:("!!" / pipe_pipe) tht:table_heading_tag { 1890 | var da = tht[0].dataAttribs; 1891 | da.stx = 'row'; 1892 | da.tsr[0] -= pp.length; // include "!!" or "||" 1893 | 1894 | if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) { 1895 | // Variation from default 1896 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1897 | } 1898 | return tht; 1899 | } 1900 | )* { 1901 | stops.pop('th'); 1902 | th[0].dataAttribs.tsr[0]--; // include "!" 1903 | return th.concat(ths); 1904 | } 1905 | / & { return stops.onStack('th') !== false ? stops.pop('th') : false; } 1906 | 1907 | table_heading_tag 1908 | = arg:row_syntax_table_args? 1909 | tagEndPos:("" { return endOffset(); }) 1910 | c:( & { 1911 | // This SyntaxStop is only true until we hit the end of the line. 1912 | if (stops.onStack('th') !== false && 1913 | /\n/.test(input.substring(stops.onStack('th'), endOffset()))) { 1914 | // There's been a newline. Remove the break and continue 1915 | // tokenizing nested_block_in_tables. 
1916 | stops.pop('th'); 1917 | } 1918 | return true; 1919 | } d:nested_block_in_table { return d; } )* { 1920 | return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); 1921 | } 1922 | 1923 | table_end_tag 1924 | = sc:(space / comment)* startPos:("" { return endOffset(); }) p:pipe b:"}" { 1925 | var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] }); 1926 | if (p !== "|") { 1927 | // p+"" is triggering some bug in pegJS 1928 | // I cannot even use that expression in the comment! 1929 | tblEnd.dataAttribs.endTagSrc = p + b; 1930 | } 1931 | return sc.concat([tblEnd]); 1932 | } 1933 | 1934 | /** 1935 | * Table parameters separated from the content by a single pipe. Does *not* 1936 | * match if followed by double pipe (row-based syntax). 1937 | */ 1938 | row_syntax_table_args 1939 | = & { return stops.push('tableCellArg', true); } 1940 | as:table_attributes s:space* p:pipe !pipe { 1941 | stops.pop('tableCellArg'); 1942 | return [as, s, p]; 1943 | } 1944 | / & { return stops.pop('tableCellArg'); } 1945 | 1946 | 1947 | /******************************************************************* 1948 | * Text variants and other general rules 1949 | *******************************************************************/ 1950 | 1951 | /* All chars that cannot start syntactic structures in the middle of a line 1952 | * XXX: ] and other end delimiters should probably only be activated inside 1953 | * structures to avoid unnecessarily leaving the text rule on plain 1954 | * content. 1955 | * 1956 | * TODO: Much of this is should really be context-dependent (syntactic 1957 | * flags). The wikilink_preprocessor_text rule is an example where 1958 | * text_char is not quite right and had to be augmented. Try to minimize / 1959 | * clarify this carefully! 
1960 | */ 1961 | 1962 | text_char = [^-'<~[{\n\r:;\]}|!=] 1963 | 1964 | /* Legend 1965 | * ' quotes (italic/bold) 1966 | * < start of xmlish_tag 1967 | * ~ signatures/dates 1968 | * [ start of links 1969 | * { start of parser functions, transclusion and template args 1970 | * \n all sort of block-level markup at start of line 1971 | * \r ditto 1972 | * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC) 1973 | * 1974 | * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) 1975 | * ! and | table cell delimiters, might be better to specialize those 1976 | * = headings - also specialize those! 1977 | * 1978 | * The following chars are also included for now, but only apply in some 1979 | * contexts and should probably be enabled only in those: 1980 | * : separate definition in ; term : definition 1981 | * ] end of link 1982 | * } end of parser func/transclusion/template arg 1983 | * - start of lang_variant -{ ... }- 1984 | * ; separator in lang_variant 1985 | */ 1986 | 1987 | urltext = ( $[^-'<~[{\n/A-Za-z_|!:;\]} &=]+ 1988 | / & [/A-Za-z] al:autolink { return al; } 1989 | / & "&" he:htmlentity { return he; } 1990 | // Convert trailing space into   1991 | // XXX: This should be moved to a serializer 1992 | // This is a hack to force a whitespace display before the colon 1993 | / ' ' & ':' { 1994 | var toks = Util.placeholder('\u00a0', { 1995 | src: ' ', 1996 | tsr: tsrOffsets('start'), 1997 | isDisplayHack: true, 1998 | }, { tsr: tsrOffsets('end'), isDisplayHack: true }); 1999 | var typeOf = toks[0].getAttribute('typeof'); 2000 | toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf); 2001 | return toks; 2002 | } 2003 | / & ('__') bs:behavior_switch { return bs; } 2004 | // About 96% of text_char calls originate here. 2005 | // pegjs 0.8 inlines this simple rule automatically. 
2006 | / text_char )+ 2007 | 2008 | raw_htmlentity = m:$("&" [#0-9a-zA-Z]+ ";") { 2009 | return Util.decodeEntities(m); 2010 | } 2011 | 2012 | htmlentity = cc:raw_htmlentity { 2013 | // if this is an invalid entity, don't tag it with 'mw:Entity' 2014 | if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters */) { 2015 | return cc; 2016 | } 2017 | return [ 2018 | new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }), 2019 | cc, 2020 | new EndTagTk('span', [], { tsr: tsrOffsets('end') }), 2021 | ]; 2022 | } 2023 | 2024 | spaces 2025 | = $[ \t]+ 2026 | 2027 | space = [ \t] 2028 | 2029 | optionalSpaceToken 2030 | = s:$space* { 2031 | if (s.length) { 2032 | return [s]; 2033 | } else { 2034 | return []; 2035 | } 2036 | } 2037 | 2038 | /* This rule corresponds to \s in the PHP preg_* functions, 2039 | * which is used frequently in the PHP parser. The inclusion of 2040 | * form feed (but not other whitespace, like vertical tab) is a quirk 2041 | * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular 2042 | * Expressions) library. 2043 | */ 2044 | space_or_newline 2045 | = [ \t\n\r\x0c] 2046 | 2047 | /* This rule corresponds to \b in the PHP preg_* functions, 2048 | * after a word character. That is, it's a zero-width lookahead that 2049 | * the next character is not a word character. 2050 | */ 2051 | end_of_word 2052 | = eof / ![A-Za-z0-9_] 2053 | 2054 | // Unicode "separator, space" category. It covers the \u0020 space as well 2055 | // as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}. 2056 | // Keep this up-to-date with the characters tagged ;Zs; in 2057 | // http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 2058 | unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000] 2059 | 2060 | // Non-newline whitespace, including non-breaking spaces. Used for magic links. 
2061 | space_or_nbsp 2062 | = space // includes \t 2063 | / unispace 2064 | / he:htmlentity &{ return Array.isArray(he) && /^\u00A0$/.test(he[1]); } 2065 | { return he; } 2066 | 2067 | // Used within ISBN magic links 2068 | space_or_nbsp_or_dash 2069 | = space_or_nbsp / "-" 2070 | 2071 | // Extra newlines followed by at least another newline. Usually used to 2072 | // compress surplus newlines into a meta tag, so that they don't trigger 2073 | // paragraphs. 2074 | optionalNewlines 2075 | = spc:$([\n\r\t ] &[\n\r])* { 2076 | if (spc.length) { 2077 | return [spc]; 2078 | } else { 2079 | return []; 2080 | } 2081 | } 2082 | 2083 | comment_or_includes = (comment / ( 2084 | ( & { return stops.push("sol_il", true); } 2085 | i:include_limits 2086 | & { stops.pop("sol_il"); return true; } 2087 | ) { return i; } 2088 | / & { return stops.pop("sol_il"); } 2089 | ))* 2090 | 2091 | sol = (empty_line_with_comments / sol_prefix) comment_or_includes 2092 | 2093 | sol_prefix 2094 | = newlineToken 2095 | / & { 2096 | // Use the sol flag only at the start of the input 2097 | // NOTE: Explicitly check for 'false' and not a falsy value 2098 | return endOffset() === 0 && options.sol !== false; 2099 | } { return []; } 2100 | 2101 | empty_line_with_comments 2102 | = sp:sol_prefix p:("" { return endOffset(); }) c:(space* comment (space / comment)* newline)+ { 2103 | return [ 2104 | sp, 2105 | new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], { 2106 | tokens: tu.flattenIfArray(c), 2107 | tsr: [p, endOffset()], 2108 | }), 2109 | ]; 2110 | } 2111 | 2112 | comment_space = comment / space 2113 | 2114 | nl_comment_space = newlineToken / comment_space 2115 | 2116 | /** 2117 | * noinclude / includeonly / onlyinclude rules. These are normally 2118 | * handled by the xmlish_tag rule, except where generic tags are not 2119 | * allowed- for example in directives, which are allowed in various attribute 2120 | * names and -values. 
2121 | * 2122 | * Example test case: 2123 | * {| 2124 | * |- 2125 | * foo 2126 | * 2127 | * |Hello 2128 | * |} 2129 | */ 2130 | 2131 | include_limits = 2132 | il:("<" c:"/"? name:$(n:$[oyinclude]i+ & { 2133 | var incl = n.toLowerCase(); 2134 | return incl === "noinclude" || incl === "onlyinclude" || 2135 | incl === "includeonly"; 2136 | }) space_or_newline* ">" { 2137 | var incl = name.toLowerCase(); 2138 | var dp = { tsr: tsrOffsets() }; 2139 | 2140 | // Record variant since tag is not in normalized lower case 2141 | if (name !== incl) { 2142 | dp.srcTagName = name; 2143 | } 2144 | 2145 | // End tag only 2146 | if (c) { 2147 | return new EndTagTk(name, [], dp); 2148 | } 2149 | 2150 | var restOfInput = input.substring(endOffset()); 2151 | var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:)", "m")); 2152 | 2153 | // Start tag only 2154 | if (!tagContent || !tagContent[1]) { 2155 | return new TagTk(name, [], dp); 2156 | } 2157 | 2158 | // Get the content 2159 | var inclContent = tagContent[1]; 2160 | 2161 | // Preserve SOL where necessary (for onlyinclude and noinclude) 2162 | // Note that this only works because we encounter <*include*> tags in 2163 | // the toplevel content and we rely on the php preprocessor to expand 2164 | // templates, so we shouldn't ever be tokenizing inInclude. 
2165 | // Last line should be empty (except for comments) 2166 | if (incl !== "includeonly" && stops.onStack("sol_il")) { 2167 | var last = lastItem(inclContent.split('\n')); 2168 | if (!/^()*$/.test(last)) { 2169 | return false; 2170 | } 2171 | } 2172 | 2173 | // Tokenize include content in a new tokenizer 2174 | var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent); 2175 | inclContentToks = Util.stripEOFTkfromTokens(inclContentToks); 2176 | 2177 | // Shift tsr 2178 | Util.shiftTokenTSR(inclContentToks, endOffset()); 2179 | 2180 | // Skip past content 2181 | peg$currPos += inclContent.length; 2182 | 2183 | return [new TagTk(name, [], dp)].concat(inclContentToks); 2184 | }) & { return !!il; } { return il; } 2185 | 2186 | // Start of file 2187 | sof = & { return endOffset() === 0 && !options.pipelineOffset; } 2188 | 2189 | // End of file 2190 | eof = & { return endOffset() === input.length; } 2191 | 2192 | newline = '\n' / '\r\n' 2193 | 2194 | newlineToken = newline { return [new NlTk(tsrOffsets())]; } 2195 | 2196 | eolf = newline / eof 2197 | 2198 | comment_space_eolf = (space+ / comment)* eolf 2199 | 2200 | // 'Preprocessor' directive- higher-level things that can occur in otherwise 2201 | // plain-text content. 2202 | directive 2203 | = comment 2204 | / extension_tag 2205 | / tplarg_or_template 2206 | / & "-{" v:lang_variant_or_tpl { return v; } 2207 | / & "&" e:htmlentity { return e; } 2208 | / include_limits 2209 | 2210 | wikilink_preprocessor_text 2211 | = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+ 2212 | // XXX gwicke: any more chars we need to allow here? 2213 | / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) 2214 | { return wr; } 2215 | )+ { 2216 | return tu.flattenStringlist(r); 2217 | } 2218 | 2219 | extlink_preprocessor_text 2220 | // added special separator character class inline: separates url from 2221 | // description / text 2222 | = & { 2223 | // Prevent breaking on pipes when we're in a link description. 
2224 | // See the test, 'Images with the "|" character in the comment'. 2225 | return stops.push('linkdesc', false); 2226 | } 2227 | r:( $[^'<~[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ 2228 | / !inline_breaks s:( directive / no_punctuation_char / [&|{\-] ) { return s; } 2229 | /// urlencoded_char 2230 | // !inline_breaks no_punctuation_char 2231 | / $([.:,] !(space / eolf)) 2232 | / $(['] ![']) // single quotes are ok, double quotes are bad 2233 | )+ { 2234 | stops.pop('linkdesc'); 2235 | return tu.flattenString(r); 2236 | } 2237 | / & { return stops.pop('linkdesc'); } 2238 | 2239 | // Attribute values with preprocessor support 2240 | 2241 | // n.b. / is a permissible char in the three rules below. 2242 | // We only break on />, enforced by the negated expression. 2243 | // Hence, it isn't included in the stop set. 2244 | 2245 | // The stop set is space_or_newline and > which matches generic_att_value. 2246 | attribute_preprocessor_text 2247 | = r:( $[^{}&<\-|/ \t\n\r\x0c>]+ 2248 | / !inline_breaks 2249 | !'/>' 2250 | s:( directive / [{}&<\-|/] ) { return s; } 2251 | )+ { 2252 | return tu.flattenString(r); 2253 | } 2254 | 2255 | // The stop set is '> which matches generic_att_value. 2256 | attribute_preprocessor_text_single 2257 | = r:( $[^{}&<\-|/'>]+ 2258 | / !inline_breaks 2259 | !'/>' 2260 | s:( directive / [{}&<\-|/] ) { return s; } 2261 | )* { 2262 | return tu.flattenString(r); 2263 | } 2264 | 2265 | // The stop set is "> which matches generic_att_value. 2266 | attribute_preprocessor_text_double 2267 | = r:( $[^{}&<\-|/">]+ 2268 | / !inline_breaks 2269 | !'/>' 2270 | s:( directive / [{}&<\-|/] ) { return s; } 2271 | )* { 2272 | return tu.flattenString(r); 2273 | } 2274 | 2275 | // Variants with the entire attribute on a single line 2276 | 2277 | // n.b. ! is a permissible char in the three rules below. 2278 | // We only break on !! in th, enforced by the inline break. 2279 | // Hence, it isn't included in the stop set. 
2280 | // [ is also permissible but we give a chance to break 2281 | // for the [[ special case in php's doTableStuff (See T2553). 2282 | 2283 | // The stop set is space_or_newline and | which matches table_att_value. 2284 | table_attribute_preprocessor_text 2285 | = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+ 2286 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2287 | )+ { 2288 | return tu.flattenString(r); 2289 | } 2290 | 2291 | // The stop set is '\r\n| which matches table_att_value. 2292 | table_attribute_preprocessor_text_single 2293 | = r:( $[^{}&<\-!\['\r\n|]+ 2294 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2295 | )* { 2296 | return tu.flattenString(r); 2297 | } 2298 | 2299 | // The stop set is "\r\n| which matches table_att_value. 2300 | table_attribute_preprocessor_text_double 2301 | = r:( $[^{}&<\-!\["\r\n|]+ 2302 | / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } 2303 | )* { 2304 | return tu.flattenString(r); 2305 | } 2306 | 2307 | // Special-case support for those pipe templates 2308 | pipe = "|" / "{{!}}" 2309 | 2310 | // SSS FIXME: what about |{{!}} and {{!}}| 2311 | pipe_pipe = "||" / "{{!}}{{!}}" 2312 | -------------------------------------------------------------------------------- /wikitext/rules_test.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "path" 5 | "testing" 6 | ) 7 | 8 | func TestRules(t *testing.T) { 9 | cases := []struct { 10 | rule string 11 | input string 12 | match string 13 | }{ 14 | { 15 | "wikilink_preprocessor_text", 16 | "asdf", 17 | "asdf", 18 | }, 19 | { 20 | "wikilink_preprocessor_text", 21 | "asdf|asdf", 22 | "asdf", 23 | }, 24 | { 25 | "wikilink_preproc", 26 | "[[asdf]]", 27 | `asdf`, 28 | }, 29 | { 30 | "wikilink_preproc", 31 | "[[a|b]]", 32 | `b`, 33 | }, 34 | { 35 | "template", 36 | "{{reflink}}", 37 | "", 38 | }, 39 | { 40 | "block_lines", 41 | "* foo", 42 | "
  • foo
  • ", 43 | }, 44 | { 45 | "heading", 46 | "== Foos ==", 47 | "

    Foos

    ", 48 | }, 49 | { 50 | "inlineline", 51 | "Foo's", 52 | "Foo's", 53 | }, 54 | { 55 | "heading", 56 | "== Foo's ==", 57 | "

    Foo's

    ", 58 | }, 59 | { 60 | "extlink", 61 | "[http://example.com/ Yes Foo Bar]", 62 | `Yes Foo Bar`, 63 | }, 64 | { 65 | "xmlish_tag", 66 | "
    foo
    ", 67 | `
    `, 68 | }, 69 | { 70 | "xmlish_tag", 71 | "", 72 | `
    `, 73 | }, 74 | { 75 | "xmlish_tag", 76 | "
    ", 77 | "
    ", 78 | }, 79 | { 80 | "xmlish_tag", 81 | `
    `, 82 | `
    `, 83 | }, 84 | } 85 | 86 | for _, c := range cases { 87 | c := c 88 | t.Run(path.Join(c.rule, c.input), func(t *testing.T) { 89 | val, err := Parse( 90 | "file", 91 | []byte(c.input), 92 | GlobalStore("text", []byte(c.input)), 93 | GlobalStore("len", len(c.input)), 94 | Entrypoint(c.rule), 95 | Recover(false), 96 | ) 97 | if err != nil { 98 | t.Error(err) 99 | } 100 | text := concat(val) 101 | if c.match != text { 102 | t.Errorf("got %q; expected %q", text, c.match) 103 | } 104 | }) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /wikitext/tokens.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "golang.org/x/net/html" 5 | ) 6 | 7 | func hasAttr(n *html.Node, key string) bool { 8 | for _, attr := range n.Attr { 9 | if attr.Key == key { 10 | return true 11 | } 12 | } 13 | return false 14 | } 15 | 16 | func removeAttr(n *html.Node, key string) { 17 | var attrs []html.Attribute 18 | for _, attr := range n.Attr { 19 | if attr.Key == key { 20 | continue 21 | } 22 | attrs = append(attrs, attr) 23 | } 24 | n.Attr = attrs 25 | } 26 | 27 | func processTokens(n *html.Node) []*html.Node { 28 | for child := n.FirstChild; child != nil; child = child.NextSibling { 29 | if hasAttr(child, "_parsestart") { 30 | removeAttr(child, "_parsestart") 31 | remaining := removeSiblingsAfter(child) 32 | //log.Printf("children: %q, %s", child.Data, spew.Sdump(remaining)) 33 | addChildren(child, remaining) 34 | } else if hasAttr(child, "_parseend") { 35 | remaining := removeSiblingsAfter(child) 36 | child.Parent.RemoveChild(child) 37 | return remaining 38 | } 39 | addChildren(child.Parent, processTokens(child)) 40 | } 41 | return nil 42 | } 43 | 44 | func removeSiblingsAfter(n *html.Node) []*html.Node { 45 | var children []*html.Node 46 | for child := n.NextSibling; child != nil; child = child.NextSibling { 47 | children = append(children, child) 48 | } 49 | 
parent := n.Parent 50 | for _, child := range children { 51 | parent.RemoveChild(child) 52 | } 53 | return children 54 | } 55 | 56 | func addChildren(n *html.Node, children []*html.Node) { 57 | for _, child := range children { 58 | n.AppendChild(child) 59 | } 60 | } 61 | 62 | func numChildren(n *html.Node) int { 63 | count := 0 64 | for child := n.FirstChild; child != nil; child = child.NextSibling { 65 | count++ 66 | } 67 | return count 68 | } 69 | -------------------------------------------------------------------------------- /wikitext/tokens_test.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "testing" 8 | 9 | "golang.org/x/net/html" 10 | ) 11 | 12 | func TestProcessTokens(t *testing.T) { 13 | cases := []struct { 14 | in, want string 15 | }{ 16 | { 17 | "", "", 18 | }, 19 | { 20 | "
    ", 21 | "
    ", 22 | }, 23 | { 24 | "

    Foo

    ", 25 | `

    Foo

    `, 26 | }, 27 | { 28 | `
    Foo
    asdf

    Blah

    Bar
    `, 29 | `
    Foo
    asdf

    Blah

    Bar
    `, 30 | }, 31 | } 32 | 33 | for _, c := range cases { 34 | t.Run(c.in, func(t *testing.T) { 35 | doc, err := html.Parse(strings.NewReader(c.in)) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | 40 | //t.Log(concat(doc)) 41 | 42 | if remaining := processTokens(doc); len(remaining) > 0 { 43 | t.Errorf("got %d extra children", len(remaining)) 44 | } 45 | var buf bytes.Buffer 46 | if err := html.Render(&buf, doc); err != nil { 47 | t.Fatal(err) 48 | } 49 | want := fmt.Sprintf("%s", c.want) 50 | out := buf.String() 51 | if out != want { 52 | t.Errorf("expected %q;\ngot %q", want, out) 53 | } 54 | }) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /wikitext/url.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import "strings" 4 | 5 | func URLToTitle(u string) string { 6 | return strings.Replace(u, "_", " ", -1) 7 | } 8 | 9 | func TitleToURL(u string) string { 10 | return "./" + strings.Replace(u, " ", "_", -1) 11 | } 12 | -------------------------------------------------------------------------------- /wikitext/wikitext.go: -------------------------------------------------------------------------------- 1 | package wikitext 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "log" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | 11 | "github.com/microcosm-cc/bluemonday" 12 | "github.com/pkg/errors" 13 | "golang.org/x/net/html" 14 | ) 15 | 16 | //go:generate pigeon -o wikitext.peg.go wikitext.peg 17 | 18 | // Convert converts wikitext to HTML. 
19 | func Convert(text []byte, options ...ConvertOption) ([]byte, error) { 20 | var opts opts 21 | for _, opt := range options { 22 | opt(&opts) 23 | } 24 | v, err := Parse( 25 | "file.wikitext", 26 | append(text, '\n'), 27 | GlobalStore("len", len(text)), 28 | GlobalStore("text", text), 29 | GlobalStore("opts", opts), 30 | //Memoize(true), 31 | Recover(false), 32 | //Debug(true), 33 | ) 34 | if err != nil { 35 | return nil, err 36 | } 37 | 38 | //spew.Dump(v) 39 | 40 | var doc *html.Node 41 | 42 | for doc == nil && v != nil { 43 | switch val := v.(type) { 44 | case *html.Node: 45 | doc = val 46 | case debugRun: 47 | v = val.Value 48 | } 49 | } 50 | 51 | if doc == nil { 52 | return nil, errors.Errorf("expected *html.Node got: %T", v) 53 | } 54 | 55 | //log.Printf("Token doc: %q", concat(doc)) 56 | 57 | remaining := processTokens(doc) 58 | if opts.strict && len(remaining) > 0 { 59 | return nil, errors.Errorf("got %d extra children: doc %q, children %q", len(remaining), concat(doc), concat(remaining)) 60 | } 61 | addChildren(doc, remaining) 62 | 63 | var buf bytes.Buffer 64 | if err := html.Render(&buf, doc); err != nil { 65 | return nil, err 66 | } 67 | 68 | body := buf.Bytes() 69 | body = wikitextPolicy().SanitizeBytes(body) 70 | body = bytes.TrimSpace(body) 71 | return body, nil 72 | } 73 | 74 | func wikitextPolicy() *bluemonday.Policy { 75 | policy := bluemonday.UGCPolicy() 76 | 77 | policy.AllowNoAttrs().OnElements("ref") 78 | 79 | policy.RequireNoFollowOnLinks(false) 80 | policy.RequireNoFollowOnFullyQualifiedLinks(true) 81 | policy.AllowStyling() 82 | policy.AllowAttrs("id", "name", "style").Globally() 83 | policy.AllowAttrs("_parsestart", "_parseend", "_parsetoken").Globally() 84 | 85 | return policy 86 | } 87 | 88 | type Attribute struct { 89 | Key, Val interface{} 90 | } 91 | 92 | func (a Attribute) String() string { 93 | if a.Val == nil { 94 | return concat(a.Key) 95 | } 96 | return fmt.Sprintf("%s=%s", concat(a.Key), concat(a.Val)) 97 | } 98 | 99 | type 
opts struct { 100 | templateHandler func(name string, attrs []Attribute) (interface{}, error) 101 | strict bool 102 | } 103 | 104 | type ConvertOption func(opts *opts) 105 | 106 | // TemplateHandler sets the function that runs when a template is found. The 107 | // return value is included in the final document. Either *html.Node or string 108 | // values may be returned. String values will be inserted as escaped text. 109 | func TemplateHandler(f func(name string, attrs []Attribute) (interface{}, error)) ConvertOption { 110 | return func(opts *opts) { 111 | opts.templateHandler = f 112 | } 113 | } 114 | 115 | func strict() ConvertOption { 116 | return func(opts *opts) { 117 | opts.strict = true 118 | } 119 | } 120 | 121 | func flatten(fields ...interface{}) []interface{} { 122 | var out []interface{} 123 | for _, f := range fields { 124 | if f == nil { 125 | continue 126 | } 127 | 128 | switch f := f.(type) { 129 | case []interface{}: 130 | out = append(out, flatten(f...)...) 131 | case []*html.Node: 132 | for _, n := range f { 133 | out = append(out, n) 134 | } 135 | 136 | default: 137 | out = append(out, f) 138 | } 139 | } 140 | return out 141 | } 142 | 143 | func Concat(fields ...interface{}) string { 144 | return concat(fields...) 145 | } 146 | 147 | func concat(fields ...interface{}) string { 148 | var b strings.Builder 149 | for _, f := range flatten(fields...) 
{ 150 | if f == nil { 151 | continue 152 | } 153 | 154 | switch f := f.(type) { 155 | case int: 156 | b.WriteString(strconv.Itoa(f)) 157 | 158 | case string: 159 | b.WriteString(f) 160 | 161 | case []byte: 162 | b.Write(f) 163 | 164 | case *html.Node: 165 | var buf bytes.Buffer 166 | if err := html.Render(&buf, f); err != nil { 167 | panic(err) 168 | } 169 | b.Write(buf.Bytes()) 170 | 171 | case debugRun: 172 | b.WriteString(concat(f.Value)) 173 | 174 | case Attribute: 175 | b.WriteString(f.String()) 176 | 177 | default: 178 | panic(errors.Errorf("concat: unsupported f type %T: %+v", f, f)) 179 | } 180 | } 181 | return b.String() 182 | } 183 | 184 | func addChild(n *html.Node, children interface{}) bool { 185 | if children == nil { 186 | return false 187 | } 188 | 189 | switch children := children.(type) { 190 | case []interface{}: 191 | added := false 192 | for _, c := range children { 193 | if addChild(n, c) { 194 | added = true 195 | } 196 | } 197 | return added 198 | 199 | case *html.Node: 200 | n.AppendChild(children) 201 | return true 202 | 203 | case []byte: 204 | return addChild(n, string(children)) 205 | 206 | case string: 207 | return addChild(n, &html.Node{ 208 | Type: html.TextNode, 209 | Data: children, 210 | }) 211 | 212 | default: 213 | log.Fatalf("unsupported children type %T: %#v", children, children) 214 | return false 215 | } 216 | } 217 | 218 | func inc(c *current, tag string) { 219 | v, _ := c.state[tag].(int) 220 | v++ 221 | c.state[tag] = v 222 | } 223 | 224 | func dec(c *current, tag string) { 225 | v, ok := c.state[tag].(int) 226 | if ok { 227 | v-- 228 | if v == 0 { 229 | delete(c.state, tag) 230 | } else { 231 | c.state[tag] = v 232 | } 233 | } 234 | } 235 | 236 | func count(c *current, tag string) int { 237 | v, _ := c.state[tag].(int) 238 | return v 239 | } 240 | 241 | type stack []interface{} 242 | 243 | func (s stack) Clone() interface{} { 244 | out := make(stack, len(s)) 245 | for k, v := range s { 246 | if c, ok := v.(Cloner); ok { 
247 | out[k] = c.Clone() 248 | } else { 249 | out[k] = v 250 | } 251 | } 252 | return out 253 | } 254 | 255 | var _ Cloner = stack{} 256 | 257 | func push(c *current, tag string, val interface{}) int { 258 | v, _ := c.state[tag].(stack) 259 | v = append(v, val) 260 | c.state[tag] = v 261 | return len(v) - 1 262 | } 263 | 264 | func pop(c *current, tag string) interface{} { 265 | v, _ := c.state[tag].(stack) 266 | if len(v) == 0 { 267 | return nil 268 | } 269 | val := v[len(v)-1] 270 | if len(v) == 1 { 271 | delete(c.state, tag) 272 | } else { 273 | c.state[tag] = v[:len(v)-1] 274 | } 275 | return val 276 | } 277 | 278 | func popTo(c *current, tag string, n int) { 279 | v, _ := c.state[tag].(stack) 280 | if len(v) > n { 281 | if n == 0 { 282 | delete(c.state, tag) 283 | } else { 284 | c.state[tag] = v[:n] 285 | } 286 | } 287 | } 288 | 289 | func peek(c *current, tag string) interface{} { 290 | v, _ := c.state[tag].(stack) 291 | if len(v) == 0 { 292 | return nil 293 | } 294 | return v[len(v)-1] 295 | } 296 | 297 | var inlineBreaksRegexp = regexp.MustCompile(`[=|!{}:;\r\n[\]<\-]`) 298 | 299 | func match(pattern string, input []byte) bool { 300 | match, err := regexp.Match(pattern, input) 301 | if err != nil { 302 | panic(err) 303 | } 304 | return match 305 | } 306 | 307 | func inlineBreaks(c *current) (bool, error) { 308 | pos := c.pos.offset + len(c.text) 309 | //log.Printf("inlineBreaks %s, %q, pos %d", c.pos, c.text, pos) 310 | input := c.globalStore["text"].([]byte) 311 | if len(input) <= pos { 312 | log.Printf("inlinebreak false") 313 | return false, nil 314 | } 315 | ch := input[pos] 316 | if !inlineBreaksRegexp.Match([]byte{ch}) { 317 | //log.Printf("inlinebreak match fail: %s", []byte{ch}) 318 | return false, nil 319 | } 320 | 321 | switch ch { 322 | case '=': 323 | if arrow, _ := peek(c, "arrow").(bool); arrow && input[pos+1] == '>' { 324 | return true, nil 325 | } 326 | equal, _ := peek(c, "equal").(bool) 327 | return equal || (count(c, "h") > 0 && (pos == 
len(input)-1 || 328 | // possibly more equals followed by spaces or comments 329 | //TODO: use match(`^=*(?:[ \t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)`, input[pos+1:]))), nil 330 | match(`^=*(?:[ \t]|<\!--.*-->)*(?:[\r\n]|$)`, input[pos+1:]))), nil 331 | 332 | case '|': 333 | templateArg, _ := peek(c, "templateArg").(bool) 334 | extTag, _ := peek(c, "extTag").(bool) 335 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 336 | linkdesc, _ := peek(c, "linkdesc").(bool) 337 | table, _ := peek(c, "table").(bool) 338 | 339 | return (templateArg && 340 | !(extTag)) || 341 | tableCellArg || 342 | linkdesc || 343 | (table && (pos < len(input)-1 && 344 | match(`[}|]`, []byte{input[pos+1]}))), nil 345 | 346 | case '!': 347 | th, _ := peek(c, "th").(bool) 348 | return th && 349 | count(c, "templatedepth") == 0 && 350 | input[pos+1] == '!', nil 351 | 352 | case '{': 353 | // {{!}} pipe templates.. 354 | // FIXME: Presumably these should mix with and match | above. 355 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 356 | table, _ := peek(c, "table").(bool) 357 | return ((tableCellArg && string(input[pos:pos+5]) == "{{!}}") || 358 | (table && string(input[pos:pos+10]) == "{{!}}{{!}}")), nil 359 | 360 | case '}': 361 | preproc, _ := peek(c, "preproc").(string) 362 | //log.Printf("inlineBreaks: } %q %q", preproc, input[pos:pos+2]) 363 | return string(input[pos:pos+2]) == preproc, nil 364 | 365 | case ':': 366 | return count(c, "colon") > 0 && 367 | !peek(c, "extlink").(bool) && 368 | count(c, "templatedepth") == 0 && 369 | !peek(c, "linkdesc").(bool) && 370 | !(peek(c, "preproc").(string) == "}-"), nil 371 | 372 | case ';': 373 | semicolon, _ := peek(c, "semicolon").(bool) 374 | return semicolon, nil 375 | 376 | case '\r': 377 | table, _ := peek(c, "table").(bool) 378 | return table && match(`\r\n?\s*[!|]`, input[pos:]), nil 379 | 380 | case '\n': 381 | // The code below is just a manual / efficient 382 | // version of this check. 
383 | // 384 | // peek(c,'table') && /^\n\s*[!|]/.test(input.substr(pos)); 385 | // 386 | // It eliminates a substr on the string and eliminates 387 | // a potential perf problem since "\n" and the inline_breaks 388 | // test is common during tokenization. 389 | if table, _ := peek(c, "table").(bool); !table { 390 | return false, nil 391 | } 392 | 393 | // Allow leading whitespace in tables 394 | 395 | // Since we switched on 'c' which is input[pos], 396 | // we know that input[pos] is "\n". 397 | // So, the /^\n/ part of the regexp is already satisfied. 398 | // Look for /\s*[!|]/ below. 399 | n := len(input) 400 | for i := pos + 1; i < n; i++ { 401 | d := input[i] 402 | if match(`[!|]`, []byte{d}) { 403 | return true, nil 404 | } else if !match(`\s`, []byte{d}) { 405 | return false, nil 406 | } 407 | } 408 | return false, nil 409 | 410 | case '[': 411 | // This is a special case in php's doTableStuff, added in 412 | // response to T2553. If it encounters a `[[`, it bails on 413 | // parsing attributes and interprets it all as content. 
414 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 415 | return tableCellArg && string(input[pos:pos+2]) == "[[", nil 416 | 417 | case '-': 418 | // Same as above: a special case in doTableStuff, added 419 | // as part of T153140 420 | tableCellArg, _ := peek(c, "tableCellArg").(bool) 421 | return tableCellArg && string(input[pos:pos+2]) == "-{", nil 422 | 423 | case ']': 424 | extlink, _ := peek(c, "extlink").(bool) 425 | if extlink { 426 | return true, nil 427 | } 428 | preproc, _ := peek(c, "preproc").(string) 429 | //log.Printf("inlineBreaks extlink:%#v, preproc:%#v", extlink, preproc) 430 | return string(input[pos:pos+2]) == preproc, nil 431 | 432 | case '<': 433 | return (count(c, "noinclude") > 0 && string(input[pos:pos+12]) == "") || 434 | (count(c, "includeonly") > 0 && string(input[pos:pos+14]) == "") || 435 | (count(c, "onlyinclude") > 0 && string(input[pos:pos+14]) == ""), nil 436 | default: 437 | return false, errors.Errorf("Unhandled case!") 438 | } 439 | } 440 | -------------------------------------------------------------------------------- /wikitext/wikitext.peg: -------------------------------------------------------------------------------- 1 | { 2 | package wikitext 3 | 4 | } 5 | 6 | /********************************************************* 7 | * The top-level rule 8 | *********************************************************/ 9 | 10 | start <- tlb:tlb* newlineToken* { 11 | n := &html.Node{ 12 | Type: html.DocumentNode, 13 | } 14 | addChild(n, tlb) 15 | if len(c.state) > 0 { 16 | panic(errors.Errorf("poluted state! %#v", c.state)) 17 | } 18 | return n, nil 19 | } 20 | 21 | /* 22 | * Redirects can only occur as the first thing in a document. See 23 | * WikitextContent::getRedirectTarget() 24 | */ 25 | redirect <- redirect_word 26 | space_or_newline* 27 | (":" space_or_newline*)? 
28 | wl:wikilink & { 29 | /* 30 | return wl.length === 1 && wl[0] && wl[0].constructor !== String; 31 | */ 32 | return false, nil 33 | } { 34 | /* 35 | var link = wl[0]; 36 | if (sp) { rw += sp; } 37 | if (c) { rw += c; } 38 | // Build a redirect token 39 | var redirect = new SelfclosingTagTk('mw:redirect', 40 | // Put 'href' into attributes so it gets template-expanded 41 | [Util.lookupKV(link.attribs, 'href')], 42 | { 43 | src: rw, 44 | tsr: tsrOffsets(), 45 | linkTk: link, 46 | }); 47 | return redirect; 48 | */ 49 | return "todo redirect", nil 50 | } 51 | 52 | // These rules are exposed as start rules. 53 | generic_newline_attributes <- generic_newline_attribute* 54 | 55 | table_attributes 56 | <- (table_attribute / optionalSpaceToken b:broken_table_attribute_name_char { 57 | return b, nil })* 58 | 59 | //The 'redirect' magic word. 60 | // The leading whitespace allowed is due to the PHP trim() function. 61 | 62 | redirect_word 63 | <- ([ \t\n\r]* 64 | (!space_or_newline ![:[] .)+ 65 | & {return false, nil /*return env.conf.wiki.getMagicWordMatcher('redirect').test(rw);*/ }) 66 | 67 | 68 | //# This rule exists to support tokenizing the document in chunks. 69 | //# The parser's streaming interface will stop tokenization after each iteration 70 | //# of the starred subexpression, and yield to the node.js event-loop to 71 | //# schedule other pending event handlers. 72 | //# 73 | start_async 74 | <- (tlb 75 | / newlineToken* &{ 76 | return false, nil 77 | /* 78 | if (endOffset() === input.length) { 79 | emitChunk([ new EOFTk() ]); 80 | } 81 | // terminate the loop 82 | return false; 83 | */ 84 | } 85 | )* 86 | 87 | 88 | // A document (start rule) is a sequence of toplevelblocks. Tokens are 89 | // emitted in chunks per toplevelblock to avoid buffering the full document. 90 | // 91 | tlb <- !eof b:block { 92 | return b, nil 93 | } 94 | 95 | 96 | // The actual contents of each block. 97 | // 98 | block 99 | // has to be first alternative; otherwise gets parsed as a
      100 | <- &sof redirect comment_or_includes block_line? {return "comment_or_includes", nil /*return [r].concat(cil, bl || []);*/ } 101 | / block_lines 102 | / & '<' rs:( cm:comment &eolf {return cm, nil /*return c;*/ } 103 | // avoid a paragraph if we know that the line starts with a block tag 104 | / block_tag 105 | ) {return rs, nil /*return rs;*/ } 106 | / paragraph 107 | // Inlineline includes generic tags; wrapped into paragraphs in token 108 | // transform and DOM postprocessor 109 | / inlineline 110 | / s:sol !inline_breaks {return s, nil /*return s;*/ } 111 | 112 | 113 | // A block nested in other constructs. Avoid eating end delimiters for other 114 | // constructs by checking against inline_breaks first. 115 | // 116 | nested_block <- !inline_breaks b:block {return b, nil /*return b;*/ } 117 | 118 | 119 | // The same, but suitable for use inside a table construct. 120 | // Doesn't match table_heading_tag, table_row_tag, table_data_tag, 121 | // table_caption tag, or table_end_tag, although it does allow 122 | // table_start_tag (for nested tables). 123 | // 124 | nested_block_in_table 125 | <- 126 | // avoid recursion via nested_block_in_table, as that can lead to stack 127 | // overflow in large tables 128 | // See https://phabricator.wikimedia.org/T59670 129 | #{ 130 | push(c, "tableDataBlock", true) 131 | return nil 132 | /* 133 | return stops.push('tableDataBlock', true); 134 | */ 135 | } 136 | // XXX: don't rely on a lame look-ahead like this; use syntax stops 137 | // instead, so that multi-line th content followed by a line prefixed with 138 | // a comment is also handled. Alternatively, implement a sol look-behind 139 | // assertion accepting spaces and comments. 140 | !(sol (space* sol)? space* (pipe / "!")) b:nested_block 141 | #{pop(c, "tableDataBlock"); return nil} 142 | { 143 | return b, nil 144 | /* 145 | stops.pop('tableDataBlock'); 146 | return b; 147 | */ 148 | } 149 | 150 | 151 | // Line-based block constructs. 
152 | // 153 | block_lines 154 | <- s:sol 155 | // eat an empty line before the block 156 | (s2:(os:optionalSpaceToken so:sol))? 157 | bl:block_line 158 | 159 | // Horizontal rules 160 | hr <- "----" "-"* 161 | // Check if a newline or content follows 162 | ( &sol "" {return nil, nil /*return undefined;*/ } / "" {return true, nil /*return true;*/ } ) { 163 | return &html.Node{ 164 | Type: html.ElementNode, 165 | Data: "hr", 166 | }, nil 167 | /* 168 | var dataAttribs = { 169 | tsr: tsrOffsets(), 170 | lineContent: lineContent, 171 | }; 172 | if (d.length > 0) { 173 | dataAttribs.extra_dashes = d.length; 174 | } 175 | return new SelfclosingTagTk('hr', [], dataAttribs); 176 | */ 177 | } 178 | 179 | 180 | // Block structures with start-of-line wiki syntax 181 | // 182 | block_line 183 | <- heading 184 | / list_item 185 | / hr 186 | / st: space_or_newline* 187 | r:( & [ <{}|!] tl:table_line {return tl, nil /*return tl;*/ } 188 | // tag-only lines should not trigger pre either 189 | / bts:(bt:block_tag stl:optionalSpaceToken {return concat(bt, stl), nil /*return bt.concat(stl);*/ })+ 190 | &eolf {return bts, nil /*return bts;*/ } 191 | ) {return concat(st, r), nil 192 | /* 193 | return st.concat(r); 194 | */ 195 | } 196 | 197 | 198 | // A paragraph. We don't emit 'p' tokens to avoid issues with template 199 | // transclusions,

      tags in the source and the like. Instead, we perform 200 | // some paragraph wrapping on the token stream and the DOM. 201 | // 202 | paragraph 203 | <- s1:sol s2:sol c1:inlineline { 204 | n := &html.Node{ 205 | Type: html.ElementNode, 206 | Data: "p", 207 | } 208 | addChild(n, c1) 209 | return n, nil 210 | } 211 | 212 | br <- optionalSpaceToken &newline { 213 | return &html.Node{ 214 | Type: html.ElementNode, 215 | Data: "br", 216 | }, nil 217 | /* 218 | return s.concat([ 219 | new SelfclosingTagTk('br', [], { tsr: tsrOffsets() }), 220 | ]); 221 | */ 222 | } 223 | 224 | inline_breaks <- & { return inlineBreaks(c) } 225 | 226 | inlineline 227 | <- ((r:urltext) 228 | / inlineline_element)+ 229 | 230 | inlineline_element 231 | <- !inline_breaks 232 | r:(inline_element / [^\r\n]) 233 | {return r, nil} 234 | 235 | inline_element 236 | <- & '<' r:( xmlish_tag 237 | / comment 238 | ) {return r, nil /*return r;*/ } 239 | / & '{' r:tplarg_or_template {return r, nil/* return r; */} 240 | / & "-{" r:lang_variant_or_tpl {return r, nil/* return r; */} 241 | // FIXME: The php parser's replaceInternalLinks2 splits on [[, resulting 242 | // in sequences with odd number of brackets parsing as text, and sequences 243 | // with even number of brackets having its innermost pair parse as a 244 | // wikilink. For now, we faithfully reproduce what's found there but 245 | // wikitext, the language, shouldn't be defined by odd tokenizing behaviour 246 | // in the php parser. Flagging this for a future cleanup. 247 | / ("[[" &'[')+ 248 | / & '[' r:( wikilink / extlink ) {return r, nil/* return r; */} 249 | / & "'" r:quote {return r, nil/* return r; */} 250 | 251 | // Headings */ 252 | 253 | heading <- & "=" // guard, to make sure '='+ will match. 254 | // XXX: Also check to end to avoid inline parsing? 
255 | r:( 256 | #{ inc(c, "h"); return nil /*return stops.inc('h');*/ } 257 | s:'='+ // moved in here to make s accessible to inner action 258 | ce:( 259 | (ill:(inlineline?)) 260 | '='+ {return ill, nil} 261 | )? 262 | & { 263 | return ce!=nil || len(concat(s)) > 2, nil 264 | /*return ce || s.length > 2;*/ 265 | } 266 | //("" {return nil, nil /*return endOffset();*/ }) 267 | spc:(spaces / comment)* 268 | &eolf 269 | #{dec(c, "h"); return nil} 270 | { 271 | n := &html.Node{ 272 | Type: html.ElementNode, 273 | Data: "h"+strconv.Itoa(len(concat(s))), 274 | } 275 | addChild(n, []interface{}{ce, spc}) 276 | return n, nil 277 | /* 278 | var c; 279 | var e; 280 | var level; 281 | stops.dec('h'); 282 | if (ce) { 283 | c = ce[0]; 284 | e = ce[1]; 285 | level = Math.min(s.length, e.length); 286 | } else { 287 | // split up equal signs into two equal parts, with at least 288 | // one character in the middle. 289 | level = Math.floor((s.length - 1) / 2); 290 | c = ['='.repeat(s.length - 2 * level)]; 291 | s = e = '='.repeat(level); 292 | } 293 | level = Math.min(6, level); 294 | // convert surplus equals into text 295 | if (s.length > level) { 296 | var extras1 = s.substr(0, s.length - level); 297 | if (c[0].constructor === String) { 298 | c[0] = extras1 + c[0]; 299 | } else { 300 | c.unshift(extras1); 301 | } 302 | } 303 | if (e.length > level) { 304 | var extras2 = e.substr(0, e.length - level); 305 | var lastElem = lastItem(c); 306 | if (lastElem.constructor === String) { 307 | c[c.length - 1] += extras2; 308 | } else { 309 | c.push(extras2); 310 | } 311 | } 312 | 313 | var tsr = tsrOffsets('start'); 314 | tsr[1] += level; 315 | return [ 316 | new TagTk('h' + level, [], { tsr: tsr }), 317 | ].concat(c, [ 318 | new EndTagTk('h' + level, [], { tsr: [endTPos - level, endTPos] }), 319 | spc, 320 | ]); 321 | */ 322 | } 323 | ) { 324 | return r, nil /*return r;*/ 325 | } 326 | 327 | 328 | // Comments */ 329 | 330 | // The php parser does a straight str.replace(/).)*-->/g, "") 
331 | // but, as always, things around here are a little more complicated. 332 | // 333 | // We accept the same comments, but because we emit them as HTML comments 334 | // instead of deleting them, we have to encode the data to ensure that 335 | // we always emit a valid HTML5 comment. See the encodeComment helper 336 | // for further details. 337 | 338 | comment 339 | <- "" .)* ("-->" / eof) { 340 | return &html.Node{ 341 | Type: html.CommentNode, 342 | Data: concat(c1), 343 | }, nil 344 | /* 345 | var data = DU.encodeComment(c); 346 | return [new CommentTk(data, { tsr: tsrOffsets() })]; 347 | */ 348 | } 349 | 350 | 351 | // Behavior switches. See: 352 | // https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches 353 | behavior_switch 354 | <- ("__" behavior_text "__") {return "behavior_text", nil 355 | /* 356 | if (env.conf.wiki.isMagicWord(bs)) { 357 | return [ 358 | new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ], 359 | { tsr: tsrOffsets(), src: bs, magicSrc: bs } 360 | ), 361 | ]; 362 | } else { 363 | return [ bs ]; 364 | } 365 | */ 366 | } 367 | 368 | // Instead of defining a charset, php's doDoubleUnderscore concats a regexp of 369 | // all the language specific aliases of the behavior switches and then does a 370 | // match and replace. Just be as permissive as possible and let the 371 | // BehaviorSwitchPreprocessor back out of any overreach. 372 | behavior_text <- ( !"__" [^'"<~[{\n\r:;\]}|!=] )+ 373 | 374 | 375 | // ************************************************************ 376 | // External (bracketed and autolinked) links 377 | // ************************************************************/ 378 | 379 | autolink 380 | <- ! { 381 | extlink, _ := peek(c, "extlink").(bool) 382 | return extlink, nil 383 | /*return stops.onStack('extlink');*/ 384 | } 385 | // this must be a word boundary, so previous character must be non-word 386 | ! 
{return true, nil /*return /\w/.test(input[endOffset() - 1] || '');*/ } 387 | r:( 388 | // urllink, inlined 389 | target:autourl { 390 | return target, nil 391 | /* 392 | var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })]; 393 | return res; 394 | */ 395 | } 396 | / autoref 397 | / isbn) {return r, nil /*return r;*/ } 398 | 399 | extlink 400 | <- ! { 401 | extlink, _ := peek(c, "extlink").(bool) 402 | return extlink, nil 403 | /* return stops.onStack('extlink'); */ 404 | } // extlink cannot be nested 405 | "[" 406 | # {push(c, "extlink", true); return nil /*return stops.push('extlink', true);*/ } 407 | addr:(url_protocol urladdr / "") 408 | target:(extlink_preprocessor_text / "") 409 | & { 410 | // TODO: smarter check 411 | return true, nil 412 | /* 413 | // Protocol must be valid and there ought to be at least one 414 | // post-protocol character. So strip last char off target 415 | // before testing protocol. 416 | var flat = tu.flattenString([addr, target]); 417 | if (Array.isArray(flat)) { 418 | // There are templates present, alas. 419 | return flat.length > 0; 420 | } 421 | return Util.isProtocolValid(flat.slice(0, -1), env); 422 | */ 423 | } 424 | ( space / unispace )* 425 | //( "" {return nil, nil /*return endOffset();*/ }) 426 | content:inlineline? 
427 | "]" 428 | #{ pop(c, "extlink"); return nil } 429 | { 430 | n := &html.Node{ 431 | Type: html.ElementNode, 432 | Data: "a", 433 | Attr: []html.Attribute{ 434 | {Key: "href", Val: concat(addr, target)}, 435 | {Key: "class", Val: "external"}, 436 | {Key: "rel", Val: "nofollow"}, 437 | }, 438 | } 439 | addChild(n, content) 440 | return n, nil 441 | /* 442 | stops.pop('extlink'); 443 | return [ 444 | new SelfclosingTagTk('extlink', [ 445 | new KV('href', tu.flattenString([addr, target])), 446 | new KV('mw:content', content || ''), 447 | new KV('spaces', sp), 448 | ], { 449 | targetOff: targetOff, 450 | tsr: tsrOffsets(), 451 | contentOffsets: [targetOff, endOffset() - 1], 452 | }), 453 | ]; 454 | */ 455 | } 456 | 457 | autoref 458 | <- ("RFC" / "PMID") space_or_nbsp+ [0-9]+ end_of_word 459 | { return nil, nil 460 | /* 461 | var base_urls = { 462 | 'RFC': 'https://tools.ietf.org/html/rfc%s', 463 | 'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract', 464 | }; 465 | return [ 466 | new SelfclosingTagTk('extlink', [ 467 | new KV('href', tu.sprintf(base_urls[ref], identifier)), 468 | new KV('mw:content', tu.flattenString([ref, sp, identifier])), 469 | new KV('typeof', 'mw:ExtLink/' + ref), 470 | ], 471 | { stx: "magiclink", tsr: tsrOffsets() }), 472 | ]; 473 | */ 474 | } 475 | 476 | isbn 477 | <- "ISBN" space_or_nbsp+ ( 478 | [0-9] 479 | (space_or_nbsp_or_dash &[0-9] {return nil, nil/* return s; */} / [0-9])+ 480 | ((space_or_nbsp_or_dash / "") [xX] / "") 481 | ) ( 482 | end_of_word 483 | {return nil, nil 484 | /* 485 | // Convert isbn token-and-entity array to stripped string. 
486 | return tu.flattenStringlist(isbn).filter(function(e) { 487 | return e.constructor === String; 488 | }).join('').replace(/[^\dX]/ig, '').toUpperCase(); 489 | */ 490 | } 491 | ) &{ 492 | return false, nil 493 | /* 494 | // ISBNs can only be 10 or 13 digits long (with a specific format) 495 | return isbncode.length === 10 || 496 | (isbncode.length === 13 && /^97[89]/.test(isbncode)); 497 | */ 498 | } {return nil, nil 499 | /* 500 | return [ 501 | new SelfclosingTagTk('extlink', [ 502 | new KV('href', 'Special:BookSources/' + isbncode), 503 | new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])), 504 | new KV('typeof', 'mw:WikiLink/ISBN'), 505 | ], 506 | { stx: "magiclink", tsr: tsrOffsets() }), 507 | ]; 508 | */ 509 | } 510 | 511 | 512 | // Default URL protocols in MediaWiki (see DefaultSettings). Normally 513 | // these can be configured dynamically. */ 514 | 515 | url_protocol <- 516 | & {return false, nil/* return Util.isProtocolValid(input.substr(endOffset()), env); */} 517 | ( "//" / [A-Za-z] [-A-Za-z0-9+.]* ":" "//"? ) {return nil, nil/* return p;*/ } 518 | 519 | // no punctuation, and '{<' to trigger directives 520 | no_punctuation_char <- [^ :\][\r\n"'<>,.&%{] 521 | //TODO: no_punctuation_char <- [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] 522 | 523 | // this is the general url rule 524 | // on the PHP side, the path part matches EXT_LINK_URL_CLASS 525 | // which is '[^][<>"\x00-\x20\x7F\p{Zs}]' 526 | // the 's' and 'r' pieces below match the characters in 527 | // EXT_LINK_URL_CLASS which aren't included in no_punctuation_char 528 | url 529 | <- proto:url_protocol 530 | addr:(urladdr / "") 531 | path:( ( !inline_breaks 532 | c1:no_punctuation_char 533 | {return c1, nil /*return c; */} 534 | ) 535 | / s:[.:,'] {return s, nil/* return s; */} 536 | / comment 537 | / tplarg_or_template 538 | / ! 
( "&" ( [lL][tT] / [gG][tT] ) ";" ) 539 | r:( 540 | & "&" he:htmlentity {return he, nil/* return he; */} 541 | / [&%{] 542 | ) {return r, nil /*return r;*/ } 543 | )* 544 | // Must be at least one character after the protocol 545 | & {return false, nil /*return addr.length > 0 || path.length > 0;*/ } 546 | {return []interface{}{proto, addr, path}, nil 547 | /* 548 | return tu.flattenString([proto, addr].concat(path)); 549 | */ 550 | } 551 | 552 | // this is the somewhat-restricted rule used in autolinks 553 | // See Parser::doMagicLinks and Parser.php::makeFreeExternalLink. 554 | // The `path` portion matches EXT_LINK_URL_CLASS, as in the general 555 | // url rule. As in PHP, we do some fancy fixup to yank out 556 | // trailing punctuation, perhaps including parentheses. 557 | // The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS 558 | // which aren't included in no_punctuation_char 559 | autourl 560 | <- &{return true, nil /*return stops.push('autourl', { sawLParen: false }); */} 561 | ! "//" // protocol-relative autolinks not allowed (T32269) 562 | ( 563 | url_protocol 564 | (urladdr / "") 565 | ( ( !inline_breaks 566 | ! "(" 567 | c1:no_punctuation_char 568 | {return c1, nil/* return c; */} 569 | ) 570 | / "(" {return "(", nil/* stops.onStack('autourl').sawLParen = true; return "("; */} 571 | / [.:,] 572 | / (['] ![']) // single quotes are ok, double quotes are bad 573 | / comment 574 | / tplarg_or_template 575 | / ! ( raw_htmlentity &{return false, nil /* return /^[<>\u00A0]$/.test(rhe); */} ) 576 | r:( 577 | & "&" he:htmlentity {return he, nil/* return he; */} 578 | / [&%{] 579 | ) {return r, nil/* return r; */} 580 | )* 581 | {return "TODO: autourl",nil 582 | /* 583 | // as in Parser.php::makeFreeExternalLink, we're going to 584 | // yank trailing punctuation out of this match. 585 | var url = tu.flattenStringlist([proto, addr].concat(path)); 586 | // only need to look at last element; HTML entities are strip-proof. 
587 | var last = lastItem(url); 588 | var trim = 0; 589 | if (last && last.constructor === String) { 590 | var strip = ',;\\.:!?'; 591 | if (!stops.onStack('autourl').sawLParen) { 592 | strip += ')'; 593 | } 594 | strip = new RegExp('[' + JSUtils.escapeRegExp(strip) + ']*$'); 595 | trim = strip.exec(last)[0].length; 596 | url[url.length - 1] = last.slice(0, last.length - trim); 597 | } 598 | url = tu.flattenStringlist(url); 599 | if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) { 600 | return null; // ensure we haven't stripped everything: T106945 601 | } 602 | peg$currPos -= trim; 603 | stops.pop('autourl'); 604 | return url; 605 | */ 606 | } ) &{return false, nil/* return r !== null; */} {return nil, nil/*return r; */} 607 | / &{return false, nil /*return stops.pop('autourl');*/ } 608 | 609 | // This is extracted from EXT_LINK_ADDR in Parser.php: a simplified 610 | // expression to match an IPv6 address. The IPv4 address and "at least 611 | // one character of a host name" portions are punted to the `path` 612 | // component of the `autourl` and `url` productions 613 | urladdr 614 | <- ( "[" [0-9A-Fa-f:.]+ "]" ) 615 | 616 | // ************************************************************ 617 | // Templates, -arguments and wikilinks 618 | // ************************************************************/ 619 | 620 | 621 | // Precedence: template arguments win over templates. See 622 | // http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence 623 | // 4: {{{{·}}}} → {·{{{·}}}·} 624 | // 5: {{{{{·}}}}} → {{·{{{·}}}·}} 625 | // 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} 626 | // 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} 627 | // This is only if close has > 3 braces; otherwise we just match open 628 | // and close as we find them. 629 | // 630 | tplarg_or_template 631 | <- &"{{" //&{return false, nil} 632 | // 633 | //// Refuse to recurse beyond `maxDepth` levels. 
Default in the PHP parser 634 | //// is $wgMaxTemplateDepth = 40; This is to prevent crashing from 635 | //// buggy wikitext with lots of unclosed template calls, as in 636 | //// eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094 637 | // if (stops.onCount('templatedepth') === undefined || 638 | // stops.onCount('templatedepth') < env.conf.parsoid.maxDepth) { 639 | // return true; 640 | // } else { 641 | // return false; 642 | // } 643 | t:tplarg_or_template_guarded {return t, nil /*return t;*/ } 644 | 645 | tplarg_or_template_guarded 646 | <- #{inc(c, "templatedepth"); return nil /* return stops.inc('templatedepth');*/ } 647 | r:( &("{{" &("{{{"+ !'{') tplarg) a:(template/broken_template) {return a, nil /*return a;*/ } 648 | / a:('{' &("{{{"+ !'{'))? b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ } 649 | / a:('{' &("{{" !'{'))? b:template {return concat(a, b), nil /*return [a].concat(b);*/ } 650 | / a:broken_template {return a, nil /*return a;*/ } 651 | ) #{ 652 | dec(c, "templatedepth") 653 | return nil 654 | } { 655 | return r, nil 656 | /* 657 | stops.dec('templatedepth'); 658 | return r; 659 | */ 660 | } 661 | 662 | tplarg_or_template_or_bust 663 | <- (tplarg_or_template / .)+ 664 | 665 | template 666 | <- #{ 667 | push(c, "level", push(c, "preproc", /*{{*/ "}}")) 668 | return nil 669 | /* return stops.push('preproc', / * {{ * /"}}"); */ 670 | } 671 | t:template_preproc 672 | #{ 673 | popTo(c, "preproc", pop(c, "level").(int)) 674 | return nil 675 | } 676 | {return t, nil/* stops.popTo('preproc', stopLen); return t; */} 677 | 678 | // The PHP preprocessor maintains a single stack of "closing token we 679 | // are currently looking for", with no backtracking. This means that 680 | // once you see `[[ {{` you are looking only for `}}` -- if that template 681 | // turns out to be broken you will never pop the `}}` and there is no way 682 | // to close the `[[`. 
Since the PEG tokenizer in Parsoid uses backtracking 683 | // and parses in a single pass (instead of PHP's split preprocessor/parser) 684 | // we have to be a little more careful when we emulate this behavior. 685 | // If we use a rule like: 686 | // template = "{{" tplname tplargs* "}}"? 687 | // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it 688 | // turns out we never find the `}}`, which involves a lot of tedious gluing 689 | // tokens back together with fingers crossed we haven't discarded any 690 | // significant newlines/whitespace/etc. An alternative would be a rule like: 691 | // broken_template = "{{" tlb 692 | // but again, `template` is used in many different contexts; `tlb` isn't 693 | // necessarily the right one to recursively invoke. Instead we get the 694 | // broken template off of the PEGjs production stack by returning immediately 695 | // after `{{`, but we leave a "broken token" on top of the preprocessor 696 | // stops stack to indicate we're "still in" the {{ context and shouldn't 697 | // ever inlineBreak for any closing tokens above this one. For example: 698 | // [[Foo{{Bar]] 699 | // This will match as: 700 | // wikilink->text,template->text --> FAILS looking for }} 701 | // backtracks, popping "]]" and "}}" off preproc stack 702 | // wikilink->text,broken_template,text --> FAILS looking for ]] 703 | // backtracks, popping "]]" and "broken" off preproc stack 704 | // broken_wikilink,text,broken_template,text --> OK 705 | // with ["broken", "broken"] left on the preproc stops stack 706 | // Note that we use stops.popTo() to make sure the preproc stack is 707 | // cleaned up properly during backtracking, even if there were broken-FOO 708 | // productions taken which (deliberately) left elements on the preproc stack. 
709 | 710 | broken_template 711 | <- &"{{" #{push(c, "preproc", "broken"); return nil/* return stops.push('preproc', 'broken'); */} 712 | // for broken-template, deliberately fail to pop the preproc stops stack 713 | t:"{{" 714 | #{pop(c, "preproc"); return nil} 715 | {return t, nil/* return t; */} 716 | 717 | template_preproc 718 | <- "{{" nl_comment_space* 719 | target:template_param_value 720 | attributes:(nl_comment_space* "|" 721 | r:( 722 | nl_comment_space* 723 | &("|" / "}}") 724 | {return nil, nil/* return new KV('', tu.flattenIfArray(v), [p0, p0, p0, 725 | p]);*/ 726 | } // empty argument 727 | / template_param 728 | ) {return r, nil/* return r; */} 729 | )* 730 | nl_comment_space* 731 | inline_breaks "}}" { 732 | opts, ok := c.globalStore["opts"].(opts) 733 | if !ok { 734 | return nil, nil 735 | } 736 | if opts.templateHandler == nil { 737 | return nil, nil 738 | } 739 | var attrs []Attribute 740 | for _, attr := range flatten(attributes) { 741 | attr := attr.(Attribute) 742 | attrs = append(attrs, attr) 743 | } 744 | val, err := opts.templateHandler(strings.TrimSpace(concat(target)), attrs) 745 | if err != nil { 746 | return fmt.Sprintf("{{ template error: %s }}", err.Error()), nil 747 | } 748 | return val, nil 749 | /* 750 | // Insert target as first positional attribute, so that it can be 751 | // generically expanded. The TemplateHandler then needs to shift it out 752 | // again. 
753 | params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); 754 | var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() }); 755 | return obj; 756 | */ 757 | } / ("{{" space_or_newline* "}}") 758 | 759 | tplarg 760 | <- //("" {return nil, nil /*return stops.push('preproc', / * {{ * /"}}"); */}) 761 | t:(tplarg_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */} ) 762 | {return t, nil/* stops.popTo('preproc', stopLen); return t; */} 763 | 764 | tplarg_preproc 765 | <- "{{{" 766 | //("" {return nil, nil/* return endOffset(); */}) 767 | target:template_param_value? 768 | params:(nl_comment_space* "|" 769 | ( ("" {return nil, nil/* return endOffset(); */}) 770 | nl_comment_space* 771 | ("" {return nil, nil/* return endOffset(); */}) 772 | &("|" / "}}}") 773 | {return nil, nil/* return {return nil, nil tokens: v, srcOffsets: [p0, p1] }; */} // empty argument 774 | / template_param_value 775 | ) {return nil, nil/* return r; */} 776 | )* 777 | nl_comment_space* 778 | inline_breaks "}}}" {return concat(target, params), nil 779 | /* 780 | params = params.map(function(o) { 781 | var s = o.srcOffsets; 782 | return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]); 783 | }); 784 | if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; } 785 | // Insert target as first positional attribute, so that it can be 786 | // generically expanded. The TemplateHandler then needs to shift it out 787 | // again. 
788 | params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); 789 | var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() }); 790 | return obj; 791 | */ 792 | } 793 | 794 | template_param 795 | <- key:template_param_name 796 | val:( 797 | //("" {return nil, nil/* return endOffset(); */}) 798 | optionalSpaceToken 799 | "=" 800 | //("" {return nil, nil/* return endOffset(); */}) 801 | optionalSpaceToken 802 | tpv:template_param_value? {return tpv, nil 803 | /* 804 | return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] }; 805 | */ 806 | } 807 | )? { 808 | return Attribute{ 809 | Key: key, 810 | Val: val, 811 | }, nil 812 | /* 813 | if (val !== null) { 814 | if (val.value !== null) { 815 | return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); 816 | } else { 817 | return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); 818 | } 819 | } else { 820 | return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]); 821 | } 822 | */ 823 | } 824 | // empty parameter 825 | / & [|}] {return nil, nil 826 | /* 827 | return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]); 828 | */ 829 | } 830 | 831 | template_param_name 832 | <- & { 833 | push(c, "equal", true) 834 | return true, nil /*return stops.push('equal', true); */} 835 | tpt:(template_param_text / &'=' {return "", nil/* return ''; */}) 836 | { 837 | pop(c, "equal") 838 | return tpt, nil 839 | /* 840 | stops.pop('equal'); 841 | return tpt; 842 | */ 843 | } 844 | 845 | / & { 846 | pop(c, "equal") 847 | return false, nil 848 | /* return stops.pop('equal'); */ 849 | } 850 | 851 | template_param_value 852 | <- #{ push(c, "equal", false); return nil } 853 | tpt:template_param_text 854 | #{ pop(c, "equal"); return nil } 855 | { 856 | return tpt, nil 857 | /* 858 | 
stops.pop('equal'); 859 | return { tokens: tpt, srcOffsets: tsrOffsets() }; 860 | */ 861 | } 862 | 863 | template_param_text 864 | <- #{ 865 | push(c, "table", false) 866 | push(c, "extlink", false) 867 | push(c, "templateArg", true) 868 | push(c, "tableCellArg", false) 869 | inc(c, "template") 870 | return nil 871 | /* 872 | // re-enable tables within template parameters 873 | stops.push('table', false); 874 | stops.push('extlink', false); 875 | stops.push('templateArg', true); 876 | stops.push('tableCellArg', false); 877 | return stops.inc('template'); 878 | */ 879 | } 880 | il:(nested_block / newlineToken)+ #{ 881 | pop(c, "table") 882 | pop(c, "extlink") 883 | pop(c, "templateArg") 884 | pop(c, "tableCellArg") 885 | dec(c, "template") 886 | return nil 887 | } 888 | { 889 | return il, nil 890 | /* 891 | stops.pop('table'); 892 | stops.pop('extlink'); 893 | stops.pop('templateArg'); 894 | stops.pop('tableCellArg'); 895 | stops.dec('template'); 896 | // il is guaranteed to be an array -- so, tu.flattenIfArray will 897 | // always return an array 898 | var r = tu.flattenIfArray(il); 899 | if (r.length === 1 && r[0].constructor === String) { 900 | r = r[0]; 901 | } 902 | return r; 903 | */ 904 | } 905 | 906 | //// Language converter block markup of language variants: -{ ... }- 907 | 908 | // Note that "rightmost opening" precedence rule (see 909 | // https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means 910 | // that neither -{{ nor -{{{ are parsed as a -{ token, although 911 | // -{{{{ is (since {{{ has precedence over {{). 
912 | 913 | lang_variant_or_tpl 914 | <- &("-{" &("{{{"+ !'{') tplarg) a:lang_variant {return a, nil/* return a; */} 915 | / a:('-' &("{{{"+ !'{')) b:tplarg {return concat(a, b), nil /*return [a].concat(b);*/ } 916 | / a:('-' &("{{" "{{{"* !'{')) b:template {return concat(a, b), nil/* return [a].concat(b); */} 917 | / &"-{" a:lang_variant {return a, nil /*return a; */} 918 | 919 | broken_lang_variant 920 | <- &{return true, nil /*return stops.push('preproc', 'broken'); */} 921 | // for broken-lang-variant, deliberately fail to pop the stops stack 922 | r:"-{" {return r, nil /*return r; */} 923 | 924 | lang_variant 925 | <- ("" {return nil, nil /*return stops.push('preproc', /* -{ * / '}-'); */}) 926 | lv:(lang_variant_preproc / &{return false, nil /*return stops.popTo('preproc', stopLen); */}) 927 | {return lv, nil /*stops.popTo('preproc', stopLen); return lv; */} 928 | / broken_lang_variant 929 | 930 | lang_variant_preproc 931 | <- ("-{" {return nil, nil/* return startOffset(); */}) 932 | ( 933 | &{return false, nil /* return env.langConverterEnabled(); */} 934 | ff:opt_lang_variant_flags {return ff, nil 935 | /* 936 | // Avoid mutating cached expression results 937 | ff = Util.clone(ff, true); 938 | // if flags contains 'R', then don't treat ; or : specially inside. 939 | if (ff.flags) { 940 | ff.raw = ff.flags.has('R') || ff.flags.has('N'); 941 | } else if (ff.variants) { 942 | ff.raw = true; 943 | } 944 | return ff; 945 | */ 946 | } / 947 | &{return false, nil /*return !env.langConverterEnabled(); */} 948 | "" {return nil, nil 949 | /* 950 | // if language converter not enabled, don't try to parse inside. 
951 | return { raw: true }; 952 | */ 953 | } 954 | ) 955 | ( 956 | &{return false, nil /*return f.raw; */} lv:lang_variant_text {return lv, nil/* return [{ text: lv }]; */} 957 | / 958 | &{return false, nil /* return !f.raw; */} lv:lang_variant_option_list {return lv, nil/* return lv; */} 959 | ) 960 | inline_breaks 961 | ("}-" {return nil, nil/* return endOffset(); */}) {return "TODO lang_variant_preproc", nil 962 | /* 963 | 964 | if (!env.langConverterEnabled()) { 965 | return [ "-{", ts[0].text.tokens, "}-" ]; 966 | } 967 | var lvsrc = input.substring(lv0, lv1); 968 | var attribs = []; 969 | 970 | // Do a deep clone since we may be destructively modifying 971 | // (the `t[fld] = name;` below) the result of a cached expression 972 | ts = Util.clone(ts, true); 973 | 974 | ts.forEach(function(t) { 975 | // move token strings into KV attributes so that they are 976 | // properly expanded by early stages of the token pipeline 977 | ['text','from','to'].forEach(function(fld) { 978 | if (t[fld] === undefined) { return; } 979 | var name = 'mw:lv' + attribs.length; 980 | attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets)); 981 | t[fld] = name; 982 | }); 983 | }); 984 | return [ 985 | new SelfclosingTagTk( 986 | 'language-variant', 987 | attribs, 988 | {return nil, nil 989 | tsr: [lv0, lv1], 990 | src: lvsrc, 991 | flags: f.flags && Array.from(f.flags).sort(), 992 | variants: f.variants && Array.from(f.variants).sort(), 993 | original: f.original, 994 | flagSp: f.sp, 995 | texts: ts, 996 | }), 997 | ]; 998 | */ 999 | } 1000 | 1001 | opt_lang_variant_flags 1002 | <- f:( ff:lang_variant_flags "|" {return ff, nil/* return ff; */} )? 
{return f, nil 1003 | /* 1004 | // Collect & separate flags and variants into a set and ordered list 1005 | var flags = new Set(); 1006 | var variants = new Set(); 1007 | var flagList = []; 1008 | var flagSpace = []; 1009 | var variantList = []; 1010 | var variantSpace = []; 1011 | var useVariants = false; 1012 | var internalSp = []; // internal whitespace, for round-tripping 1013 | if (f !== null) { 1014 | // lang_variant_flags returns arrays in reverse order. 1015 | f.flags.reverse(); 1016 | f.sp.reverse(); 1017 | var spPtr = 0; 1018 | f.flags.forEach(function(item) { 1019 | if (item.flag) { 1020 | flagSpace.push(f.sp[spPtr++]); 1021 | flags.add(item.flag); 1022 | flagList.push(item.flag); 1023 | flagSpace.push(f.sp[spPtr++]); 1024 | } 1025 | if (item.variant) { 1026 | variantSpace.push(f.sp[spPtr++]); 1027 | variants.add(item.variant); 1028 | variantList.push(item.variant); 1029 | variantSpace.push(f.sp[spPtr++]); 1030 | } 1031 | }); 1032 | if (spPtr < f.sp.length) { 1033 | // handle space after a trailing semicolon 1034 | flagSpace.push(f.sp[spPtr]); 1035 | variantSpace.push(f.sp[spPtr]); 1036 | } 1037 | } 1038 | // Parse flags (this logic is from core/languages/ConverterRule.php 1039 | // in the parseFlags() function) 1040 | if (flags.size === 0 && variants.size === 0) { 1041 | flags.add('$S'); 1042 | } else if (flags.has('R')) { 1043 | flags = new Set(['R']); // remove other flags 1044 | } else if (flags.has('N')) { 1045 | flags = new Set(['N']); // remove other flags 1046 | } else if (flags.has('-')) { 1047 | flags = new Set(['-']); // remove other flags 1048 | } else if (flags.has('T') && flags.size === 1) { 1049 | flags.add('H'); 1050 | } else if (flags.has('H')) { 1051 | // Replace A flag, and remove other flags except T and D 1052 | var nf = new Set(['$+', 'H']); 1053 | if (flags.has('T')) { nf.add('T'); } 1054 | if (flags.has('D')) { nf.add('D'); } 1055 | flags = nf; 1056 | } else if (variants.size > 0) { 1057 | useVariants = true; 1058 | } else { 1059 
| if (flags.has('A')) { 1060 | flags.add('$+'); 1061 | flags.add('$S'); 1062 | } 1063 | if (flags.has('D')) { 1064 | flags.delete('$S'); 1065 | } 1066 | } 1067 | if (useVariants) { 1068 | return { variants: variants, original: variantList, sp: variantSpace }; 1069 | } else { 1070 | return { flags: flags, original: flagList, sp: flagSpace }; 1071 | } 1072 | */ 1073 | } 1074 | 1075 | lang_variant_flags 1076 | <- (space_or_newline*) lang_variant_flag (space_or_newline*) 1077 | ( ";" lang_variant_flags? )? {return nil, nil 1078 | /* 1079 | var r = more && more[1] ? more[1] : { sp: [], flags: [] }; 1080 | // Note that sp and flags are in reverse order, since we're using 1081 | // right recursion and want to push instead of unshift. 1082 | r.sp.push(sp2.join('')); 1083 | r.sp.push(sp1.join('')); 1084 | r.flags.push(f); 1085 | return r; 1086 | */ 1087 | } 1088 | / (space_or_newline*) {return nil, nil 1089 | /* 1090 | return { sp: [ sp.join('') ], flags: [] }; 1091 | */ 1092 | } 1093 | 1094 | lang_variant_flag 1095 | <- [-+A-Z] {return nil, nil /*return { flag: f }; */} 1096 | / lang_variant_name {return nil, nil/* return { variant: v }; */} 1097 | / (!space_or_newline !nowiki [^{}|;])+ {return nil, nil/* return { bogus: b.join('') }; /* 1098 | bad flag * /*/} 1099 | 1100 | lang_variant_name // language variant name, like zh, zh-cn, etc. 1101 | <- [a-z] [-a-z]+ {return nil, nil/* return h + t.join(''); */} 1102 | // Escaped otherwise-unrepresentable language names 1103 | // Primarily for supporting html2html round trips; PHP doesn't support 1104 | // using nowikis here (yet!) 1105 | / nowiki_text 1106 | 1107 | lang_variant_option_list 1108 | <- lang_variant_option ( ";" lang_variant_option {return nil, nil/* return oo; */})* 1109 | ( ";" space_or_newline* )? 
// optional trailing semicolon 1110 | {return nil, nil 1111 | /* 1112 | var r = [ o ].concat(rest); 1113 | if (tr) { r.push({ semi: true, sp: tr[1].join('') }); } 1114 | return r; 1115 | */ 1116 | } 1117 | / lang_variant_text {return nil, nil/* return [{ text: lvtext }]; */} 1118 | 1119 | lang_variant_option 1120 | <- (space_or_newline*) lang_variant_name 1121 | (space_or_newline*) ":" 1122 | (space_or_newline*) 1123 | (lang_variant_nowiki / lang_variant_text_no_semi) 1124 | {return nil, nil 1125 | /* 1126 | return { 1127 | twoway: true, 1128 | lang: lang, 1129 | text: lvtext, 1130 | sp: [sp1.join(''), sp2.join(''), sp3.join('')] 1131 | }; 1132 | */ 1133 | } 1134 | / (space_or_newline*) 1135 | (lang_variant_nowiki / lang_variant_text_no_semi_or_arrow) 1136 | "=>" 1137 | (space_or_newline*) lang_variant_name 1138 | (space_or_newline*) ":" 1139 | (space_or_newline*) 1140 | (lang_variant_nowiki / lang_variant_text_no_semi) 1141 | {return nil, nil 1142 | /* 1143 | return { 1144 | oneway: true, 1145 | from: from, 1146 | lang: lang, 1147 | to: to, 1148 | sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')] 1149 | }; 1150 | */ 1151 | } 1152 | 1153 | // html2wt support: If a language name or conversion string can't be 1154 | // represented w/o breaking wikitext, just wrap it in a . 1155 | // PHP doesn't support this (yet), but Parsoid does. 
1156 | lang_variant_nowiki 1157 | <- ("" {return nil, nil/*return startOffset();*/}) 1158 | nowiki_text 1159 | ("" {return nil, nil/* return endOffset();*/}) 1160 | space_or_newline* {return nil, nil 1161 | /* 1162 | return { tokens: [ n ], srcOffsets: [start, end] }; 1163 | */ 1164 | } 1165 | 1166 | lang_variant_text 1167 | <- ("" {return nil, nil/*return startOffset();*/}) 1168 | (inlineline / "|" )* 1169 | ("" {return nil, nil/*return endOffset();*/}) 1170 | {return nil, nil/* return { tokens: tokens || [], srcOffsets: [start, end] }; */} 1171 | 1172 | lang_variant_text_no_semi 1173 | <- & {return false, nil/* return stops.push('semicolon', true); */} 1174 | lang_variant_text 1175 | {return nil, nil/* stops.pop('semicolon'); return lvtext; */} 1176 | / & {return false, nil/* return stops.pop('semicolon'); */} 1177 | 1178 | lang_variant_text_no_semi_or_arrow 1179 | <- & {return false, nil/* return stops.push('arrow', true); */} 1180 | lang_variant_text_no_semi {return nil, nil/* stops.pop('arrow'); return lvtext; */} 1181 | / & {return false, nil/* return stops.pop('arrow'); */} 1182 | 1183 | wikilink_content 1184 | <- (pipe lt:link_text? { 1185 | return lt, nil 1186 | /* 1187 | var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]); 1188 | maybeContent.vsrc = input.substring(startPos, endOffset()); 1189 | return maybeContent; 1190 | */ 1191 | })* 1192 | 1193 | wikilink <- wikilink_preproc / broken_wikilink 1194 | 1195 | // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the 1196 | // second bracket could start an extlink. Deliberately leave entry 1197 | // on preproc stack since we haven't seen a double-close bracket. 1198 | // (See full explanation above broken_template production.) 
1199 | broken_wikilink 1200 | <- &"[[" #{ 1201 | push(c, "preproc", "broken") 1202 | return nil 1203 | /* return stops.push('preproc', 'broken'); */ 1204 | } 1205 | a:("[" (extlink / "[")) 1206 | #{ pop(c, "preproc"); return nil } 1207 | { 1208 | return a, nil 1209 | /* return a; */ 1210 | } 1211 | 1212 | wikilink_preproc 1213 | <- "[[" 1214 | #{ push(c, "preproc", "]]"); return nil } 1215 | target:wikilink_preprocessor_text? 1216 | //("" {return nil, nil/* return endOffset(); */}) 1217 | lcs:wikilink_content 1218 | inline_breaks "]]" 1219 | #{ pop(c, "preproc"); return nil } 1220 | { 1221 | targetStr := concat(target) 1222 | if strings.HasPrefix(targetStr, "File:") || strings.HasPrefix(targetStr, "Image:") { 1223 | n := &html.Node{ 1224 | Type: html.ElementNode, 1225 | Data: "div", 1226 | Attr: []html.Attribute{ 1227 | {Key: "class", Val: "image"}, 1228 | }, 1229 | } 1230 | link := &html.Node{ 1231 | Type: html.ElementNode, 1232 | Data: "a", 1233 | Attr: []html.Attribute{ 1234 | {Key: "href", Val: TitleToURL(targetStr)}, 1235 | }, 1236 | } 1237 | addChild(link, targetStr) 1238 | addChild(n, link) 1239 | children, ok := lcs.([]interface{}) 1240 | if ok && len(children) > 0 { 1241 | descDiv := &html.Node{ 1242 | Type: html.ElementNode, 1243 | Data: "div", 1244 | Attr: []html.Attribute{ 1245 | {Key: "class", Val: "caption"}, 1246 | }, 1247 | } 1248 | addChild(descDiv, children[len(children)-1]) 1249 | addChild(n, descDiv) 1250 | } 1251 | return n, nil 1252 | } 1253 | n := &html.Node{ 1254 | Type: html.ElementNode, 1255 | Data: "a", 1256 | Attr: []html.Attribute{ 1257 | {Key: "href", Val: TitleToURL(targetStr)}, 1258 | }, 1259 | } 1260 | if !addChild(n, lcs) { 1261 | addChild(n, targetStr) 1262 | } 1263 | return n, nil 1264 | /* 1265 | var pipeTrick = (lcs.length === 1 && lcs[0].v === null); 1266 | var textTokens = []; 1267 | if (target === null || pipeTrick) { 1268 | textTokens.push("[["); 1269 | if (target) { 1270 | textTokens.push(target); 1271 | } 1272 | 
lcs.forEach(function(a) { 1273 | // a is a mw:maybeContent attribute 1274 | textTokens.push("|"); 1275 | if (a.v !== null) { textTokens.push(a.v); } 1276 | }); 1277 | textTokens.push("]]"); 1278 | return textTokens; 1279 | } 1280 | var obj = new SelfclosingTagTk('wikilink'); 1281 | var hrefKV = new KV('href', target); 1282 | hrefKV.vsrc = input.substring(startOffset() + 2, tpos); 1283 | // XXX: Point to object with path, revision and input information 1284 | // obj.source = input; 1285 | obj.attribs.push(hrefKV); 1286 | obj.attribs = obj.attribs.concat(lcs); 1287 | obj.dataAttribs = { 1288 | tsr: tsrOffsets(), 1289 | src: text(), 1290 | }; 1291 | return [obj]; 1292 | */ 1293 | } 1294 | 1295 | // Tables are allowed inside image captions. 1296 | link_text 1297 | <- #{ 1298 | // Suppress the flag temporarily in this rule to consume the '=' here. 1299 | push(c, "equal", false) 1300 | push(c, "linkdesc", true) 1301 | return nil 1302 | } 1303 | c1:( // This group is similar to "block_line" but "list_item" 1304 | // is omitted since `doBlockLevels` happens after 1305 | // `replaceInternalLinks2`, where newlines are stripped. 1306 | (sol (heading / hr / full_table_in_link_caption)) 1307 | / urltext 1308 | / (!inline_breaks 1309 | r:( inline_element / '[' text_char+ ']' (&(!']' / "]]")) / . ) {return r, nil} 1310 | ) 1311 | )+ #{ 1312 | pop(c, "equal") 1313 | pop(c, "linkdesc") 1314 | return nil 1315 | } 1316 | { 1317 | return c1, nil 1318 | } 1319 | 1320 | // Generic quote rule for italic and bold, further processed in a token 1321 | // stream transformation in doQuotes. Relies on NlTk tokens being emitted 1322 | // for each line of text to balance quotes per line. 1323 | 1324 | // We are not using a simple pair rule here as we need to support mis-nested 1325 | // bolds/italics and MediaWiki's special heuristics for apostrophes, which are 1326 | // all not context free. 
*/ 1327 | quote <- ("''" "'"*) { 1328 | return &html.Node{ 1329 | Type: html.ElementNode, 1330 | Data: "b", 1331 | Attr: []html.Attribute{ 1332 | {Key: "_parsetoken"}, 1333 | }, 1334 | }, nil 1335 | /* 1336 | // sequences of four or more than five quotes are assumed to start 1337 | // with some number of plain-text apostrophes. 1338 | var plainticks = 0; 1339 | var result = []; 1340 | if (quotes.length === 4) { 1341 | plainticks = 1; 1342 | } else if (quotes.length > 5) { 1343 | plainticks = quotes.length - 5; 1344 | } 1345 | if (plainticks > 0) { 1346 | result.push(quotes.substring(0, plainticks)); 1347 | } 1348 | // mw-quote token Will be consumed in token transforms 1349 | var tsr = tsrOffsets(); 1350 | tsr[0] += plainticks; 1351 | var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr }); 1352 | mwq.value = quotes.substring(plainticks); 1353 | result.push(mwq); 1354 | return result; 1355 | */ 1356 | } 1357 | 1358 | 1359 | // ********************************************************* 1360 | // Pre and xmlish tags 1361 | // *********************************************************/ 1362 | 1363 | extension_tag <- 1364 | &{return false, nil /*return !stops.onStack('extTag'); */} 1365 | xmlish_tag 1366 | // Account for `maybeExtensionTag` returning unmatched start / end tags 1367 | &{return false, nil /* return extToken.name === 'extension'; */} 1368 | {return nil, nil/* return extToken; */} 1369 | 1370 | nowiki 1371 | <- extension_tag 1372 | &{return false, nil /* return extToken.getAttribute('name') === 'nowiki'; */} 1373 | {return nil, nil/* return extToken; */} 1374 | 1375 | // Used by nowiki extension to tokenize html entities. 1376 | nowiki_content 1377 | <- c2:(htmlentity / .)* {return c2, nil/* return tu.flattenIfArray(c); */} 1378 | 1379 | // Used by lang_variant productions to protect special language names or 1380 | // conversion strings. 
1381 | nowiki_text 1382 | <- nowiki 1383 | {return nil, nil 1384 | /* 1385 | var txt = Util.getExtArgInfo(extToken).dict.body.extsrc; 1386 | return Util.decodeEntities(txt); 1387 | */ 1388 | } 1389 | 1390 | // Generic XML-like tags 1391 | 1392 | // These also cover extensions (including Cite), which will hook into the 1393 | // token stream for further processing. The content of extension tags is 1394 | // parsed as regular inline, but the source positions of the tag are added 1395 | // to allow reconstructing the unparsed text from the input. */ 1396 | 1397 | // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1398 | // following paragraphs. 1399 | tag_name_chars <- [^\t\n\v />\x00] 1400 | tag_name <- ([A-Za-z] tag_name_chars*) 1401 | 1402 | xmlish_tag 1403 | <- # { 1404 | push(c, "table", false) 1405 | push(c, "tableCellArg", false) 1406 | return nil 1407 | } 1408 | // By the time we get to `doTableStuff` in the php parser, we've already 1409 | // safely encoded element attributes. See 55313f4e in core. 1410 | // stops.push('table', false); 1411 | // stops.push('tableCellArg', false); 1412 | //return true; 1413 | //} 1414 | "<" end:"/"? 1415 | name:(tag_name & {return true, nil} 1416 | ///* 1417 | // return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed. 1418 | // */ 1419 | //} 1420 | ) 1421 | attribs:generic_newline_attributes 1422 | space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff 1423 | selfclose:"/"? 
1424 | space* // not preserved - canonicalized on RT via dirty diff 1425 | ">" 1426 | #{ 1427 | pop(c, "table") 1428 | pop(c, "tableCellArg") 1429 | pop(c, "extTag") 1430 | return nil 1431 | } 1432 | { 1433 | n := &html.Node{ 1434 | Type: html.ElementNode, 1435 | Data: concat(name), 1436 | } 1437 | 1438 | for _, attr := range flatten(attribs) { 1439 | attr := attr.(html.Attribute) 1440 | n.Attr = append(n.Attr, attr) 1441 | } 1442 | 1443 | if end != nil { 1444 | n.Attr = append(n.Attr, html.Attribute{Key:"_parseend"}) 1445 | } else if selfclose == nil { 1446 | n.Attr = append(n.Attr, html.Attribute{Key:"_parsestart"}) 1447 | } 1448 | 1449 | return n, nil 1450 | /* 1451 | stops.pop('table'); 1452 | stops.pop('tableCellArg'); 1453 | stops.pop('extTag'); 1454 | 1455 | var lcName = name.toLowerCase(); 1456 | 1457 | // Extension tags don't necessarily have the same semantics as html tags, 1458 | // so don't treat them as void elements. 1459 | var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName); 1460 | 1461 | // Support
      1462 | if (lcName === 'br' && end) { 1463 | end = null; 1464 | } 1465 | 1466 | var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets()); 1467 | 1468 | // change up data-attribs in one scenario 1469 | // void-elts that aren't self-closed ==> useful for accurate RT-ing 1470 | if (!selfclose && isVoidElt) { 1471 | res.dataAttribs.selfClose = undefined; 1472 | res.dataAttribs.noClose = true; 1473 | } 1474 | 1475 | return maybeExtensionTag(res); 1476 | */ 1477 | } 1478 | 1479 | 1480 | // A variant of xmlish_tag, but also checks if the tag name is a block-level 1481 | // tag as defined in 1482 | // http://www.w3.org/TR/html5/syntax.html#tag-open-state and 1483 | // following paragraphs. 1484 | // 1485 | block_tag 1486 | <- & { 1487 | // By the time we get to `doTableStuff` in the php parser, we've already 1488 | // safely encoded element attributes. See 55313f4e in core. 1489 | push(c, "table", false) 1490 | push(c, "tableCellArg", false) 1491 | return true, nil 1492 | } 1493 | "<" "/"? 1494 | (tag_name & { 1495 | push(c, "extTag", false) 1496 | return false, nil 1497 | } 1498 | //#/* 1499 | //# return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed. 1500 | //# */ 1501 | //#} 1502 | ) 1503 | generic_newline_attributes 1504 | space_or_newline* 1505 | "/"? 1506 | ">" { 1507 | pop(c, "table") 1508 | pop(c, "tableCellArg") 1509 | pop(c, "extTag") 1510 | return nil, nil 1511 | /* 1512 | stops.pop('table'); 1513 | stops.pop('tableCellArg'); 1514 | stops.pop('extTag'); 1515 | var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets()); 1516 | return [maybeExtensionTag(t)]; 1517 | */ 1518 | } 1519 | / "<" "/"? tag_name & { 1520 | pop(c, "extTag") 1521 | return false, nil 1522 | } 1523 | / & { 1524 | pop(c, "table") 1525 | pop(c, "tableCellArg") 1526 | return false, nil 1527 | } 1528 | 1529 | // A generic attribute that can span multiple lines. 
1530 | generic_newline_attribute 1531 | <- space_or_newline* 1532 | ("" {return nil, nil/* return endOffset(); */}) 1533 | key:generic_attribute_name 1534 | ("" {return nil, nil/* return endOffset(); */}) 1535 | val:(space_or_newline* "=" v:generic_att_value? {return v, nil/* return v; */})? 1536 | {return html.Attribute{Key: concat(key), Val: concat(val)}, nil 1537 | /* 1538 | // NB: Keep in sync w/ table_attibute 1539 | var res; 1540 | // Encapsulate protected attributes. 1541 | if (typeof name === 'string') {return nil, nil 1542 | name = tu.protectAttrs(name); 1543 | } 1544 | if (vd !== null) { 1545 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1546 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1547 | } else { 1548 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1549 | } 1550 | if (Array.isArray(name)) { 1551 | res.ksrc = input.substring(namePos0, namePos); 1552 | } 1553 | return res; 1554 | */ 1555 | } 1556 | 1557 | // A single-line attribute. 1558 | table_attribute 1559 | <- optionalSpaceToken 1560 | ("" {return nil, nil /* return endOffset(); */}) 1561 | table_attribute_name 1562 | ("" {return nil, nil /* return endOffset(); */}) 1563 | (optionalSpaceToken "=" table_att_value? {return nil, nil /* return v; */})? 1564 | {return nil,nil 1565 | /* 1566 | // NB: Keep in sync w/ generic_newline_attribute 1567 | var res; 1568 | // Encapsulate protected attributes. 
1569 | if (typeof name === 'string') { 1570 | name = tu.protectAttrs(name); 1571 | } 1572 | if (vd !== null) { 1573 | res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); 1574 | res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); 1575 | } else { 1576 | res = new KV(name, '', [namePos0, namePos, namePos, namePos]); 1577 | } 1578 | if (Array.isArray(name)) { 1579 | res.ksrc = input.substring(namePos0, namePos); 1580 | } 1581 | return res; 1582 | */ 1583 | } 1584 | 1585 | // The arrangement of chars is to emphasize the split between what's disallowed 1586 | // by html5 and what's necessary to give directive a chance. 1587 | // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 1588 | generic_attribute_name 1589 | <- q:(["'=]?) // From #before-attribute-name-state, < is omitted for directive 1590 | r:( [^ \t\r\n\x00/=><&{}!|-]+ 1591 | / !inline_breaks 1592 | // \0/=> is the html5 attribute name set we do not want. 1593 | t:( directive / !( space_or_newline / [\x00/=>] ) c2:. { return c2, nil /*return c;*/ } 1594 | ) {return t, nil /*return t; */} 1595 | )* 1596 | & { 1597 | return len(flatten(r))>0 || len(flatten(q))>0, nil 1598 | /* return r.length > 0 || q.length > 0; */ 1599 | } 1600 | {return concat(q, r), nil /* return tu.flattenString([q].concat(r)); */} 1601 | 1602 | // Also accept these chars in a wikitext table or tr attribute name position. 1603 | // They are normally not matched by the table_attribute_name. 1604 | broken_table_attribute_name_char <- [\x00/=>] {return nil, nil /* return new KV(c, ''); */} 1605 | 1606 | // Same as generic_attribute_name, except for accepting tags and wikilinks. 1607 | // (That doesn't make sense (ie. match php) in the generic case.) 1608 | // We also give a chance to break on \[ (see T2553). 1609 | table_attribute_name 1610 | <- (["'=]?) 
// From #before-attribute-name-state, < is omitted for directive 1611 | ( [^ \t\r\n\x00/=><&{}!|[-]+ 1612 | / !inline_breaks 1613 | // \0/=> is the html5 attribute name set we do not want. 1614 | ( wikilink 1615 | / directive 1616 | // Accept insane tags-inside-attributes as attribute names. 1617 | // The sanitizer will strip and shadow them for roundtripping. 1618 | // Example: generated with.. 1619 | / &xmlish_tag inlineline {return nil, nil/* return ill; */} 1620 | / !( space_or_newline / [\x00/=>] ) . {return nil, nil/* return c; */} 1621 | ) {return nil, nil/* return t; */} 1622 | )* 1623 | & {return false, nil/* return r.length > 0 || q.length > 0; */} 1624 | {return nil, nil/* return tu.flattenString([q].concat(r)); */} 1625 | 1626 | // Attribute value, quoted variants can span multiple lines. 1627 | // Missing end quote: accept /> look-ahead as heuristic. 1628 | // These need to be kept in sync with the attribute_preprocessor_text_* 1629 | generic_att_value 1630 | <- (space_or_newline* "'") t:attribute_preprocessor_text_single? ("'" / &('/'? '>')) { 1631 | return t, nil 1632 | /* 1633 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1634 | */ 1635 | } 1636 | / (space_or_newline* '"') t:attribute_preprocessor_text_double? ('"' / &('/'? '>')) { 1637 | return t, nil 1638 | /* 1639 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1640 | */ 1641 | } 1642 | / space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { 1643 | return t, nil 1644 | /* 1645 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1646 | */ 1647 | } 1648 | 1649 | // Attribute value, restricted to a single line. 1650 | // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. 1651 | // These need to be kept in sync with the table_attribute_preprocessor_text_* 1652 | table_att_value 1653 | <- (space* "'") table_attribute_preprocessor_text_single? ("'" / &("!!" 
/ [|\r\n])) {return nil, nil 1654 | /* 1655 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1656 | */ 1657 | } 1658 | / (space* '"') table_attribute_preprocessor_text_double? ('"' / &("!!" / [|\r\n])) {return nil, nil 1659 | /* 1660 | return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); 1661 | */ 1662 | } 1663 | / space* table_attribute_preprocessor_text &(space_or_newline/ eof / "!!" / '|') {return nil, nil 1664 | /* 1665 | return tu.getAttrVal(t, startOffset() + s.length, endOffset()); 1666 | */ 1667 | } 1668 | 1669 | // ******************************************************* 1670 | // Lists 1671 | // *******************************************************/ 1672 | list_item <- dtdd / hacky_dl_uses / li 1673 | 1674 | li <- bullets:list_char+ 1675 | c2:inlineline? 1676 | // The inline_break is to check if we've hit a template end delimiter. 1677 | &(eolf / inline_breaks) 1678 | { 1679 | n := &html.Node{ 1680 | Type: html.ElementNode, 1681 | Data: "li", 1682 | } 1683 | addChild(n, c2) 1684 | return n ,nil 1685 | /* 1686 | // Leave bullets as an array -- list handler expects this 1687 | var tsr = tsrOffsets('start'); 1688 | tsr[1] += bullets.length; 1689 | var li = new TagTk('listItem', [], { tsr: tsr }); 1690 | li.bullets = bullets; 1691 | return [ li ].concat(c || []); 1692 | */ 1693 | } 1694 | 1695 | 1696 | // This rule is required to support wikitext of this form 1697 | // ::{|border="1"|foo|bar|baz|} 1698 | // where the leading colons are used to indent the entire table. 1699 | // This hack was added back in 2006 in commit 1700 | // a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl 1701 | // Fürstenberg. 1702 | // 1703 | hacky_dl_uses <- ":"+ 1704 | (table_line (sol table_line)*) 1705 | inlineline? 
1706 | &comment_space_eolf 1707 | {return nil,nil 1708 | /* 1709 | // Leave bullets as an array -- list handler expects this 1710 | var tsr = tsrOffsets('start'); 1711 | tsr[1] += bullets.length; 1712 | var li = new TagTk('listItem', [], { tsr: tsr }); 1713 | li.bullets = bullets; 1714 | return tu.flattenIfArray([li, tbl || [], line || []]); 1715 | */ 1716 | } 1717 | 1718 | dtdd 1719 | <- (!(";" !list_char) list_char {return nil, nil /*return lc;*/ })* 1720 | ";" 1721 | & {return false, nil/*return stops.inc('colon');*/} 1722 | inlineline? 1723 | (":" {return nil, nil /*return endOffset(); */}) 1724 | // Fortunately dtdds cannot be nested, so we can simply set the flag 1725 | // back to 0 to disable it. 1726 | & {return false, nil /*stops.counters.colon = 0; return true;*/} 1727 | inlineline? 1728 | &eolf {return nil, nil 1729 | /* 1730 | // Leave bullets as an array -- list handler expects this 1731 | // TSR: +1 for the leading ";" 1732 | var numBullets = bullets.length + 1; 1733 | var tsr = tsrOffsets('start'); 1734 | tsr[1] += numBullets; 1735 | var li1 = new TagTk('listItem', [], { tsr: tsr }); 1736 | li1.bullets = bullets.slice(); 1737 | li1.bullets.push(";"); 1738 | // TSR: -1 for the intermediate ":" 1739 | var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); 1740 | li2.bullets = bullets.slice(); 1741 | li2.bullets.push(":"); 1742 | 1743 | return [ li1 ].concat(c || [], [ li2 ], d || []); 1744 | */ 1745 | } 1746 | // Fall-back case to clear the colon flag 1747 | / & {return false, nil /*stops.counters.colon = 0; return false; */} 1748 | 1749 | 1750 | list_char <- [*#:;] 1751 | 1752 | 1753 | 1754 | // **************************************************************************** 1755 | // Tables 1756 | // ------ 1757 | // Table rules are geared to support independent parsing of fragments in 1758 | // templates (the common table start / row / table end use case). 
The tokens 1759 | // produced by these fragments then match up to a table while building the 1760 | // DOM tree. For similar reasons, table rows do not emit explicit end tag 1761 | // tokens. 1762 | 1763 | // The separate table_line rule is faster than moving those rules 1764 | // directly to block_lines. 1765 | 1766 | // Notes about the full_table_in_link_caption rule 1767 | // ----------------------------------------------------- 1768 | // However, for link-tables, we have introduced a stricter parse wherein 1769 | // we require table-start and table-end tags to not come from a template. 1770 | // In addition, this new rule doesn't accept fosterable-content in 1771 | // the table unlike the more lax (sol table_line)+ rule. 1772 | 1773 | // This is the best we can do at this time since we cannot distinguish 1774 | // between table rows and image options entirely in the tokenizer. 1775 | 1776 | // Consider the following examples: 1777 | 1778 | // Example 1: 1779 | 1780 | // [[Image:Foo.jpg|left|30px|Example 1 1781 | // {{This-template-returns-a-table-start-tag}} 1782 | // |foo 1783 | // {{This-template-returns-a-table-end-tag}} 1784 | // ]] 1785 | 1786 | // Example 2: 1787 | 1788 | // [[Image:Foo.jpg|left|30px|Example 1 1789 | // {{echo|a}} 1790 | // |foo 1791 | // {{echo|b}} 1792 | // ]] 1793 | 1794 | // So, we cannot know a priori (without preprocessing or fully expanding 1795 | // all templates) if "|foo" in the two examples is a table cell or an image 1796 | // option. This is a limitation of our tokenizer-based approach compared to 1797 | // the preprocessing-based approach of the PHP parser. 1798 | 1799 | // Given this limitation, we are okay forcing a full-table context in 1800 | // link captions (if necessary, we can relax the fosterable-content requirement 1801 | // but that is broken wikitext anyway, so we can force that edge-case wikitext 1802 | // to get fixed by rejecting it). 
1803 | // ****************************************************************************/ 1804 | 1805 | full_table_in_link_caption 1806 | <- (! inline_breaks / & "{{!}}" ) 1807 | ( 1808 | // Note that "linkdesc" is suppressed here to provide a nested parsing 1809 | // context in which to parse the table. Otherwise, we may break on 1810 | // on pipes in the `table_start_tag` and `table_row_tag` attributes. 1811 | // However, as a result, this can be more permissive than the current 1812 | // php implementation, but likelier to match the users intent. 1813 | & {return false, nil /*stops.push('linkdesc', false); return stops.push('table', true); 1814 | */} 1815 | ( 1816 | table_start_tag optionalNewlines 1817 | // Accept multiple end tags since a nested table may have been 1818 | // opened in the table content line. 1819 | ((sol (table_content_line / tplarg_or_template) optionalNewlines)* 1820 | sol table_end_tag)+ 1821 | ){return nil, nil 1822 | /* 1823 | stops.pop('linkdesc'); 1824 | stops.pop('table'); 1825 | return tbl; 1826 | */ 1827 | } 1828 | / & {return false, nil/* stops.pop('linkdesc'); return stops.pop('table'); */} 1829 | ) {return nil, nil/* return r; */} 1830 | 1831 | // This rule assumes start-of-line position! 1832 | table_line 1833 | <- (! 
inline_breaks / & "{{!}}" ) 1834 | ( 1835 | & {return false, nil /* return stops.push('table', true); */} 1836 | ( 1837 | table_start_tag optionalNewlines 1838 | / table_content_line optionalNewlines 1839 | / table_end_tag 1840 | ) {return nil, nil 1841 | /* 1842 | stops.pop('table'); 1843 | return tl; 1844 | */ 1845 | } 1846 | / & {return false, nil /* return stops.pop('table'); */} 1847 | ) {return nil, nil/* return r; */} 1848 | 1849 | table_content_line <- (space / comment)* ( 1850 | table_heading_tags 1851 | / table_row_tag 1852 | / table_data_tags 1853 | / table_caption_tag 1854 | ) 1855 | 1856 | table_start_tag 1857 | <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) "{" pipe 1858 | // ok to normalize away stray |} on rt (see T59360) 1859 | & {return false, nil /* return stops.push('table', false); */} 1860 | table_attributes 1861 | ("" {return nil, nil/* stops.pop('table'); return endOffset(); */}) 1862 | {return nil, nil 1863 | /* 1864 | var coms = tu.popComments(ta); 1865 | if (coms) { 1866 | tsEndPos = coms.commentStartPos; 1867 | } 1868 | 1869 | var da = { tsr: [startPos, tsEndPos] }; 1870 | if (p !== "|") { 1871 | // Variation from default 1872 | da.startTagSrc = b + p; 1873 | } 1874 | 1875 | sc.push(new TagTk('table', ta, da)); 1876 | if (coms) { 1877 | sc = sc.concat(coms.buf); 1878 | } 1879 | return sc; 1880 | */ 1881 | } 1882 | 1883 | // FIXME: Not sure if we want to support it, but this should allow columns. 1884 | table_caption_tag 1885 | // avoid recursion via nested_block_in_table 1886 | <- ! {return true, nil /*return stops.onStack('tableDataBlock');*/ } 1887 | pipe "+" 1888 | row_syntax_table_args? 1889 | ("" {return nil, nil /*return endOffset();*/ }) 1890 | nested_block_in_table* {return nil, nil 1891 | /* 1892 | return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true); 1893 | */ 1894 | } 1895 | 1896 | table_row_tag 1897 | <- // avoid recursion via nested_block_in_table 1898 | ! 
{return true, nil /*return stops.onStack('tableDataBlock'); */} 1899 | pipe "-"+ 1900 | & {return false, nil /* return stops.push('table', false); */} 1901 | table_attributes 1902 | ("" {return nil, nil/* stops.pop('table'); return endOffset(); */}) 1903 | {return nil, nil 1904 | /* 1905 | var coms = tu.popComments(a); 1906 | if (coms) { 1907 | tagEndPos = coms.commentStartPos; 1908 | } 1909 | 1910 | var da = { 1911 | tsr: [ startOffset(), tagEndPos ], 1912 | startTagSrc: p + dashes, 1913 | }; 1914 | 1915 | // We rely on our tree builder to close the row as needed. This is 1916 | // needed to support building tables from fragment templates with 1917 | // individual cells or rows. 1918 | var trToken = new TagTk('tr', a, da); 1919 | 1920 | var res = [ trToken ]; 1921 | if (coms) { 1922 | res = res.concat(coms.buf); 1923 | } 1924 | return res; 1925 | */ 1926 | } 1927 | 1928 | tds 1929 | <- ( ( pipe_pipe / pipe & row_syntax_table_args {return nil, nil /*return p;*/ } ) 1930 | table_data_tag {return nil, nil 1931 | /* 1932 | var da = tdt[0].dataAttribs; 1933 | da.stx = "row"; 1934 | da.tsr[0] -= pp.length; // include "||" 1935 | if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) { 1936 | // Variation from default 1937 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1938 | } 1939 | return tdt; 1940 | */ 1941 | } 1942 | )* 1943 | 1944 | // avoid recursion via nested_block_in_table 1945 | table_data_tags 1946 | <- ! {return true, nil/* return stops.onStack('tableDataBlock'); */} 1947 | pipe 1948 | ![+-] table_data_tag 1949 | ("" {return nil, nil/* return endOffset(); */}) 1950 | tds {return nil, nil 1951 | // blahaskjdf;alsdf;; 1952 | } 1953 | 1954 | table_data_tag 1955 | <- ! "}" 1956 | row_syntax_table_args? 
1957 | // use inline_breaks to break on tr etc 1958 | ("" {return nil, nil/* return endOffset(); */}) 1959 | nested_block_in_table* 1960 | {return nil, nil 1961 | /* 1962 | return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); 1963 | */ 1964 | } 1965 | 1966 | table_heading_tags 1967 | <- "!" 1968 | & {return false, nil /*return stops.push('th', endOffset()); */} 1969 | table_heading_tag 1970 | ( ("!!" / pipe_pipe) table_heading_tag {return nil, nil 1971 | /* 1972 | var da = tht[0].dataAttribs; 1973 | da.stx = 'row'; 1974 | da.tsr[0] -= pp.length; // include "!!" or "||" 1975 | 1976 | if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) { 1977 | // Variation from default 1978 | da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); 1979 | } 1980 | return tht; 1981 | */ 1982 | } 1983 | )* {return nil, nil 1984 | /* 1985 | stops.pop('th'); 1986 | th[0].dataAttribs.tsr[0]--; // include "!" 1987 | return th.concat(ths); 1988 | */ 1989 | } 1990 | / & {return false, nil /*return stops.onStack('th') !== false ? stops.pop('th') : false;*/ } 1991 | 1992 | table_heading_tag 1993 | <- row_syntax_table_args? 1994 | ("" {return nil, nil /*return endOffset();*/ }) 1995 | ( & {return false, nil 1996 | /* 1997 | // This SyntaxStop is only true until we hit the end of the line. 1998 | if (stops.onStack('th') !== false && 1999 | /\n/.test(input.substring(stops.onStack('th'), endOffset()))) { 2000 | // There's been a newline. Remove the break and continue 2001 | // tokenizing nested_block_in_tables. 
2002 | stops.pop('th'); 2003 | } 2004 | return true; 2005 | */ 2006 | } nested_block_in_table {return nil, nil/* return d; */} )* {return nil, nil 2007 | /* 2008 | return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); 2009 | */ 2010 | } 2011 | 2012 | table_end_tag 2013 | <- (space / comment)* ("" {return nil, nil/* return endOffset(); */}) pipe "}" {return nil, nil 2014 | /* 2015 | var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] }); 2016 | if (p !== "|") { 2017 | // p+"" is triggering some bug in pegJS 2018 | // I cannot even use that expression in the comment! 2019 | tblEnd.dataAttribs.endTagSrc = p + b; 2020 | } 2021 | return sc.concat([tblEnd]); 2022 | */ 2023 | } 2024 | 2025 | // 2026 | // Table parameters separated from the content by a single pipe. Does *not* 2027 | // match if followed by double pipe (row-based syntax). 2028 | // 2029 | row_syntax_table_args 2030 | <- & {return false, nil /* return stops.push('tableCellArg', return true, nil); */} 2031 | table_attributes space* pipe !pipe {return nil, nil 2032 | /* 2033 | stops.pop('tableCellArg'); 2034 | return [as, s, p]; 2035 | */ 2036 | } 2037 | / & {return false, nil /* return stops.pop('tableCellArg'); */} 2038 | 2039 | 2040 | // ***************************************************************** 2041 | // Text variants and other general rules 2042 | // *****************************************************************/ 2043 | 2044 | // All chars that cannot start syntactic structures in the middle of a line 2045 | // XXX: ] and other end delimiters should probably only be activated inside 2046 | // structures to avoid unnecessarily leaving the text rule on plain 2047 | // content. 2048 | 2049 | // TODO: Much of this is should really be context-dependent (syntactic 2050 | // flags). The wikilink_preprocessor_text rule is an example where 2051 | // text_char is not quite right and had to be augmented. 
Try to minimize / 2052 | // clarify this carefully! 2053 | // 2054 | 2055 | text_char <- [^'<~[{\n\r:;\]}|!=-] 2056 | 2057 | // Legend 2058 | // ' quotes (italic/bold) 2059 | // < start of xmlish_tag 2060 | // ~ signatures/dates 2061 | // [ start of links 2062 | // { start of parser functions, transclusion and template args 2063 | // \n all sort of block-level markup at start of line 2064 | // \r ditto 2065 | // A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC) 2066 | 2067 | // _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) 2068 | // ! and | table cell delimiters, might be better to specialize those 2069 | // = headings - also specialize those! 2070 | 2071 | // The following chars are also included for now, but only apply in some 2072 | // contexts and should probably be enabled only in those: 2073 | // : separate definition in ; term : definition 2074 | // ] end of link 2075 | // } end of parser func/transclusion/template arg 2076 | // - start of lang_variant -{ ... }- 2077 | // ; separator in lang_variant 2078 | // 2079 | 2080 | urltext <- ( [^-'<~[{\n/A-Za-z_|!:;\]} &=]+ 2081 | / & [/A-Za-z] al:autolink {return al, nil /*return al;*/ } 2082 | / & "&" he:htmlentity {return he, nil /*return he;*/ } 2083 | // Convert trailing space into   2084 | // XXX: This should be moved to a serializer 2085 | // This is a hack to force a whitespace display before the colon 2086 | / ' ' & ':' {return " ", nil 2087 | /* 2088 | var toks = Util.placeholder('\u00a0', { 2089 | ' ', 2090 | tsr: tsrOffsets('start'), 2091 | isDisplayHack: true, 2092 | }, { tsr: tsrOffsets('end'), isDisplayHack: true }); 2093 | var typeOf = toks[0].getAttribute('typeof'); 2094 | toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf); 2095 | return toks; 2096 | */ 2097 | } 2098 | / & ("__") bs:behavior_switch {return bs, nil /*return bs;*/ } 2099 | // About 96% of text_char calls originate here. 2100 | // pegjs 0.8 inlines this simple rule automatically. 
2101 | / text_char )+ 2102 | 2103 | raw_htmlentity <- ("&" [#0-9a-zA-Z]+ ";") {return nil, nil 2104 | /* 2105 | return Util.decodeEntities(m); 2106 | */ 2107 | } 2108 | 2109 | htmlentity <- raw_htmlentity {return nil, nil 2110 | /* 2111 | // if this is an invalid entity, don't tag it with 'mw:Entity' 2112 | if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters * /) { 2113 | return cc; 2114 | } 2115 | return [ 2116 | new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }), 2117 | cc, 2118 | new EndTagTk('span', [], { tsr: tsrOffsets('end') }), 2119 | ]; 2120 | */ 2121 | } 2122 | 2123 | spaces <- [ \t]+ 2124 | 2125 | space <- [ \t] 2126 | 2127 | optionalSpaceToken <- space* 2128 | 2129 | // This rule corresponds to \s in the PHP preg_* functions, 2130 | // which is used frequently in the PHP parser. The inclusion of 2131 | // form feed (but not other whitespace, like vertical tab) is a quirk 2132 | // of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular 2133 | // Expressions) library. 2134 | // 2135 | space_or_newline 2136 | <- [ \t\n\r\x0c] 2137 | 2138 | // This rule corresponds to \b in the PHP preg_* functions, 2139 | // after a word character. That is, it's a zero-width lookahead that 2140 | // the next character is not a word character. 2141 | // 2142 | end_of_word 2143 | <- eof / ![A-Za-z0-9_] 2144 | 2145 | // Unicode "separator, space" category. It covers the \u0020 space as well 2146 | // as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}. 2147 | // Keep this up-to-date with the characters tagged ;Zs; in 2148 | // http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 2149 | unispace <- [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000] 2150 | 2151 | // Non-newline whitespace, including non-breaking spaces. Used for magic links. 
2152 | space_or_nbsp 2153 | <- space // includes \t 2154 | / unispace 2155 | / he:htmlentity &{ return false, nil /*return Array.isArray(he) && /^\u00A0$/.test(he[1]);*/ } 2156 | {return he, nil /*return he;*/ } 2157 | 2158 | // Used within ISBN magic links 2159 | space_or_nbsp_or_dash 2160 | <- space_or_nbsp / "-" 2161 | 2162 | // Extra newlines followed by at least another newline. Usually used to 2163 | // compress surplus newlines into a meta tag, so that they don't trigger 2164 | // paragraphs. 2165 | optionalNewlines 2166 | <- ([\n\r\t ] &[\n\r])* 2167 | 2168 | comment_or_includes <- (comment / ( 2169 | ( #{ 2170 | push(c, "sol_il", true) 2171 | return nil 2172 | } 2173 | i:include_limits 2174 | #{ 2175 | pop(c, "sol_il") 2176 | return nil 2177 | } 2178 | ) {return i, nil} 2179 | ))* 2180 | 2181 | sol <- (empty_line_with_comments / sol_prefix) comment_or_includes 2182 | 2183 | sol_prefix 2184 | <- newlineToken 2185 | / & { 2186 | return c.pos.offset == 0, nil 2187 | /* 2188 | // Use the sol flag only at the start of the input 2189 | // NOTE: Explicitly check for 'false' and not a falsy value 2190 | return endOffset() === 0 && options.sol !== false; 2191 | */ 2192 | } {return nil, nil /*return [];*/ } 2193 | 2194 | empty_line_with_comments 2195 | <- sol_prefix ("" {return "empty_line_with_comments", nil /*return endOffset();*/ }) (space* comment (space / comment)* newline)+ {return nil, nil 2196 | /* 2197 | return [ 2198 | sp, 2199 | new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], { 2200 | tokens: tu.flattenIfArray(c), 2201 | tsr: [p, endOffset()], 2202 | }), 2203 | ]; 2204 | */ 2205 | } 2206 | 2207 | comment_space <- comment / space 2208 | 2209 | nl_comment_space <- newlineToken / comment_space 2210 | 2211 | // 2212 | // noinclude / includeonly / onlyinclude rules. 
These are normally 2213 | // handled by the xmlish_tag rule, except where generic tags are not 2214 | // allowed- for example in directives, which are allowed in various attribute 2215 | // names and -values. 2216 | 2217 | // Example test case: 2218 | // {| 2219 | // |- 2220 | // foo 2221 | // 2222 | // |Hello 2223 | // |} 2224 | // 2225 | 2226 | include_limits <- 2227 | il:("<" "/"? ([oyinclude]i+ & {return false, nil 2228 | /* 2229 | var incl = n.toLowerCase(); 2230 | return incl === "noinclude" || incl === "onlyinclude" || 2231 | incl === "includeonly"; 2232 | */ 2233 | }) space_or_newline* ">" {return nil, nil 2234 | /* 2235 | var incl = name.toLowerCase(); 2236 | var dp = { tsr: tsrOffsets() }; 2237 | 2238 | // Record variant since tag is not in normalized lower case 2239 | if (name !== incl) { 2240 | dp.srcTagName = name; 2241 | } 2242 | 2243 | // End tag only 2244 | if (c) { 2245 | return new EndTagTk(name, [], dp); 2246 | } 2247 | 2248 | var restOfInput = input.substring(endOffset()); 2249 | var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:)", "m")); 2250 | 2251 | // Start tag only 2252 | if (!tagContent || !tagContent[1]) { 2253 | return new TagTk(name, [], dp); 2254 | } 2255 | 2256 | // Get the content 2257 | var inclContent = tagContent[1]; 2258 | 2259 | // Preserve SOL where necessary (for onlyinclude and noinclude) 2260 | // Note that this only works because we encounter <*include*> tags in 2261 | // the toplevel content and we rely on the php preprocessor to expand 2262 | // templates, so we shouldn't ever be tokenizing inInclude. 
2263 | // Last line should be empty (except for comments) 2264 | if (incl !== "includeonly" && stops.onStack("sol_il")) { 2265 | var last = lastItem(inclContent.split('\n')); 2266 | if (!/^()*$/.test(last)) { 2267 | return false; 2268 | } 2269 | } 2270 | 2271 | // Tokenize include content in a new tokenizer 2272 | var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent); 2273 | inclContentToks = Util.stripEOFTkfromTokens(inclContentToks); 2274 | 2275 | // Shift tsr 2276 | Util.shiftTokenTSR(inclContentToks, endOffset()); 2277 | 2278 | // Skip past content 2279 | peg$currPos += inclContent.length; 2280 | 2281 | return [new TagTk(name, [], dp)].concat(inclContentToks); 2282 | */ 2283 | }) & {return il != nil, nil /*return !!il; */ } {return il, nil /*return il; */ } 2284 | 2285 | // Start of file 2286 | sof <- & { 2287 | return c.pos.offset == 0, nil 2288 | } 2289 | 2290 | // End of file 2291 | eof <- & { 2292 | len := c.globalStore["len"].(int) 2293 | return c.pos.offset == len, nil 2294 | } 2295 | 2296 | newline <- '\n' / "\r\n" 2297 | 2298 | newlineToken <- newline {return "\n", nil/* return [new NlTk(tsrOffsets())]; */} 2299 | 2300 | eolf <- newline / eof 2301 | 2302 | comment_space_eolf <- (space+ / comment)* eolf 2303 | 2304 | // 'Preprocessor' directive- higher-level things that can occur in otherwise 2305 | // plain-text content. 2306 | directive 2307 | <- comment 2308 | / extension_tag 2309 | / tplarg_or_template 2310 | / & "-{" v:lang_variant_or_tpl {return v, nil/* return v; */} 2311 | / & "&" e:htmlentity {return e, nil/* return e; */} 2312 | / include_limits 2313 | 2314 | wikilink_preprocessor_text 2315 | <- r:( [^<[{\n\r\t|!\]}{ &-]+ 2316 | // XXX gwicke: any more chars we need to allow here? 
2317 | / !inline_breaks wr:( directive / ( !"]]" ( text_char / [!<}\]\n\r-] ) ) ) 2318 | {return wr, nil/* return wr; */} 2319 | )+ {return r, nil 2320 | /* 2321 | return tu.flattenStringlist(r); 2322 | */ 2323 | } 2324 | 2325 | extlink_preprocessor_text 2326 | // added special separator character class inline: separates url from 2327 | // description / text 2328 | <- # { push(c, "linkdesc", false); return nil 2329 | /* 2330 | // Prevent breaking on pipes when we're in a link description. 2331 | // See the test, 'Images with the "|" character in the comment'. 2332 | return stops.push('linkdesc', false); 2333 | */ 2334 | } 2335 | r:( [^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000-]+ 2336 | / !inline_breaks s:( directive / no_punctuation_char / [&|{-] ) {return s, nil/* return s; 2337 | */} 2338 | /// urlencoded_char 2339 | // !inline_breaks no_punctuation_char 2340 | / ([.:,] !(space / eolf)) 2341 | / (['] ![']) // single quotes are ok, double quotes are bad 2342 | )+ 2343 | #{ pop(c, "linkdesc"); return nil } 2344 | {return r, nil 2345 | /* 2346 | stops.pop('linkdesc'); 2347 | return tu.flattenString(r); 2348 | */ 2349 | } 2350 | 2351 | // Attribute values with preprocessor support 2352 | 2353 | // n.b. / is a permissible char in the three rules below. 2354 | // We only break on />, enforced by the negated expression. 2355 | // Hence, it isn't included in the stop set. 2356 | 2357 | // The stop set is space_or_newline and > which matches generic_att_value. 2358 | attribute_preprocessor_text 2359 | <- r:( [^{}&<|/ \t\n\r\x0c>-]+ 2360 | / !inline_breaks 2361 | !"/>" 2362 | s:( directive / [{}&<|/-] ) {return s, nil /*return s; */} 2363 | )+ {return r, nil 2364 | /* 2365 | return tu.flattenString(r); 2366 | */ 2367 | } 2368 | 2369 | // The stop set is '> which matches generic_att_value. 
2370 | attribute_preprocessor_text_single 2371 | <- r:( [^{}&<|/'>-]+ 2372 | / !inline_breaks 2373 | !"/>" 2374 | s:( directive / [{}&<|/-] ) {return s, nil/* return s; */} 2375 | )* {return r, nil 2376 | /* 2377 | return tu.flattenString(r); 2378 | */ 2379 | } 2380 | 2381 | // The stop set is "> which matches generic_att_value. 2382 | attribute_preprocessor_text_double 2383 | <- r:( [^{}&<|/">-]+ 2384 | / !inline_breaks 2385 | !"/>" 2386 | s:( directive / [{}&<|/-] ) {return s, nil/* return s; */} 2387 | )* {return r, nil 2388 | /* 2389 | return tu.flattenString(r); 2390 | */ 2391 | } 2392 | 2393 | // Variants with the entire attribute on a single line 2394 | 2395 | // n.b. ! is a permissible char in the three rules below. 2396 | // We only break on !! in th, enforced by the inline break. 2397 | // Hence, it isn't included in the stop set. 2398 | // [ is also permissible but we give a chance to break 2399 | // for the [[ special case in php's doTableStuff (See T2553). 2400 | 2401 | // The stop set is space_or_newline and | which matches table_att_value. 2402 | table_attribute_preprocessor_text 2403 | <- r:( [^{}& 0 { 24 | t.Fatalf("leaking state! %#v", p.cur.state) 25 | } 26 | } 27 | 28 | func TestConvert(t *testing.T) { 29 | log.SetFlags(log.Flags() | log.Lshortfile) 30 | 31 | cases := []struct { 32 | in string 33 | want string 34 | }{ 35 | { 36 | "Blah", 37 | "

      Blah

      ", 38 | }, 39 | { 40 | "== Test ==", 41 | "

      Test

      ", 42 | }, 43 | { 44 | "=Test=", 45 | "

      Test

      ", 46 | }, 47 | { 48 | "'''Test'''", 49 | "Test", 50 | }, 51 | { 52 | "* foo\n* nah\n* woof", 53 | "
    1. foo
    2. \n
    3. nah
    4. \n
    5. woof
    6. ", 54 | }, 55 | { 56 | "----", 57 | "
      ", 58 | }, 59 | { 60 | "{{reflink}}\n\nBlah", 61 | "

      Blah

      ", 62 | }, 63 | { 64 | "[[Jordanstown]]", 65 | `

      Jordanstown

      `, 66 | }, 67 | { 68 | "[[Jordanstown|Blah]]", 69 | `

      Blah

      `, 70 | }, 71 | { 72 | `{{Infobox basketball club 73 | | name = Ulster Elks 74 | | color1 = white 75 | | color2 = blue 76 | | logo = 77 | | arena = [[Ulster University]] Sports Centre 78 | }}`, 79 | "

      ", 80 | }, 81 | { 82 | `
      Test
      `, 83 | `

      Test

      `, 84 | }, 85 | { 86 | "Foo\nBar", 87 | "

      Foo\nBar

      ", 88 | }, 89 | { 90 | "AB", 91 | "

      AB

      ", 92 | }, 93 | } 94 | 95 | debugRules(true) 96 | 97 | for _, c := range cases { 98 | c := c 99 | t.Run(c.in, func(t *testing.T) { 100 | outBytes, err := Convert([]byte(c.in), strict()) 101 | if err != nil { 102 | t.Fatal(err) 103 | } 104 | 105 | out := string(outBytes) 106 | if out != c.want { 107 | t.Errorf("Covert(%q) = %q; not %q", c.in, out, c.want) 108 | } 109 | }) 110 | } 111 | } 112 | 113 | func TestSanitizationPolicy(t *testing.T) { 114 | cases := []struct { 115 | in string 116 | want string 117 | }{ 118 | { 119 | "
      ", 120 | "
      ", 121 | }, 122 | { 123 | "
      A
      ", 124 | "
      A
      ", 125 | }, 126 | { 127 | "", 128 | "", 129 | }, 130 | } 131 | 132 | p := wikitextPolicy() 133 | 134 | for _, c := range cases { 135 | c := c 136 | t.Run(c.in, func(t *testing.T) { 137 | doc, err := html.Parse(strings.NewReader(c.in)) 138 | if err != nil { 139 | t.Fatal(err) 140 | } 141 | t.Logf("Doc = %s", spew.Sdump(doc)) 142 | 143 | out := p.Sanitize(c.in) 144 | if out != c.want { 145 | t.Errorf("Sanitize(%q) = %q; not %q", c.in, out, c.want) 146 | } 147 | }) 148 | } 149 | } 150 | --------------------------------------------------------------------------------